2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 #include <unordered_set>
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
40 #include "vulkan/radv_shader.h"
42 struct radv_nir_compiler_options
;
43 struct radv_shader_args
;
44 struct radv_shader_info
;
48 extern uint64_t debug_flags
;
51 DEBUG_VALIDATE_IR
= 0x1,
52 DEBUG_VALIDATE_RA
= 0x2,
54 DEBUG_FORCE_WAITCNT
= 0x8,
57 DEBUG_NO_SCHED
= 0x40,
61 * Representation of the instruction's microcode encoding format
62 * Note: Some Vector ALU Formats can be combined, such that:
63 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
64 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
65 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
67 * (*) The same is applicable for VOP1 and VOPC instructions.
69 enum class Format
: std::uint16_t {
70 /* Pseudo Instruction Format */
72 /* Scalar ALU & Control Formats */
78 /* Scalar Memory Format */
82 /* Vector Memory Buffer Formats */
85 /* Vector Memory Image Format */
96 PSEUDO_REDUCTION
= 18,
98 /* Vector ALU Formats */
106 /* Vector Parameter Interpolation Format */
112 enum storage_class
: uint8_t {
113 storage_none
= 0x0, /* no synchronization and can be reordered around aliasing stores */
114 storage_buffer
= 0x1, /* SSBOs and global memory */
115 storage_atomic_counter
= 0x2, /* not used for Vulkan */
117 storage_shared
= 0x8, /* or TCS output */
118 storage_vmem_output
= 0x10, /* GS or TCS output stores using VMEM */
119 storage_scratch
= 0x20,
120 storage_vgpr_spill
= 0x40,
124 enum memory_semantics
: uint8_t {
126 /* for loads: don't move any access after this load to before this load (even other loads)
127 * for barriers: don't move any access after the barrier to before any
128 * atomics/control_barriers/sendmsg_gs_done before the barrier */
129 semantic_acquire
= 0x1,
130 /* for stores: don't move any access before this store to after this store
131 * for barriers: don't move any access before the barrier to after any
132 * atomics/control_barriers/sendmsg_gs_done after the barrier */
133 semantic_release
= 0x2,
135 /* the rest are for load/stores/atomics only */
136 /* cannot be DCE'd or CSE'd */
137 semantic_volatile
= 0x4,
138 /* does not interact with barriers and assumes this lane is the only lane
139 * accessing this memory */
140 semantic_private
= 0x8,
141 /* this operation can be reordered around operations of the same storage. says nothing about barriers */
142 semantic_can_reorder
= 0x10,
143 /* this is a atomic instruction (may only read or write memory) */
144 semantic_atomic
= 0x20,
145 /* this is instruction both reads and writes memory */
148 semantic_acqrel
= semantic_acquire
| semantic_release
,
149 semantic_atomicrmw
= semantic_volatile
| semantic_atomic
| semantic_rmw
,
152 enum sync_scope
: uint8_t {
153 scope_invocation
= 0,
156 scope_queuefamily
= 3,
160 struct memory_sync_info
{
161 memory_sync_info() : storage(storage_none
), semantics(semantic_none
), scope(scope_invocation
) {}
162 memory_sync_info(int storage
, int semantics
=0, sync_scope scope
=scope_invocation
)
163 : storage((storage_class
)storage
), semantics((memory_semantics
)semantics
), scope(scope
) {}
165 storage_class storage
:8;
166 memory_semantics semantics
:8;
169 bool operator == (const memory_sync_info
& rhs
) const {
170 return storage
== rhs
.storage
&&
171 semantics
== rhs
.semantics
&&
175 bool can_reorder() const {
176 if (semantics
& semantic_acqrel
)
178 /* Also check storage so that zero-initialized memory_sync_info can be
180 return (!storage
|| (semantics
& semantic_can_reorder
)) && !(semantics
& semantic_volatile
);
183 static_assert(sizeof(memory_sync_info
) == 3, "Unexpected padding");
193 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
194 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
195 fp_denorm_flush
= 0x0,
196 fp_denorm_keep
= 0x3,
200 /* matches encoding of the MODE register */
204 fp_round round16_64
:2;
206 unsigned denorm16_64
:2;
214 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
215 bool preserve_signed_zero_inf_nan32
:1;
216 bool preserve_signed_zero_inf_nan16_64
:1;
217 /* if false, optimizations which may remove denormal flushing can be done */
218 bool must_flush_denorms32
:1;
219 bool must_flush_denorms16_64
:1;
220 bool care_about_round32
:1;
221 bool care_about_round16_64
:1;
223 /* Returns true if instructions using the mode "other" can safely use the
224 * current one instead. */
225 bool canReplace(float_mode other
) const noexcept
{
226 return val
== other
.val
&&
227 (preserve_signed_zero_inf_nan32
|| !other
.preserve_signed_zero_inf_nan32
) &&
228 (preserve_signed_zero_inf_nan16_64
|| !other
.preserve_signed_zero_inf_nan16_64
) &&
229 (must_flush_denorms32
|| !other
.must_flush_denorms32
) &&
230 (must_flush_denorms16_64
|| !other
.must_flush_denorms16_64
) &&
231 (care_about_round32
|| !other
.care_about_round32
) &&
232 (care_about_round16_64
|| !other
.care_about_round16_64
);
236 constexpr Format
asVOP3(Format format
) {
237 return (Format
) ((uint32_t) Format::VOP3
| (uint32_t) format
);
240 constexpr Format
asSDWA(Format format
) {
241 assert(format
== Format::VOP1
|| format
== Format::VOP2
|| format
== Format::VOPC
);
242 return (Format
) ((uint32_t) Format::SDWA
| (uint32_t) format
);
270 /* byte-sized register class */
277 /* these are used for WWM and spills to vgpr */
278 v1_linear
= v1
| (1 << 6),
279 v2_linear
= v2
| (1 << 6),
282 RegClass() = default;
283 constexpr RegClass(RC rc
)
285 constexpr RegClass(RegType type
, unsigned size
)
286 : rc((RC
) ((type
== RegType::vgpr
? 1 << 5 : 0) | size
)) {}
288 constexpr operator RC() const { return rc
; }
289 explicit operator bool() = delete;
291 constexpr RegType
type() const { return rc
<= RC::s16
? RegType::sgpr
: RegType::vgpr
; }
292 constexpr bool is_subdword() const { return rc
& (1 << 7); }
293 constexpr unsigned bytes() const { return ((unsigned) rc
& 0x1F) * (is_subdword() ? 1 : 4); }
294 //TODO: use size() less in favor of bytes()
295 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
296 constexpr bool is_linear() const { return rc
<= RC::s16
|| rc
& (1 << 6); }
297 constexpr RegClass
as_linear() const { return RegClass((RC
) (rc
| (1 << 6))); }
298 constexpr RegClass
as_subdword() const { return RegClass((RC
) (rc
| 1 << 7)); }
300 static constexpr RegClass
get(RegType type
, unsigned bytes
) {
301 if (type
== RegType::sgpr
) {
302 return RegClass(type
, DIV_ROUND_UP(bytes
, 4u));
304 return bytes
% 4u ? RegClass(type
, bytes
).as_subdword() :
305 RegClass(type
, bytes
/ 4u);
313 /* transitional helper expressions */
314 static constexpr RegClass s1
{RegClass::s1
};
315 static constexpr RegClass s2
{RegClass::s2
};
316 static constexpr RegClass s3
{RegClass::s3
};
317 static constexpr RegClass s4
{RegClass::s4
};
318 static constexpr RegClass s8
{RegClass::s8
};
319 static constexpr RegClass s16
{RegClass::s16
};
320 static constexpr RegClass v1
{RegClass::v1
};
321 static constexpr RegClass v2
{RegClass::v2
};
322 static constexpr RegClass v3
{RegClass::v3
};
323 static constexpr RegClass v4
{RegClass::v4
};
324 static constexpr RegClass v5
{RegClass::v5
};
325 static constexpr RegClass v6
{RegClass::v6
};
326 static constexpr RegClass v7
{RegClass::v7
};
327 static constexpr RegClass v8
{RegClass::v8
};
328 static constexpr RegClass v1b
{RegClass::v1b
};
329 static constexpr RegClass v2b
{RegClass::v2b
};
330 static constexpr RegClass v3b
{RegClass::v3b
};
331 static constexpr RegClass v4b
{RegClass::v4b
};
332 static constexpr RegClass v6b
{RegClass::v6b
};
333 static constexpr RegClass v8b
{RegClass::v8b
};
337 * Each temporary virtual register has a
338 * register class (i.e. size and type)
342 Temp() noexcept
: id_(0), reg_class(0) {}
343 constexpr Temp(uint32_t id
, RegClass cls
) noexcept
344 : id_(id
), reg_class(uint8_t(cls
)) {}
346 constexpr uint32_t id() const noexcept
{ return id_
; }
347 constexpr RegClass
regClass() const noexcept
{ return (RegClass::RC
)reg_class
; }
349 constexpr unsigned bytes() const noexcept
{ return regClass().bytes(); }
350 constexpr unsigned size() const noexcept
{ return regClass().size(); }
351 constexpr RegType
type() const noexcept
{ return regClass().type(); }
352 constexpr bool is_linear() const noexcept
{ return regClass().is_linear(); }
354 constexpr bool operator <(Temp other
) const noexcept
{ return id() < other
.id(); }
355 constexpr bool operator==(Temp other
) const noexcept
{ return id() == other
.id(); }
356 constexpr bool operator!=(Temp other
) const noexcept
{ return id() != other
.id(); }
360 uint32_t reg_class
: 8;
365 * Represents the physical register for each
366 * Operand and Definition.
369 constexpr PhysReg() = default;
370 explicit constexpr PhysReg(unsigned r
) : reg_b(r
<< 2) {}
371 constexpr unsigned reg() const { return reg_b
>> 2; }
372 constexpr unsigned byte() const { return reg_b
& 0x3; }
373 constexpr operator unsigned() const { return reg(); }
374 constexpr bool operator==(PhysReg other
) const { return reg_b
== other
.reg_b
; }
375 constexpr bool operator!=(PhysReg other
) const { return reg_b
!= other
.reg_b
; }
376 constexpr bool operator <(PhysReg other
) const { return reg_b
< other
.reg_b
; }
377 constexpr PhysReg
advance(int bytes
) const { PhysReg res
= *this; res
.reg_b
+= bytes
; return res
; }
382 /* helper expressions for special registers */
383 static constexpr PhysReg m0
{124};
384 static constexpr PhysReg vcc
{106};
385 static constexpr PhysReg vcc_hi
{107};
386 static constexpr PhysReg tba
{108}; /* GFX6-GFX8 */
387 static constexpr PhysReg tma
{110}; /* GFX6-GFX8 */
388 static constexpr PhysReg ttmp0
{112};
389 static constexpr PhysReg ttmp1
{113};
390 static constexpr PhysReg ttmp2
{114};
391 static constexpr PhysReg ttmp3
{115};
392 static constexpr PhysReg ttmp4
{116};
393 static constexpr PhysReg ttmp5
{117};
394 static constexpr PhysReg ttmp6
{118};
395 static constexpr PhysReg ttmp7
{119};
396 static constexpr PhysReg ttmp8
{120};
397 static constexpr PhysReg ttmp9
{121};
398 static constexpr PhysReg ttmp10
{122};
399 static constexpr PhysReg ttmp11
{123};
400 static constexpr PhysReg sgpr_null
{125}; /* GFX10+ */
401 static constexpr PhysReg exec
{126};
402 static constexpr PhysReg exec_lo
{126};
403 static constexpr PhysReg exec_hi
{127};
404 static constexpr PhysReg vccz
{251};
405 static constexpr PhysReg execz
{252};
406 static constexpr PhysReg scc
{253};
410 * Initially, each Operand refers to either
411 * a temporary virtual register
412 * or to a constant value
413 * Temporary registers get mapped to physical register during RA
414 * Constant values are inlined into the instruction sequence.
420 : reg_(PhysReg
{128}), isTemp_(false), isFixed_(true), isConstant_(false),
421 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
422 isLateKill_(false) {}
424 explicit Operand(Temp r
) noexcept
431 setFixed(PhysReg
{128});
434 explicit Operand(uint8_t v
) noexcept
436 /* 8-bit constants are only used for copies and copies from any 8-bit
437 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
438 * to be inline constants. */
442 setFixed(PhysReg
{0u});
444 explicit Operand(uint16_t v
) noexcept
450 setFixed(PhysReg
{128u + v
});
451 else if (v
>= 0xFFF0) /* [-16 .. -1] */
452 setFixed(PhysReg
{192u + (0xFFFF - v
)});
453 else if (v
== 0x3800) /* 0.5 */
454 setFixed(PhysReg
{240});
455 else if (v
== 0xB800) /* -0.5 */
456 setFixed(PhysReg
{241});
457 else if (v
== 0x3C00) /* 1.0 */
458 setFixed(PhysReg
{242});
459 else if (v
== 0xBC00) /* -1.0 */
460 setFixed(PhysReg
{243});
461 else if (v
== 0x4000) /* 2.0 */
462 setFixed(PhysReg
{244});
463 else if (v
== 0xC000) /* -2.0 */
464 setFixed(PhysReg
{245});
465 else if (v
== 0x4400) /* 4.0 */
466 setFixed(PhysReg
{246});
467 else if (v
== 0xC400) /* -4.0 */
468 setFixed(PhysReg
{247});
469 else if (v
== 0x3118) /* 1/2 PI */
470 setFixed(PhysReg
{248});
471 else /* Literal Constant */
472 setFixed(PhysReg
{255});
474 explicit Operand(uint32_t v
, bool is64bit
= false) noexcept
478 constSize
= is64bit
? 3 : 2;
480 setFixed(PhysReg
{128 + v
});
481 else if (v
>= 0xFFFFFFF0) /* [-16 .. -1] */
482 setFixed(PhysReg
{192 - v
});
483 else if (v
== 0x3f000000) /* 0.5 */
484 setFixed(PhysReg
{240});
485 else if (v
== 0xbf000000) /* -0.5 */
486 setFixed(PhysReg
{241});
487 else if (v
== 0x3f800000) /* 1.0 */
488 setFixed(PhysReg
{242});
489 else if (v
== 0xbf800000) /* -1.0 */
490 setFixed(PhysReg
{243});
491 else if (v
== 0x40000000) /* 2.0 */
492 setFixed(PhysReg
{244});
493 else if (v
== 0xc0000000) /* -2.0 */
494 setFixed(PhysReg
{245});
495 else if (v
== 0x40800000) /* 4.0 */
496 setFixed(PhysReg
{246});
497 else if (v
== 0xc0800000) /* -4.0 */
498 setFixed(PhysReg
{247});
499 else { /* Literal Constant */
500 assert(!is64bit
&& "attempt to create a 64-bit literal constant");
501 setFixed(PhysReg
{255});
504 explicit Operand(uint64_t v
) noexcept
509 data_
.i
= (uint32_t) v
;
510 setFixed(PhysReg
{128 + (uint32_t) v
});
511 } else if (v
>= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
512 data_
.i
= (uint32_t) v
;
513 setFixed(PhysReg
{192 - (uint32_t) v
});
514 } else if (v
== 0x3FE0000000000000) { /* 0.5 */
515 data_
.i
= 0x3f000000;
516 setFixed(PhysReg
{240});
517 } else if (v
== 0xBFE0000000000000) { /* -0.5 */
518 data_
.i
= 0xbf000000;
519 setFixed(PhysReg
{241});
520 } else if (v
== 0x3FF0000000000000) { /* 1.0 */
521 data_
.i
= 0x3f800000;
522 setFixed(PhysReg
{242});
523 } else if (v
== 0xBFF0000000000000) { /* -1.0 */
524 data_
.i
= 0xbf800000;
525 setFixed(PhysReg
{243});
526 } else if (v
== 0x4000000000000000) { /* 2.0 */
527 data_
.i
= 0x40000000;
528 setFixed(PhysReg
{244});
529 } else if (v
== 0xC000000000000000) { /* -2.0 */
530 data_
.i
= 0xc0000000;
531 setFixed(PhysReg
{245});
532 } else if (v
== 0x4010000000000000) { /* 4.0 */
533 data_
.i
= 0x40800000;
534 setFixed(PhysReg
{246});
535 } else if (v
== 0xC010000000000000) { /* -4.0 */
536 data_
.i
= 0xc0800000;
537 setFixed(PhysReg
{247});
538 } else { /* Literal Constant: we don't know if it is a long or double.*/
540 assert(false && "attempt to create a 64-bit literal constant");
543 explicit Operand(RegClass type
) noexcept
546 data_
.temp
= Temp(0, type
);
547 setFixed(PhysReg
{128});
549 explicit Operand(PhysReg reg
, RegClass type
) noexcept
551 data_
.temp
= Temp(0, type
);
555 constexpr bool isTemp() const noexcept
560 constexpr void setTemp(Temp t
) noexcept
{
561 assert(!isConstant_
);
566 constexpr Temp
getTemp() const noexcept
571 constexpr uint32_t tempId() const noexcept
573 return data_
.temp
.id();
576 constexpr bool hasRegClass() const noexcept
578 return isTemp() || isUndefined();
581 constexpr RegClass
regClass() const noexcept
583 return data_
.temp
.regClass();
586 constexpr unsigned bytes() const noexcept
589 return 1 << constSize
;
591 return data_
.temp
.bytes();
594 constexpr unsigned size() const noexcept
597 return constSize
> 2 ? 2 : 1;
599 return data_
.temp
.size();
602 constexpr bool isFixed() const noexcept
607 constexpr PhysReg
physReg() const noexcept
612 constexpr void setFixed(PhysReg reg
) noexcept
614 isFixed_
= reg
!= unsigned(-1);
618 constexpr bool isConstant() const noexcept
623 constexpr bool isLiteral() const noexcept
625 return isConstant() && reg_
== 255;
628 constexpr bool isUndefined() const noexcept
633 constexpr uint32_t constantValue() const noexcept
638 constexpr bool constantEquals(uint32_t cmp
) const noexcept
640 return isConstant() && constantValue() == cmp
;
643 constexpr uint64_t constantValue64(bool signext
=false) const noexcept
645 if (constSize
== 3) {
648 else if (reg_
<= 208)
649 return 0xFFFFFFFFFFFFFFFF - (reg_
- 193);
653 return 0x3FE0000000000000;
655 return 0xBFE0000000000000;
657 return 0x3FF0000000000000;
659 return 0xBFF0000000000000;
661 return 0x4000000000000000;
663 return 0xC000000000000000;
665 return 0x4010000000000000;
667 return 0xC010000000000000;
669 } else if (constSize
== 1) {
670 return (signext
&& (data_
.i
& 0x8000u
) ? 0xffffffffffff0000ull
: 0ull) | data_
.i
;
671 } else if (constSize
== 0) {
672 return (signext
&& (data_
.i
& 0x80u
) ? 0xffffffffffffff00ull
: 0ull) | data_
.i
;
674 return (signext
&& (data_
.i
& 0x80000000u
) ? 0xffffffff00000000ull
: 0ull) | data_
.i
;
677 constexpr bool isOfType(RegType type
) const noexcept
679 return hasRegClass() && regClass().type() == type
;
682 /* Indicates that the killed operand's live range intersects with the
683 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
684 * not set by liveness analysis. */
685 constexpr void setLateKill(bool flag
) noexcept
690 constexpr bool isLateKill() const noexcept
695 constexpr void setKill(bool flag
) noexcept
702 constexpr bool isKill() const noexcept
704 return isKill_
|| isFirstKill();
707 constexpr void setFirstKill(bool flag
) noexcept
714 /* When there are multiple operands killing the same temporary,
715 * isFirstKill() is only returns true for the first one. */
716 constexpr bool isFirstKill() const noexcept
721 constexpr bool isKillBeforeDef() const noexcept
723 return isKill() && !isLateKill();
726 constexpr bool isFirstKillBeforeDef() const noexcept
728 return isFirstKill() && !isLateKill();
731 constexpr bool operator == (Operand other
) const noexcept
733 if (other
.size() != size())
735 if (isFixed() != other
.isFixed() || isKillBeforeDef() != other
.isKillBeforeDef())
737 if (isFixed() && other
.isFixed() && physReg() != other
.physReg())
740 return other
.isLiteral() && other
.constantValue() == constantValue();
741 else if (isConstant())
742 return other
.isConstant() && other
.physReg() == physReg();
743 else if (isUndefined())
744 return other
.isUndefined() && other
.regClass() == regClass();
746 return other
.isTemp() && other
.getTemp() == getTemp();
752 Temp temp
= Temp(0, s1
);
759 uint8_t isConstant_
:1;
762 uint8_t isFirstKill_
:1;
764 uint8_t isLateKill_
:1;
766 /* can't initialize bit-fields in c++11, so work around using a union */
767 uint16_t control_
= 0;
773 * Definitions are the results of Instructions
774 * and refer to temporary virtual registers
775 * which are later mapped to physical registers
777 class Definition final
780 constexpr Definition() : temp(Temp(0, s1
)), reg_(0), isFixed_(0), hasHint_(0),
781 isKill_(0), isPrecise_(0), isNUW_(0) {}
782 Definition(uint32_t index
, RegClass type
) noexcept
783 : temp(index
, type
) {}
784 explicit Definition(Temp tmp
) noexcept
786 Definition(PhysReg reg
, RegClass type
) noexcept
787 : temp(Temp(0, type
))
791 Definition(uint32_t tmpId
, PhysReg reg
, RegClass type
) noexcept
792 : temp(Temp(tmpId
, type
))
797 constexpr bool isTemp() const noexcept
802 constexpr Temp
getTemp() const noexcept
807 constexpr uint32_t tempId() const noexcept
812 constexpr void setTemp(Temp t
) noexcept
{
816 constexpr RegClass
regClass() const noexcept
818 return temp
.regClass();
821 constexpr unsigned bytes() const noexcept
826 constexpr unsigned size() const noexcept
831 constexpr bool isFixed() const noexcept
836 constexpr PhysReg
physReg() const noexcept
841 constexpr void setFixed(PhysReg reg
) noexcept
847 constexpr void setHint(PhysReg reg
) noexcept
853 constexpr bool hasHint() const noexcept
858 constexpr void setKill(bool flag
) noexcept
863 constexpr bool isKill() const noexcept
868 constexpr void setPrecise(bool precise
) noexcept
870 isPrecise_
= precise
;
873 constexpr bool isPrecise() const noexcept
878 /* No Unsigned Wrap */
879 constexpr void setNUW(bool nuw
) noexcept
884 constexpr bool isNUW() const noexcept
890 Temp temp
= Temp(0, s1
);
897 uint8_t isPrecise_
:1;
900 /* can't initialize bit-fields in c++11, so work around using a union */
901 uint8_t control_
= 0;
912 aco::span
<Operand
> operands
;
913 aco::span
<Definition
> definitions
;
915 constexpr bool isVALU() const noexcept
917 return ((uint16_t) format
& (uint16_t) Format::VOP1
) == (uint16_t) Format::VOP1
918 || ((uint16_t) format
& (uint16_t) Format::VOP2
) == (uint16_t) Format::VOP2
919 || ((uint16_t) format
& (uint16_t) Format::VOPC
) == (uint16_t) Format::VOPC
920 || ((uint16_t) format
& (uint16_t) Format::VOP3A
) == (uint16_t) Format::VOP3A
921 || ((uint16_t) format
& (uint16_t) Format::VOP3B
) == (uint16_t) Format::VOP3B
922 || format
== Format::VOP3P
;
925 constexpr bool isSALU() const noexcept
927 return format
== Format::SOP1
||
928 format
== Format::SOP2
||
929 format
== Format::SOPC
||
930 format
== Format::SOPK
||
931 format
== Format::SOPP
;
934 constexpr bool isVMEM() const noexcept
936 return format
== Format::MTBUF
||
937 format
== Format::MUBUF
||
938 format
== Format::MIMG
;
941 constexpr bool isDPP() const noexcept
943 return (uint16_t) format
& (uint16_t) Format::DPP
;
946 constexpr bool isVOP3() const noexcept
948 return ((uint16_t) format
& (uint16_t) Format::VOP3A
) ||
949 ((uint16_t) format
& (uint16_t) Format::VOP3B
);
952 constexpr bool isSDWA() const noexcept
954 return (uint16_t) format
& (uint16_t) Format::SDWA
;
957 constexpr bool isFlatOrGlobal() const noexcept
959 return format
== Format::FLAT
|| format
== Format::GLOBAL
;
962 constexpr bool usesModifiers() const noexcept
;
964 constexpr bool reads_exec() const noexcept
966 for (const Operand
& op
: operands
) {
967 if (op
.isFixed() && op
.physReg() == exec
)
973 static_assert(sizeof(Instruction
) == 16, "Unexpected padding");
975 struct SOPK_instruction
: public Instruction
{
979 static_assert(sizeof(SOPK_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
981 struct SOPP_instruction
: public Instruction
{
985 static_assert(sizeof(SOPP_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
987 struct SOPC_instruction
: public Instruction
{
989 static_assert(sizeof(SOPC_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
991 struct SOP1_instruction
: public Instruction
{
993 static_assert(sizeof(SOP1_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
995 struct SOP2_instruction
: public Instruction
{
997 static_assert(sizeof(SOP2_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1000 * Scalar Memory Format:
1001 * For s_(buffer_)load_dword*:
1002 * Operand(0): SBASE - SGPR-pair which provides base address
1003 * Operand(1): Offset - immediate (un)signed offset or SGPR
1004 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1005 * Operand(n-1): SOffset - SGPR offset (Vega only)
1007 * Having no operands is also valid for instructions such as s_dcache_inv.
1010 struct SMEM_instruction
: public Instruction
{
1011 memory_sync_info sync
;
1012 bool glc
: 1; /* VI+: globally coherent */
1013 bool dlc
: 1; /* NAVI: device level coherent */
1014 bool nv
: 1; /* VEGA only: Non-volatile */
1015 bool disable_wqm
: 1;
1016 bool prevent_overflow
: 1; /* avoid overflow when combining additions */
1017 uint32_t padding
: 3;
1019 static_assert(sizeof(SMEM_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
1021 struct VOP1_instruction
: public Instruction
{
1023 static_assert(sizeof(VOP1_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1025 struct VOP2_instruction
: public Instruction
{
1027 static_assert(sizeof(VOP2_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1029 struct VOPC_instruction
: public Instruction
{
1031 static_assert(sizeof(VOPC_instruction
) == sizeof(Instruction
) + 0, "Unexpected padding");
1033 struct VOP3A_instruction
: public Instruction
{
1039 uint32_t padding
: 9;
1041 static_assert(sizeof(VOP3A_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1043 struct VOP3P_instruction
: public Instruction
{
1046 uint8_t opsel_lo
: 3;
1047 uint8_t opsel_hi
: 3;
1049 uint32_t padding
: 9;
1051 static_assert(sizeof(VOP3P_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1054 * Data Parallel Primitives Format:
1055 * This format can be used for VOP1, VOP2 or VOPC instructions.
1056 * The swizzle applies to the src0 operand.
1059 struct DPP_instruction
: public Instruction
{
1063 uint8_t row_mask
: 4;
1064 uint8_t bank_mask
: 4;
1065 bool bound_ctrl
: 1;
1066 uint32_t padding
: 7;
1068 static_assert(sizeof(DPP_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1070 enum sdwa_sel
: uint8_t {
1074 sdwa_asuint
= 0x7 | 0x10,
1082 /* specific values */
1087 sdwa_uword0
= sdwa_isword
| 0,
1088 sdwa_uword1
= sdwa_isword
| 1,
1091 sdwa_sbyte0
= sdwa_ubyte0
| sdwa_sext
,
1092 sdwa_sbyte1
= sdwa_ubyte1
| sdwa_sext
,
1093 sdwa_sbyte2
= sdwa_ubyte2
| sdwa_sext
,
1094 sdwa_sbyte3
= sdwa_ubyte3
| sdwa_sext
,
1095 sdwa_sword0
= sdwa_uword0
| sdwa_sext
,
1096 sdwa_sword1
= sdwa_uword1
| sdwa_sext
,
1097 sdwa_sdword
= sdwa_udword
| sdwa_sext
,
1099 /* register-allocated */
1100 sdwa_ubyte
= 1 | sdwa_isra
,
1101 sdwa_uword
= 2 | sdwa_isra
,
1102 sdwa_sbyte
= sdwa_ubyte
| sdwa_sext
,
1103 sdwa_sword
= sdwa_uword
| sdwa_sext
,
1107 * Sub-Dword Addressing Format:
1108 * This format can be used for VOP1, VOP2 or VOPC instructions.
1110 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1111 * the definition doesn't have to be VCC on GFX9+.
1114 struct SDWA_instruction
: public Instruction
{
1115 /* these destination modifiers aren't available with VOPC except for
1121 bool dst_preserve
: 1;
1123 uint8_t omod
: 2; /* GFX9+ */
1124 uint32_t padding
: 4;
1126 static_assert(sizeof(SDWA_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1128 struct Interp_instruction
: public Instruction
{
1133 static_assert(sizeof(Interp_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
1136 * Local and Global Data Sharing instructions
1137 * Operand(0): ADDR - VGPR which supplies the address.
1138 * Operand(1): DATA0 - First data VGPR.
1139 * Operand(2): DATA1 - Second data VGPR.
1140 * Operand(n-1): M0 - LDS size.
1141 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1144 struct DS_instruction
: public Instruction
{
1145 memory_sync_info sync
;
1151 static_assert(sizeof(DS_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1154 * Vector Memory Untyped-buffer Instructions
1155 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1156 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1157 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1158 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1161 struct MUBUF_instruction
: public Instruction
{
1162 memory_sync_info sync
;
1163 bool offen
: 1; /* Supply an offset from VGPR (VADDR) */
1164 bool idxen
: 1; /* Supply an index from VGPR (VADDR) */
1165 bool addr64
: 1; /* SI, CIK: Address size is 64-bit */
1166 bool glc
: 1; /* globally coherent */
1167 bool dlc
: 1; /* NAVI: device level coherent */
1168 bool slc
: 1; /* system level coherent */
1169 bool tfe
: 1; /* texture fail enable */
1170 bool lds
: 1; /* Return read-data to LDS instead of VGPRs */
1171 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1172 uint16_t offset
: 12; /* Unsigned byte offset - 12 bit */
1174 uint32_t padding1
: 18;
1176 static_assert(sizeof(MUBUF_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1179 * Vector Memory Typed-buffer Instructions
1180 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1181 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1182 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1183 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1186 struct MTBUF_instruction
: public Instruction
{
1187 memory_sync_info sync
;
1188 uint8_t dfmt
: 4; /* Data Format of data in memory buffer */
1189 uint8_t nfmt
: 3; /* Numeric format of data in memory */
1190 bool offen
: 1; /* Supply an offset from VGPR (VADDR) */
1191 bool idxen
: 1; /* Supply an index from VGPR (VADDR) */
1192 bool glc
: 1; /* globally coherent */
1193 bool dlc
: 1; /* NAVI: device level coherent */
1194 bool slc
: 1; /* system level coherent */
1195 bool tfe
: 1; /* texture fail enable */
1196 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1197 uint32_t padding
: 10;
1198 uint16_t offset
; /* Unsigned byte offset - 12 bit */
1200 static_assert(sizeof(MTBUF_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1203 * Vector Memory Image Instructions
1204 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1205 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1206 * or VDATA - Vector GPR for write data.
1207 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1208 * Definition(0): VDATA - Vector GPR for read result.
1211 struct MIMG_instruction
: public Instruction
{
1212 memory_sync_info sync
;
1213 uint8_t dmask
; /* Data VGPR enable mask */
1214 uint8_t dim
: 3; /* NAVI: dimensionality */
1215 bool unrm
: 1; /* Force address to be un-normalized */
1216 bool dlc
: 1; /* NAVI: device level coherent */
1217 bool glc
: 1; /* globally coherent */
1218 bool slc
: 1; /* system level coherent */
1219 bool tfe
: 1; /* texture fail enable */
1220 bool da
: 1; /* declare an array */
1221 bool lwe
: 1; /* Force data to be un-normalized */
1222 bool r128
: 1; /* NAVI: Texture resource size */
1223 bool a16
: 1; /* VEGA, NAVI: Address components are 16-bits */
1224 bool d16
: 1; /* Convert 32-bit data to 16-bit data */
1225 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1226 uint32_t padding
: 18;
1228 static_assert(sizeof(MIMG_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1231 * Flat/Scratch/Global Instructions
1234 * Operand(2) / Definition(0): DATA/VDST
1237 struct FLAT_instruction
: public Instruction
{
1238 memory_sync_info sync
;
1239 bool slc
: 1; /* system level coherent */
1240 bool glc
: 1; /* globally coherent */
1241 bool dlc
: 1; /* NAVI: device level coherent */
1244 bool disable_wqm
: 1; /* Require an exec mask without helper invocations */
1245 uint32_t padding0
: 2;
1246 uint16_t offset
; /* Vega/Navi only */
1249 static_assert(sizeof(FLAT_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1251 struct Export_instruction
: public Instruction
{
1252 uint8_t enabled_mask
;
1254 bool compressed
: 1;
1256 bool valid_mask
: 1;
1257 uint32_t padding
: 13;
1259 static_assert(sizeof(Export_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
1261 struct Pseudo_instruction
: public Instruction
{
1262 PhysReg scratch_sgpr
; /* might not be valid if it's not needed */
1266 static_assert(sizeof(Pseudo_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
/* NOTE(review): this struct is truncated in this copy — original lines
 * 1272-1274 (the end of the doc comment, the branch-target member array and
 * the closing brace) are missing; restore from upstream. */
1268 struct Pseudo_branch_instruction
: public Instruction
{
1269 /* target[0] is the block index of the branch target.
1270 * For conditional branches, target[1] contains the fall-through alternative.
1271 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1275 static_assert(sizeof(Pseudo_branch_instruction
) == sizeof(Instruction
) + 8, "Unexpected padding");
1277 struct Pseudo_barrier_instruction
: public Instruction
{
1278 memory_sync_info sync
;
1279 sync_scope exec_scope
;
1281 static_assert(sizeof(Pseudo_barrier_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
/* Operations implemented by the subgroup-reduction pseudo instructions.
 * Each operation comes in variants for the operand bit-width (8/16/32/64). */
1283 enum ReduceOp
: uint16_t {
/* integer add */
1284 iadd8
, iadd16
, iadd32
, iadd64
,
/* integer multiply */
1285 imul8
, imul16
, imul32
, imul64
,
/* float add/multiply */
1286 fadd16
, fadd32
, fadd64
,
1287 fmul16
, fmul32
, fmul64
,
/* signed min/max */
1288 imin8
, imin16
, imin32
, imin64
,
1289 imax8
, imax16
, imax32
, imax64
,
/* unsigned min/max */
1290 umin8
, umin16
, umin32
, umin64
,
1291 umax8
, umax16
, umax32
, umax64
,
/* float min/max */
1292 fmin16
, fmin32
, fmin64
,
1293 fmax16
, fmax32
, fmax64
,
/* bitwise and/or/xor */
1294 iand8
, iand16
, iand32
, iand64
,
1295 ior8
, ior16
, ior32
, ior64
,
1296 ixor8
, ixor16
, ixor32
, ixor64
,
/* NOTE(review): the tail of this enum (original lines 1297+, any trailing
 * enumerators and the closing brace) is missing in this copy. */
1300 * Subgroup Reduction Instructions, everything except for the data to be
1301 * reduced and the result as inserted by setup_reduce_temp().
1302 * Operand(0): data to be reduced
1303 * Operand(1): reduce temporary
1304 * Operand(2): vector temporary
1305 * Definition(0): result
1306 * Definition(1): scalar temporary
1307 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1308 * Definition(3): scc clobber
1309 * Definition(4): vcc clobber
1312 struct Pseudo_reduction_instruction
: public Instruction
{
1314 uint16_t cluster_size
; // must be 0 for scans
1316 static_assert(sizeof(Pseudo_reduction_instruction
) == sizeof(Instruction
) + 4, "Unexpected padding");
/* NOTE(review): truncated copy — the opening of the doc comment (orig.
 * 1299), the reduction-operation member (orig. 1313) and the closing brace
 * (orig. 1315) are missing; restore from upstream. */
/* Deleter paired with create_instruction(): instruction storage comes from
 * calloc, so it must be released with free rather than delete. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* Owning pointer type for instructions allocated by create_instruction(). */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1327 template<typename T
>
1328 T
* create_instruction(aco_opcode opcode
, Format format
, uint32_t num_operands
, uint32_t num_definitions
)
1330 std::size_t size
= sizeof(T
) + num_operands
* sizeof(Operand
) + num_definitions
* sizeof(Definition
);
1331 char *data
= (char*) calloc(1, size
);
1332 T
* inst
= (T
*) data
;
1334 inst
->opcode
= opcode
;
1335 inst
->format
= format
;
1337 uint16_t operands_offset
= data
+ sizeof(T
) - (char*)&inst
->operands
;
1338 inst
->operands
= aco::span
<Operand
>(operands_offset
, num_operands
);
1339 uint16_t definitions_offset
= (char*)inst
->operands
.end() - (char*)&inst
->definitions
;
1340 inst
->definitions
= aco::span
<Definition
>(definitions_offset
, num_definitions
);
1345 constexpr bool Instruction::usesModifiers() const noexcept
1347 if (isDPP() || isSDWA())
1350 if (format
== Format::VOP3P
) {
1351 const VOP3P_instruction
*vop3p
= static_cast<const VOP3P_instruction
*>(this);
1352 for (unsigned i
= 0; i
< operands
.size(); i
++) {
1353 if (vop3p
->neg_lo
[i
] || vop3p
->neg_hi
[i
])
1356 return vop3p
->opsel_lo
|| vop3p
->opsel_hi
|| vop3p
->clamp
;
1357 } else if (isVOP3()) {
1358 const VOP3A_instruction
*vop3
= static_cast<const VOP3A_instruction
*>(this);
1359 for (unsigned i
= 0; i
< operands
.size(); i
++) {
1360 if (vop3
->abs
[i
] || vop3
->neg
[i
])
1363 return vop3
->opsel
|| vop3
->clamp
|| vop3
->omod
;
1368 constexpr bool is_phi(Instruction
* instr
)
1370 return instr
->opcode
== aco_opcode::p_phi
|| instr
->opcode
== aco_opcode::p_linear_phi
;
1373 static inline bool is_phi(aco_ptr
<Instruction
>& instr
)
1375 return is_phi(instr
.get());
/* Returns the memory synchronization info attached to an instruction. */
1378 memory_sync_info
get_sync_info(const Instruction
* instr
);
/* Presumably: true if instr can be removed given the per-temp use counts in
 * `uses` — TODO confirm against the definition. */
1380 bool is_dead(const std::vector
<uint16_t>& uses
, Instruction
*instr
);
/* Hardware capability queries for the opsel and SDWA encodings. */
1382 bool can_use_opsel(chip_class chip
, aco_opcode op
, int idx
, bool high
);
1383 bool can_use_SDWA(chip_class chip
, const aco_ptr
<Instruction
>& instr
);
1384 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1385 aco_ptr
<Instruction
> convert_to_SDWA(chip_class chip
, aco_ptr
<Instruction
>& instr
);
/* NOTE(review): the enum header (orig. ~1386-1387) is missing in this copy;
 * these enumerators are bit flags OR'd together to classify a basic block. */
1388 /* uniform indicates that leaving this block,
1389 * all active lanes stay active */
1390 block_kind_uniform
= 1 << 0,
1391 block_kind_top_level
= 1 << 1,
/* loop structure markers */
1392 block_kind_loop_preheader
= 1 << 2,
1393 block_kind_loop_header
= 1 << 3,
1394 block_kind_loop_exit
= 1 << 4,
1395 block_kind_continue
= 1 << 5,
1396 block_kind_break
= 1 << 6,
1397 block_kind_continue_or_break
= 1 << 7,
1398 block_kind_discard
= 1 << 8,
/* divergent control-flow structure markers */
1399 block_kind_branch
= 1 << 9,
1400 block_kind_merge
= 1 << 10,
1401 block_kind_invert
= 1 << 11,
1402 block_kind_uses_discard_if
= 1 << 12,
1403 block_kind_needs_lowering
= 1 << 13,
1404 block_kind_uses_demote
= 1 << 14,
1405 block_kind_export_end
= 1 << 15,
/* NOTE(review): this copy of RegisterDemand is truncated — the vgpr/sgpr
 * member declarations (orig. 1413-1415), the bodies of the compound
 * assignment operators (orig. 1440-1466, partially) and several closing
 * braces are missing; restore from upstream before compiling. */
1409 struct RegisterDemand
{
/* A pair of register counts (vgpr, sgpr); default is zero of both. */
1410 constexpr RegisterDemand() = default;
1411 constexpr RegisterDemand(const int16_t v
, const int16_t s
) noexcept
1412 : vgpr
{v
}, sgpr
{s
} {}
/* Equality compares both components. */
1416 constexpr friend bool operator==(const RegisterDemand a
, const RegisterDemand b
) noexcept
{
1417 return a
.vgpr
== b
.vgpr
&& a
.sgpr
== b
.sgpr
;
/* True if either component exceeds the corresponding component of other. */
1420 constexpr bool exceeds(const RegisterDemand other
) const noexcept
{
1421 return vgpr
> other
.vgpr
|| sgpr
> other
.sgpr
;
/* Adding a Temp raises the demand of that temp's register file by its size. */
1424 constexpr RegisterDemand
operator+(const Temp t
) const noexcept
{
1425 if (t
.type() == RegType::sgpr
)
1426 return RegisterDemand( vgpr
, sgpr
+ t
.size() );
1428 return RegisterDemand( vgpr
+ t
.size(), sgpr
);
/* Component-wise sum and difference of two demands. */
1431 constexpr RegisterDemand
operator+(const RegisterDemand other
) const noexcept
{
1432 return RegisterDemand(vgpr
+ other
.vgpr
, sgpr
+ other
.sgpr
);
1435 constexpr RegisterDemand
operator-(const RegisterDemand other
) const noexcept
{
1436 return RegisterDemand(vgpr
- other
.vgpr
, sgpr
- other
.sgpr
);
/* Compound-assignment variants (bodies missing in this copy). */
1439 constexpr RegisterDemand
& operator+=(const RegisterDemand other
) noexcept
{
1445 constexpr RegisterDemand
& operator-=(const RegisterDemand other
) noexcept
{
1451 constexpr RegisterDemand
& operator+=(const Temp t
) noexcept
{
1452 if (t
.type() == RegType::sgpr
)
1459 constexpr RegisterDemand
& operator-=(const Temp t
) noexcept
{
1460 if (t
.type() == RegType::sgpr
)
/* Raise each component to at least the other demand's value (join). */
1467 constexpr void update(const RegisterDemand other
) noexcept
{
1468 vgpr
= std::max(vgpr
, other
.vgpr
);
1469 sgpr
= std::max(sgpr
, other
.sgpr
);
/* NOTE(review): this is the interior of the basic-block struct; its header
 * and leading members (orig. ~1470-1477) are missing in this copy. */
1478 unsigned offset
= 0;
/* Instructions of this block, in order. */
1479 std::vector
<aco_ptr
<Instruction
>> instructions
;
/* CFG edges: predecessor/successor block indices, kept separately for the
 * logical and the linear CFG. */
1480 std::vector
<unsigned> logical_preds
;
1481 std::vector
<unsigned> linear_preds
;
1482 std::vector
<unsigned> logical_succs
;
1483 std::vector
<unsigned> linear_succs
;
1484 RegisterDemand register_demand
= RegisterDemand();
1485 uint16_t loop_nest_depth
= 0;
/* Immediate dominators (block indices); -1 while not yet computed. */
1487 int logical_idom
= -1;
1488 int linear_idom
= -1;
1489 Temp live_out_exec
= Temp();
1491 /* this information is needed for predecessors to blocks with phis when
1492 * moving out of ssa */
1493 bool scc_live_out
= false;
1494 PhysReg scratch_sgpr
= PhysReg(); /* only needs to be valid if scc_live_out != false */
/* NOTE(review): `index` is initialized below but its declaration is not
 * visible in this copy — it presumably sits among the missing members. */
1496 Block(unsigned idx
) : index(idx
) {}
1497 Block() : index(0) {}
/* A shader stage descriptor: the low bits name the software (API) shader
 * types merged into this program, the high bits name the hardware stage
 * they execute on. Program::stage holds one of the combinations below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* GFX6-8 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
/* NOTE(review): truncated copy of the per-compilation statistics enum — its
 * header and some enumerators (orig. 1544-1550 and 1557-1558, presumably
 * including the num_statistics terminator used by Program::statistics) are
 * missing; restore from upstream. */
1547 statistic_instructions
,
1551 statistic_vmem_clauses
,
1552 statistic_smem_clauses
,
1553 statistic_vmem_score
,
1554 statistic_smem_score
,
1555 statistic_sgpr_presched
,
1556 statistic_vgpr_presched
,
/* NOTE(review): this copy of `class Program` is truncated — access
 * specifiers, the debug-callback wrapper (orig. 1602-1609), method braces
 * and the class's closing brace are among the missing original lines;
 * restore from upstream before compiling. */
1560 class Program final
{
/* FP behavior that will be applied to newly created blocks. */
1562 float_mode next_fp_mode
;
/* All basic blocks of the program. */
1563 std::vector
<Block
> blocks
;
1564 RegisterDemand max_reg_demand
= RegisterDemand();
1565 uint16_t num_waves
= 0;
1566 uint16_t max_waves
= 0; /* maximum number of waves, regardless of register usage */
/* Driver-provided target and shader descriptions. */
1567 ac_shader_config
* config
;
1568 struct radv_shader_info
*info
;
1569 enum chip_class chip_class
;
1570 enum radeon_family family
;
1573 Stage stage
; /* Stage */
1574 bool needs_exact
= false; /* there exists an instruction with disable_wqm = true */
1575 bool needs_wqm
= false; /* there exists a p_wqm instruction */
1576 bool wb_smem_l1_on_end
= false;
1578 std::vector
<uint8_t> constant_data
;
1579 Temp private_segment_buffer
;
1580 Temp scratch_offset
;
/* Register/LDS budgets and allocation granularity for the target. */
1582 uint16_t min_waves
= 0;
1583 uint16_t lds_alloc_granule
;
1584 uint32_t lds_limit
; /* in bytes */
1585 bool has_16bank_lds
;
1586 uint16_t vgpr_limit
;
1587 uint16_t sgpr_limit
;
1588 uint16_t physical_sgprs
;
1589 uint16_t sgpr_alloc_granule
; /* minus one. must be power of two */
1590 uint16_t vgpr_alloc_granule
; /* minus one. must be power of two */
1591 unsigned workgroup_size
; /* if known; otherwise UINT_MAX */
1593 bool xnack_enabled
= false;
1594 bool sram_ecc_enabled
= false;
1595 bool has_fast_fma32
= false;
1597 bool needs_vcc
= false;
1598 bool needs_flat_scr
= false;
/* When set, passes accumulate counters into the statistics array. */
1600 bool collect_statistics
= false;
1601 uint32_t statistics
[num_statistics
];
/* Debug-message callback (declared inside a wrapper struct upstream). */
1604 void (*func
)(void *private_data
,
1605 enum radv_compiler_debug_level level
,
1606 const char *message
);
/* Hands out a fresh temp id; ids are capped at 2^24 - 1. */
1610 uint32_t allocateId()
1612 assert(allocationID
<= 16777215);
1613 return allocationID
++;
/* Returns the next id without consuming it. */
1616 uint32_t peekAllocationId()
1618 return allocationID
;
1621 void setAllocationId(uint32_t id
)
/* Appends a new empty block and stamps it with the current fp mode. */
1626 Block
* create_and_insert_block() {
1627 blocks
.emplace_back(blocks
.size());
1628 blocks
.back().fp_mode
= next_fp_mode
;
1629 return &blocks
.back();
/* Moves an existing block to the end of the block list, re-indexing it. */
1632 Block
* insert_block(Block
&& block
) {
1633 block
.index
= blocks
.size();
1634 block
.fp_mode
= next_fp_mode
;
1635 blocks
.emplace_back(std::move(block
));
1636 return &blocks
.back();
/* Next temp id to hand out; starts at 1. */
1640 uint32_t allocationID
= 1;
/* NOTE(review): truncated copy — the TempHash struct header (orig. 1643),
 * the body of its operator() (orig. 1645-1647) and the header of the `live`
 * result struct (orig. ~1650) are missing; restore from upstream. */
1644 std::size_t operator()(Temp t
) const {
/* Hash functor over Temp, used by TempSet below. */
1648 using TempSet
= std::unordered_set
<Temp
, TempHash
>;
/* Result of liveness analysis (fields of the `live` struct). */
1651 /* live temps out per block */
1652 std::vector
<TempSet
> live_out
;
1653 /* register demand (sgpr/vgpr) per instruction per block */
1654 std::vector
<std::vector
<RegisterDemand
>> register_demand
;
/* Initializes a Program with target and shader information. */
1659 void init_program(Program
*program
, Stage stage
, struct radv_shader_info
*info
,
1660 enum chip_class chip_class
, enum radeon_family family
,
1661 ac_shader_config
*config
);
/* Instruction selection entry points: build ACO IR from NIR shaders. */
1663 void select_program(Program
*program
,
1664 unsigned shader_count
,
1665 struct nir_shader
*const *shaders
,
1666 ac_shader_config
* config
,
1667 struct radv_shader_args
*args
);
1668 void select_gs_copy_shader(Program
*program
, struct nir_shader
*gs_shader
,
1669 ac_shader_config
* config
,
1670 struct radv_shader_args
*args
);
1671 void select_trap_handler_shader(Program
*program
, struct nir_shader
*shader
,
1672 ac_shader_config
* config
,
1673 struct radv_shader_args
*args
);
/* Compiler passes, roughly in pipeline order. */
1675 void lower_wqm(Program
* program
, live
& live_vars
,
1676 const struct radv_nir_compiler_options
*options
);
1677 void lower_phis(Program
* program
);
1678 void calc_min_waves(Program
* program
);
1679 void update_vgpr_sgpr_demand(Program
* program
, const RegisterDemand new_demand
);
/* Computes per-block live-out sets and per-instruction register demand. */
1680 live
live_var_analysis(Program
* program
, const struct radv_nir_compiler_options
*options
);
1681 std::vector
<uint16_t> dead_code_analysis(Program
*program
);
1682 void dominator_tree(Program
* program
);
1683 void insert_exec_mask(Program
*program
);
1684 void value_numbering(Program
* program
);
1685 void optimize(Program
* program
);
1686 void setup_reduce_temp(Program
* program
);
1687 void lower_to_cssa(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1688 void register_allocation(Program
*program
, std::vector
<TempSet
>& live_out_per_block
);
1689 void ssa_elimination(Program
* program
);
1690 void lower_to_hw_instr(Program
* program
);
1691 void schedule_program(Program
* program
, live
& live_vars
);
1692 void spill(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
);
1693 void insert_wait_states(Program
* program
);
1694 void insert_NOPs(Program
* program
);
/* Encodes the program to machine code; returns the size used. */
1695 unsigned emit_program(Program
* program
, std::vector
<uint32_t>& code
);
1696 void print_asm(Program
*program
, std::vector
<uint32_t>& binary
,
1697 unsigned exec_size
, std::ostream
& out
);
/* Validation helpers used by the debug flags. */
1698 bool validate_ir(Program
* program
);
1699 bool validate_ra(Program
* program
, const struct radv_nir_compiler_options
*options
);
/* NOTE(review): the preprocessor conditional around this perfwarn
 * declaration (orig. 1700, 1702) is missing in this copy. */
1701 void perfwarn(Program
*program
, bool cond
, const char *msg
, Instruction
*instr
=NULL
);
/* Release-build stub; the matching #else/#endif lines (orig. 1702, 1704-1705)
 * are missing in this copy. */
1703 #define perfwarn(program, cond, msg, ...) do {} while(0)
/* Statistics collection hooks (active when Program::collect_statistics). */
1706 void collect_presched_stats(Program
*program
);
1707 void collect_preasm_stats(Program
*program
);
1708 void collect_postasm_stats(Program
*program
, const std::vector
<uint32_t>& code
);
/* Textual IR dumping for debugging. */
1710 void aco_print_instr(const Instruction
*instr
, FILE *output
);
1711 void aco_print_program(const Program
*program
, FILE *output
);
/* printf-style diagnostics; use the macros below so file/line are filled in. */
1713 void _aco_perfwarn(Program
*program
, const char *file
, unsigned line
,
1714 const char *fmt
, ...);
1715 void _aco_err(Program
*program
, const char *file
, unsigned line
,
1716 const char *fmt
, ...);
1718 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1719 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
1721 /* utilities for dealing with register demand */
1722 RegisterDemand
get_live_changes(aco_ptr
<Instruction
>& instr
);
1723 RegisterDemand
get_temp_registers(aco_ptr
<Instruction
>& instr
);
1724 RegisterDemand
get_demand_before(RegisterDemand demand
, aco_ptr
<Instruction
>& instr
, aco_ptr
<Instruction
>& instr_before
);
1726 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1727 uint16_t get_extra_sgprs(Program
*program
);
1729 /* get the sgpr/vgpr allocation size required to address a number of sgprs/vgprs */
1730 uint16_t get_sgpr_alloc(Program
*program
, uint16_t addressable_sgprs
);
1731 uint16_t get_vgpr_alloc(Program
*program
, uint16_t addressable_vgprs
);
1733 /* return number of addressable sgprs/vgprs for max_waves */
1734 uint16_t get_addr_sgpr_from_waves(Program
*program
, uint16_t max_waves
);
1735 uint16_t get_addr_vgpr_from_waves(Program
*program
, uint16_t max_waves
);
/* NOTE(review): the header of this opcode-info struct (orig. 1737) is
 * missing in this copy. All tables below are indexed by aco_opcode. */
/* Per-opcode hardware opcode numbers for each chip generation. */
1738 const int16_t opcode_gfx7
[static_cast<int>(aco_opcode::num_opcodes
)];
1739 const int16_t opcode_gfx9
[static_cast<int>(aco_opcode::num_opcodes
)];
1740 const int16_t opcode_gfx10
[static_cast<int>(aco_opcode::num_opcodes
)];
/* Per-opcode capability/classification bitsets. */
1741 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_input_modifiers
;
1742 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> can_use_output_modifiers
;
1743 const std::bitset
<static_cast<int>(aco_opcode::num_opcodes
)> is_atomic
;
/* Per-opcode mnemonic and encoding format. */
1744 const char *name
[static_cast<int>(aco_opcode::num_opcodes
)];
1745 const aco::Format format
[static_cast<int>(aco_opcode::num_opcodes
)];
1746 /* sizes used for input/output modifiers and constants */
1747 const unsigned operand_size
[static_cast<int>(aco_opcode::num_opcodes
)];
1748 const unsigned definition_size
[static_cast<int>(aco_opcode::num_opcodes
)];
/* The single global instance of these tables. */
1751 extern const Info instr_info
;
1755 #endif /* ACO_IR_H */