src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <unordered_set>
  31 #include <bitset>
  32 #include <memory>
  33
  34 #include "nir.h"
  35 #include "ac_binary.h"
  36 #include "amd_family.h"
  37 #include "aco_opcodes.h"
  38 #include "aco_util.h"
  39
  40 #include "vulkan/radv_shader.h"
  41
  42 struct radv_nir_compiler_options;
  43 struct radv_shader_args;
  44 struct radv_shader_info;
  45
  46 namespace aco {
  47
  48 extern uint64_t debug_flags;
  49
  50 enum {
  51    DEBUG_VALIDATE_IR = 0x1,
  52    DEBUG_VALIDATE_RA = 0x2,
  53    DEBUG_PERFWARN = 0x4,
  54    DEBUG_FORCE_WAITCNT = 0x8,
  55 };
  56
  57 /**
  58  * Representation of the instruction's microcode encoding format
  59  * Note: Some Vector ALU Formats can be combined, such that:
  60  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  61  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  62  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  63  *
  64  * (*) The same is applicable for VOP1 and VOPC instructions.
  65  */
  66 enum class Format : std::uint16_t {
  67    /* Pseudo Instruction Format */
  68    PSEUDO = 0,
  69    /* Scalar ALU & Control Formats */
  70    SOP1 = 1,
  71    SOP2 = 2,
  72    SOPK = 3,
  73    SOPP = 4,
  74    SOPC = 5,
  75    /* Scalar Memory Format */
  76    SMEM = 6,
  77    /* LDS/GDS Format */
  78    DS = 8,
  79    /* Vector Memory Buffer Formats */
  80    MTBUF = 9,
  81    MUBUF = 10,
  82    /* Vector Memory Image Format */
  83    MIMG = 11,
  84    /* Export Format */
  85    EXP = 12,
  86    /* Flat Formats */
  87    FLAT = 13,
  88    GLOBAL = 14,
  89    SCRATCH = 15,
  90
  91    PSEUDO_BRANCH = 16,
  92    PSEUDO_BARRIER = 17,
  93    PSEUDO_REDUCTION = 18,
  94
  95    /* Vector ALU Formats */
  96    VOP3P = 19,
  97    VOP1 = 1 << 8,
  98    VOP2 = 1 << 9,
  99    VOPC = 1 << 10,
 100    VOP3 = 1 << 11,
 101    VOP3A = 1 << 11,
 102    VOP3B = 1 << 11,
 103    /* Vector Parameter Interpolation Format */
 104    VINTRP = 1 << 12,
 105    DPP = 1 << 13,
 106    SDWA = 1 << 14,
 107 };
 108
 109 enum storage_class : uint8_t {
 110    storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
 111    storage_buffer = 0x1, /* SSBOs and global memory */
 112    storage_atomic_counter = 0x2, /* not used for Vulkan */
 113    storage_image = 0x4,
 114    storage_shared = 0x8, /* or TCS output */
 115    storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
 116    storage_scratch = 0x20,
 117    storage_vgpr_spill = 0x40,
 118    storage_count = 8,
 119 };
 120
 121 enum memory_semantics : uint8_t {
 122    semantic_none = 0x0,
 123    /* for loads: don't move any access after this load to before this load (even other loads)
 124     * for barriers: don't move any access after the barrier to before any
 125     * atomics/control_barriers/sendmsg_gs_done before the barrier */
 126    semantic_acquire = 0x1,
 127    /* for stores: don't move any access before this store to after this store
 128     * for barriers: don't move any access before the barrier to after any
 129     * atomics/control_barriers/sendmsg_gs_done after the barrier */
 130    semantic_release = 0x2,
 131
 132    /* the rest are for load/stores/atomics only */
 133    /* cannot be DCE'd or CSE'd */
 134    semantic_volatile = 0x4,
 135    /* does not interact with barriers and assumes this lane is the only lane
 136     * accessing this memory */
 137    semantic_private = 0x8,
 138    /* this operation can be reordered around operations of the same storage. says nothing about barriers */
 139    semantic_can_reorder = 0x10,
 140    /* this is a atomic instruction (may only read or write memory) */
 141    semantic_atomic = 0x20,
 142    /* this is instruction both reads and writes memory */
 143    semantic_rmw = 0x40,
 144
 145    semantic_acqrel = semantic_acquire | semantic_release,
 146    semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
 147 };
 148
 149 enum sync_scope : uint8_t {
 150    scope_invocation = 0,
 151    scope_subgroup = 1,
 152    scope_workgroup = 2,
 153    scope_queuefamily = 3,
 154    scope_device = 4,
 155 };
 156
 157 struct memory_sync_info {
 158    memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
 159    memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
 160       : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
 161
 162    storage_class storage:8;
 163    memory_semantics semantics:8;
 164    sync_scope scope:8;
 165
 166    bool operator == (const memory_sync_info& rhs) const {
 167       return storage == rhs.storage &&
 168              semantics == rhs.semantics &&
 169              scope == rhs.scope;
 170    }
 171
 172    bool can_reorder() const {
 173       if (semantics & semantic_acqrel)
 174          return false;
 175       /* Also check storage so that zero-initialized memory_sync_info can be
 176        * reordered. */
 177       return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
 178    }
 179 };
 180 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
 181
 182 enum fp_round {
 183    fp_round_ne = 0,
 184    fp_round_pi = 1,
 185    fp_round_ni = 2,
 186    fp_round_tz = 3,
 187 };
 188
 189 enum fp_denorm {
 190    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
 191     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
 192    fp_denorm_flush = 0x0,
 193    fp_denorm_keep = 0x3,
 194 };
 195
 196 struct float_mode {
 197    /* matches encoding of the MODE register */
 198    union {
 199       struct {
 200           fp_round round32:2;
 201           fp_round round16_64:2;
 202           unsigned denorm32:2;
 203           unsigned denorm16_64:2;
 204       };
 205       struct {
 206          uint8_t round:4;
 207          uint8_t denorm:4;
 208       };
 209       uint8_t val = 0;
 210    };
 211    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
 212    bool preserve_signed_zero_inf_nan32:1;
 213    bool preserve_signed_zero_inf_nan16_64:1;
 214    /* if false, optimizations which may remove denormal flushing can be done */
 215    bool must_flush_denorms32:1;
 216    bool must_flush_denorms16_64:1;
 217    bool care_about_round32:1;
 218    bool care_about_round16_64:1;
 219
 220    /* Returns true if instructions using the mode "other" can safely use the
 221     * current one instead. */
 222    bool canReplace(float_mode other) const noexcept {
 223       return val == other.val &&
 224              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
 225              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
 226              (must_flush_denorms32  || !other.must_flush_denorms32) &&
 227              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
 228              (care_about_round32 || !other.care_about_round32) &&
 229              (care_about_round16_64 || !other.care_about_round16_64);
 230    }
 231 };
 232
 233 constexpr Format asVOP3(Format format) {
 234    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 235 };
 236
 237 constexpr Format asSDWA(Format format) {
 238    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
 239    return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
 240 }
 241
 242 enum class RegType {
 243    none = 0,
 244    sgpr,
 245    vgpr,
 246    linear_vgpr,
 247 };
 248
 249 struct RegClass {
 250
 251    enum RC : uint8_t {
 252       s1 = 1,
 253       s2 = 2,
 254       s3 = 3,
 255       s4 = 4,
 256       s6 = 6,
 257       s8 = 8,
 258       s16 = 16,
 259       v1 = s1 | (1 << 5),
 260       v2 = s2 | (1 << 5),
 261       v3 = s3 | (1 << 5),
 262       v4 = s4 | (1 << 5),
 263       v5 = 5  | (1 << 5),
 264       v6 = 6  | (1 << 5),
 265       v7 = 7  | (1 << 5),
 266       v8 = 8  | (1 << 5),
 267       /* byte-sized register class */
 268       v1b = v1 | (1 << 7),
 269       v2b = v2 | (1 << 7),
 270       v3b = v3 | (1 << 7),
 271       v4b = v4 | (1 << 7),
 272       v6b = v6 | (1 << 7),
 273       v8b = v8 | (1 << 7),
 274       /* these are used for WWM and spills to vgpr */
 275       v1_linear = v1 | (1 << 6),
 276       v2_linear = v2 | (1 << 6),
 277    };
 278
 279    RegClass() = default;
 280    constexpr RegClass(RC rc)
 281       : rc(rc) {}
 282    constexpr RegClass(RegType type, unsigned size)
 283       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 284
 285    constexpr operator RC() const { return rc; }
 286    explicit operator bool() = delete;
 287
 288    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 289    constexpr bool is_subdword() const { return rc & (1 << 7); }
 290    constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
 291    //TODO: use size() less in favor of bytes()
 292    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
 293    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 294    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 295    constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
 296
 297    static constexpr RegClass get(RegType type, unsigned bytes) {
 298       if (type == RegType::sgpr) {
 299          return RegClass(type, DIV_ROUND_UP(bytes, 4u));
 300       } else {
 301          return bytes % 4u ? RegClass(type, bytes).as_subdword() :
 302                              RegClass(type, bytes / 4u);
 303       }
 304    }
 305
 306 private:
 307    RC rc;
 308 };
 309
 310 /* transitional helper expressions */
 311 static constexpr RegClass s1{RegClass::s1};
 312 static constexpr RegClass s2{RegClass::s2};
 313 static constexpr RegClass s3{RegClass::s3};
 314 static constexpr RegClass s4{RegClass::s4};
 315 static constexpr RegClass s8{RegClass::s8};
 316 static constexpr RegClass s16{RegClass::s16};
 317 static constexpr RegClass v1{RegClass::v1};
 318 static constexpr RegClass v2{RegClass::v2};
 319 static constexpr RegClass v3{RegClass::v3};
 320 static constexpr RegClass v4{RegClass::v4};
 321 static constexpr RegClass v5{RegClass::v5};
 322 static constexpr RegClass v6{RegClass::v6};
 323 static constexpr RegClass v7{RegClass::v7};
 324 static constexpr RegClass v8{RegClass::v8};
 325 static constexpr RegClass v1b{RegClass::v1b};
 326 static constexpr RegClass v2b{RegClass::v2b};
 327 static constexpr RegClass v3b{RegClass::v3b};
 328 static constexpr RegClass v4b{RegClass::v4b};
 329 static constexpr RegClass v6b{RegClass::v6b};
 330 static constexpr RegClass v8b{RegClass::v8b};
 331
 332 /**
 333  * Temp Class
 334  * Each temporary virtual register has a
 335  * register class (i.e. size and type)
 336  * and SSA id.
 337  */
 338 struct Temp {
 339    Temp() noexcept : id_(0), reg_class(0) {}
 340    constexpr Temp(uint32_t id, RegClass cls) noexcept
 341       : id_(id), reg_class(uint8_t(cls)) {}
 342
 343    constexpr uint32_t id() const noexcept { return id_; }
 344    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
 345
 346    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
 347    constexpr unsigned size() const noexcept { return regClass().size(); }
 348    constexpr RegType type() const noexcept { return regClass().type(); }
 349    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
 350
 351    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 352    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 353    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 354
 355 private:
 356    uint32_t id_: 24;
 357    uint32_t reg_class : 8;
 358 };
 359
 360 /**
 361  * PhysReg
 362  * Represents the physical register for each
 363  * Operand and Definition.
 364  */
 365 struct PhysReg {
 366    constexpr PhysReg() = default;
 367    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
 368    constexpr unsigned reg() const { return reg_b >> 2; }
 369    constexpr unsigned byte() const { return reg_b & 0x3; }
 370    constexpr operator unsigned() const { return reg(); }
 371    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
 372    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
 373    constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
 374    constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }
 375
 376    uint16_t reg_b = 0;
 377 };
 378
 379 /* helper expressions for special registers */
 380 static constexpr PhysReg m0{124};
 381 static constexpr PhysReg vcc{106};
 382 static constexpr PhysReg vcc_hi{107};
 383 static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
 384 static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
 385 static constexpr PhysReg ttmp0{112};
 386 static constexpr PhysReg ttmp1{113};
 387 static constexpr PhysReg ttmp2{114};
 388 static constexpr PhysReg ttmp3{115};
 389 static constexpr PhysReg ttmp4{116};
 390 static constexpr PhysReg ttmp5{117};
 391 static constexpr PhysReg ttmp6{118};
 392 static constexpr PhysReg ttmp7{119};
 393 static constexpr PhysReg ttmp8{120};
 394 static constexpr PhysReg ttmp9{121};
 395 static constexpr PhysReg ttmp10{122};
 396 static constexpr PhysReg ttmp11{123};
 397 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 398 static constexpr PhysReg exec{126};
 399 static constexpr PhysReg exec_lo{126};
 400 static constexpr PhysReg exec_hi{127};
 401 static constexpr PhysReg vccz{251};
 402 static constexpr PhysReg execz{252};
 403 static constexpr PhysReg scc{253};
 404
 405 /**
 406  * Operand Class
 407  * Initially, each Operand refers to either
 408  * a temporary virtual register
 409  * or to a constant value
 410  * Temporary registers get mapped to physical register during RA
 411  * Constant values are inlined into the instruction sequence.
 412  */
 413 class Operand final
 414 {
 415 public:
 416    constexpr Operand()
 417       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 418         isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
 419         isLateKill_(false) {}
 420
 421    explicit Operand(Temp r) noexcept
 422    {
 423       data_.temp = r;
 424       if (r.id()) {
 425          isTemp_ = true;
 426       } else {
 427          isUndef_ = true;
 428          setFixed(PhysReg{128});
 429       }
 430    };
 431    explicit Operand(uint8_t v) noexcept
 432    {
 433       /* 8-bit constants are only used for copies and copies from any 8-bit
 434        * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
 435        * to be inline constants. */
 436       data_.i = v;
 437       isConstant_ = true;
 438       constSize = 0;
 439       setFixed(PhysReg{0u});
 440    };
 441    explicit Operand(uint16_t v) noexcept
 442    {
 443       data_.i = v;
 444       isConstant_ = true;
 445       constSize = 1;
 446       if (v <= 64)
 447          setFixed(PhysReg{128u + v});
 448       else if (v >= 0xFFF0) /* [-16 .. -1] */
 449          setFixed(PhysReg{192u + (0xFFFF - v)});
 450       else if (v == 0x3800) /* 0.5 */
 451          setFixed(PhysReg{240});
 452       else if (v == 0xB800) /* -0.5 */
 453          setFixed(PhysReg{241});
 454       else if (v == 0x3C00) /* 1.0 */
 455          setFixed(PhysReg{242});
 456       else if (v == 0xBC00) /* -1.0 */
 457          setFixed(PhysReg{243});
 458       else if (v == 0x4000) /* 2.0 */
 459          setFixed(PhysReg{244});
 460       else if (v == 0xC000) /* -2.0 */
 461          setFixed(PhysReg{245});
 462       else if (v == 0x4400) /* 4.0 */
 463          setFixed(PhysReg{246});
 464       else if (v == 0xC400) /* -4.0 */
 465          setFixed(PhysReg{247});
 466       else if (v == 0x3118) /* 1/2 PI */
 467          setFixed(PhysReg{248});
 468       else /* Literal Constant */
 469          setFixed(PhysReg{255});
 470    };
 471    explicit Operand(uint32_t v, bool is64bit = false) noexcept
 472    {
 473       data_.i = v;
 474       isConstant_ = true;
 475       constSize = is64bit ? 3 : 2;
 476       if (v <= 64)
 477          setFixed(PhysReg{128 + v});
 478       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 479          setFixed(PhysReg{192 - v});
 480       else if (v == 0x3f000000) /* 0.5 */
 481          setFixed(PhysReg{240});
 482       else if (v == 0xbf000000) /* -0.5 */
 483          setFixed(PhysReg{241});
 484       else if (v == 0x3f800000) /* 1.0 */
 485          setFixed(PhysReg{242});
 486       else if (v == 0xbf800000) /* -1.0 */
 487          setFixed(PhysReg{243});
 488       else if (v == 0x40000000) /* 2.0 */
 489          setFixed(PhysReg{244});
 490       else if (v == 0xc0000000) /* -2.0 */
 491          setFixed(PhysReg{245});
 492       else if (v == 0x40800000) /* 4.0 */
 493          setFixed(PhysReg{246});
 494       else if (v == 0xc0800000) /* -4.0 */
 495          setFixed(PhysReg{247});
 496       else { /* Literal Constant */
 497          assert(!is64bit && "attempt to create a 64-bit literal constant");
 498          setFixed(PhysReg{255});
 499       }
 500    };
 501    explicit Operand(uint64_t v) noexcept
 502    {
 503       isConstant_ = true;
 504       constSize = 3;
 505       if (v <= 64) {
 506          data_.i = (uint32_t) v;
 507          setFixed(PhysReg{128 + (uint32_t) v});
 508       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
 509          data_.i = (uint32_t) v;
 510          setFixed(PhysReg{192 - (uint32_t) v});
 511       } else if (v == 0x3FE0000000000000) { /* 0.5 */
 512          data_.i = 0x3f000000;
 513          setFixed(PhysReg{240});
 514       } else if (v == 0xBFE0000000000000) { /* -0.5 */
 515          data_.i = 0xbf000000;
 516          setFixed(PhysReg{241});
 517       } else if (v == 0x3FF0000000000000) { /* 1.0 */
 518          data_.i = 0x3f800000;
 519          setFixed(PhysReg{242});
 520       } else if (v == 0xBFF0000000000000) { /* -1.0 */
 521          data_.i = 0xbf800000;
 522          setFixed(PhysReg{243});
 523       } else if (v == 0x4000000000000000) { /* 2.0 */
 524          data_.i = 0x40000000;
 525          setFixed(PhysReg{244});
 526       } else if (v == 0xC000000000000000) { /* -2.0 */
 527          data_.i = 0xc0000000;
 528          setFixed(PhysReg{245});
 529       } else if (v == 0x4010000000000000) { /* 4.0 */
 530          data_.i = 0x40800000;
 531          setFixed(PhysReg{246});
 532       } else if (v == 0xC010000000000000) { /* -4.0 */
 533          data_.i = 0xc0800000;
 534          setFixed(PhysReg{247});
 535       } else { /* Literal Constant: we don't know if it is a long or double.*/
 536          isConstant_ = 0;
 537          assert(false && "attempt to create a 64-bit literal constant");
 538       }
 539    };
 540    explicit Operand(RegClass type) noexcept
 541    {
 542       isUndef_ = true;
 543       data_.temp = Temp(0, type);
 544       setFixed(PhysReg{128});
 545    };
 546    explicit Operand(PhysReg reg, RegClass type) noexcept
 547    {
 548       data_.temp = Temp(0, type);
 549       setFixed(reg);
 550    }
 551
 552    constexpr bool isTemp() const noexcept
 553    {
 554       return isTemp_;
 555    }
 556
 557    constexpr void setTemp(Temp t) noexcept {
 558       assert(!isConstant_);
 559       isTemp_ = true;
 560       data_.temp = t;
 561    }
 562
 563    constexpr Temp getTemp() const noexcept
 564    {
 565       return data_.temp;
 566    }
 567
 568    constexpr uint32_t tempId() const noexcept
 569    {
 570       return data_.temp.id();
 571    }
 572
 573    constexpr bool hasRegClass() const noexcept
 574    {
 575       return isTemp() || isUndefined();
 576    }
 577
 578    constexpr RegClass regClass() const noexcept
 579    {
 580       return data_.temp.regClass();
 581    }
 582
 583    constexpr unsigned bytes() const noexcept
 584    {
 585       if (isConstant())
 586          return 1 << constSize;
 587       else
 588          return data_.temp.bytes();
 589    }
 590
 591    constexpr unsigned size() const noexcept
 592    {
 593       if (isConstant())
 594          return constSize > 2 ? 2 : 1;
 595       else
 596          return data_.temp.size();
 597    }
 598
 599    constexpr bool isFixed() const noexcept
 600    {
 601       return isFixed_;
 602    }
 603
 604    constexpr PhysReg physReg() const noexcept
 605    {
 606       return reg_;
 607    }
 608
 609    constexpr void setFixed(PhysReg reg) noexcept
 610    {
 611       isFixed_ = reg != unsigned(-1);
 612       reg_ = reg;
 613    }
 614
 615    constexpr bool isConstant() const noexcept
 616    {
 617       return isConstant_;
 618    }
 619
 620    constexpr bool isLiteral() const noexcept
 621    {
 622       return isConstant() && reg_ == 255;
 623    }
 624
 625    constexpr bool isUndefined() const noexcept
 626    {
 627       return isUndef_;
 628    }
 629
 630    constexpr uint32_t constantValue() const noexcept
 631    {
 632       return data_.i;
 633    }
 634
 635    constexpr bool constantEquals(uint32_t cmp) const noexcept
 636    {
 637       return isConstant() && constantValue() == cmp;
 638    }
 639
 640    constexpr uint64_t constantValue64(bool signext=false) const noexcept
 641    {
 642       if (constSize == 3) {
 643          if (reg_ <= 192)
 644             return reg_ - 128;
 645          else if (reg_ <= 208)
 646             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
 647
 648          switch (reg_) {
 649          case 240:
 650             return 0x3FE0000000000000;
 651          case 241:
 652             return 0xBFE0000000000000;
 653          case 242:
 654             return 0x3FF0000000000000;
 655          case 243:
 656             return 0xBFF0000000000000;
 657          case 244:
 658             return 0x4000000000000000;
 659          case 245:
 660             return 0xC000000000000000;
 661          case 246:
 662             return 0x4010000000000000;
 663          case 247:
 664             return 0xC010000000000000;
 665          }
 666       } else if (constSize == 1) {
 667          return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
 668       } else if (constSize == 0) {
 669          return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
 670       }
 671       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
 672    }
 673
 674    constexpr bool isOfType(RegType type) const noexcept
 675    {
 676       return hasRegClass() && regClass().type() == type;
 677    }
 678
 679    /* Indicates that the killed operand's live range intersects with the
 680     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
 681     * not set by liveness analysis. */
 682    constexpr void setLateKill(bool flag) noexcept
 683    {
 684       isLateKill_ = flag;
 685    }
 686
 687    constexpr bool isLateKill() const noexcept
 688    {
 689       return isLateKill_;
 690    }
 691
 692    constexpr void setKill(bool flag) noexcept
 693    {
 694       isKill_ = flag;
 695       if (!flag)
 696          setFirstKill(false);
 697    }
 698
 699    constexpr bool isKill() const noexcept
 700    {
 701       return isKill_ || isFirstKill();
 702    }
 703
 704    constexpr void setFirstKill(bool flag) noexcept
 705    {
 706       isFirstKill_ = flag;
 707       if (flag)
 708          setKill(flag);
 709    }
 710
 711    /* When there are multiple operands killing the same temporary,
 712     * isFirstKill() is only returns true for the first one. */
 713    constexpr bool isFirstKill() const noexcept
 714    {
 715       return isFirstKill_;
 716    }
 717
 718    constexpr bool isKillBeforeDef() const noexcept
 719    {
 720       return isKill() && !isLateKill();
 721    }
 722
 723    constexpr bool isFirstKillBeforeDef() const noexcept
 724    {
 725       return isFirstKill() && !isLateKill();
 726    }
 727
 728    constexpr bool operator == (Operand other) const noexcept
 729    {
 730       if (other.size() != size())
 731          return false;
 732       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
 733          return false;
 734       if (isFixed() && other.isFixed() && physReg() != other.physReg())
 735          return false;
 736       if (isLiteral())
 737          return other.isLiteral() && other.constantValue() == constantValue();
 738       else if (isConstant())
 739          return other.isConstant() && other.physReg() == physReg();
 740       else if (isUndefined())
 741          return other.isUndefined() && other.regClass() == regClass();
 742       else
 743          return other.isTemp() && other.getTemp() == getTemp();
 744    }
 745 private:
 746    union {
 747       uint32_t i;
 748       float f;
 749       Temp temp = Temp(0, s1);
 750    } data_;
 751    PhysReg reg_;
 752    union {
 753       struct {
 754          uint8_t isTemp_:1;
 755          uint8_t isFixed_:1;
 756          uint8_t isConstant_:1;
 757          uint8_t isKill_:1;
 758          uint8_t isUndef_:1;
 759          uint8_t isFirstKill_:1;
 760          uint8_t constSize:2;
 761          uint8_t isLateKill_:1;
 762       };
 763       /* can't initialize bit-fields in c++11, so work around using a union */
 764       uint16_t control_ = 0;
 765    };
 766 };
 767
 768 /**
 769  * Definition Class
 770  * Definitions are the results of Instructions
 771  * and refer to temporary virtual registers
 772  * which are later mapped to physical registers
 773  */
 774 class Definition final
 775 {
 776 public:
 777    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
 778                             isKill_(0), isPrecise_(0), isNUW_(0) {}
 779    Definition(uint32_t index, RegClass type) noexcept
 780       : temp(index, type) {}
 781    explicit Definition(Temp tmp) noexcept
 782       : temp(tmp) {}
 783    Definition(PhysReg reg, RegClass type) noexcept
 784       : temp(Temp(0, type))
 785    {
 786       setFixed(reg);
 787    }
 788    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 789       : temp(Temp(tmpId, type))
 790    {
 791       setFixed(reg);
 792    }
 793
 794    constexpr bool isTemp() const noexcept
 795    {
 796       return tempId() > 0;
 797    }
 798
 799    constexpr Temp getTemp() const noexcept
 800    {
 801       return temp;
 802    }
 803
 804    constexpr uint32_t tempId() const noexcept
 805    {
 806       return temp.id();
 807    }
 808
 809    constexpr void setTemp(Temp t) noexcept {
 810       temp = t;
 811    }
 812
 813    constexpr RegClass regClass() const noexcept
 814    {
 815       return temp.regClass();
 816    }
 817
 818    constexpr unsigned bytes() const noexcept
 819    {
 820       return temp.bytes();
 821    }
 822
 823    constexpr unsigned size() const noexcept
 824    {
 825       return temp.size();
 826    }
 827
 828    constexpr bool isFixed() const noexcept
 829    {
 830       return isFixed_;
 831    }
 832
 833    constexpr PhysReg physReg() const noexcept
 834    {
 835       return reg_;
 836    }
 837
 838    constexpr void setFixed(PhysReg reg) noexcept
 839    {
 840       isFixed_ = 1;
 841       reg_ = reg;
 842    }
 843
 844    constexpr void setHint(PhysReg reg) noexcept
 845    {
 846       hasHint_ = 1;
 847       reg_ = reg;
 848    }
 849
 850    constexpr bool hasHint() const noexcept
 851    {
 852       return hasHint_;
 853    }
 854
 855    constexpr void setKill(bool flag) noexcept
 856    {
 857       isKill_ = flag;
 858    }
 859
 860    constexpr bool isKill() const noexcept
 861    {
 862       return isKill_;
 863    }
 864
 865    constexpr void setPrecise(bool precise) noexcept
 866    {
 867       isPrecise_ = precise;
 868    }
 869
 870    constexpr bool isPrecise() const noexcept
 871    {
 872       return isPrecise_;
 873    }
 874
 875    /* No Unsigned Wrap */
 876    constexpr void setNUW(bool nuw) noexcept
 877    {
 878       isNUW_ = nuw;
 879    }
 880
 881    constexpr bool isNUW() const noexcept
 882    {
 883       return isNUW_;
 884    }
 885
 886 private:
 887    Temp temp = Temp(0, s1);
 888    PhysReg reg_;
 889    union {
 890       struct {
 891          uint8_t isFixed_:1;
 892          uint8_t hasHint_:1;
 893          uint8_t isKill_:1;
 894          uint8_t isPrecise_:1;
 895          uint8_t isNUW_:1;
 896       };
 897       /* can't initialize bit-fields in c++11, so work around using a union */
 898       uint8_t control_ = 0;
 899    };
 900 };
 901
 902 struct Block;
 903
 904 struct Instruction {
 905    aco_opcode opcode;
 906    Format format;
 907    uint32_t pass_flags;
 908
 909    aco::span<Operand> operands;
 910    aco::span<Definition> definitions;
 911
 912    constexpr bool isVALU() const noexcept
 913    {
 914       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 915           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 916           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 917           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 918           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 919           || format == Format::VOP3P;
 920    }
 921
 922    constexpr bool isSALU() const noexcept
 923    {
 924       return format == Format::SOP1 ||
 925              format == Format::SOP2 ||
 926              format == Format::SOPC ||
 927              format == Format::SOPK ||
 928              format == Format::SOPP;
 929    }
 930
 931    constexpr bool isVMEM() const noexcept
 932    {
 933       return format == Format::MTBUF ||
 934              format == Format::MUBUF ||
 935              format == Format::MIMG;
 936    }
 937
 938    constexpr bool isDPP() const noexcept
 939    {
 940       return (uint16_t) format & (uint16_t) Format::DPP;
 941    }
 942
 943    constexpr bool isVOP3() const noexcept
 944    {
 945       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 946              ((uint16_t) format & (uint16_t) Format::VOP3B);
 947    }
 948
 949    constexpr bool isSDWA() const noexcept
 950    {
 951       return (uint16_t) format & (uint16_t) Format::SDWA;
 952    }
 953
 954    constexpr bool isFlatOrGlobal() const noexcept
 955    {
 956       return format == Format::FLAT || format == Format::GLOBAL;
 957    }
 958
 959    constexpr bool usesModifiers() const noexcept;
 960
 961    constexpr bool reads_exec() const noexcept
 962    {
 963       for (const Operand& op : operands) {
 964          if (op.isFixed() && op.physReg() == exec)
 965             return true;
 966       }
 967       return false;
 968    }
 969 };
 970 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
 971
 972 struct SOPK_instruction : public Instruction {
 973    uint16_t imm;
 974    uint16_t padding;
 975 };
 976 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 977
 978 struct SOPP_instruction : public Instruction {
 979    uint32_t imm;
 980    int block;
 981 };
 982 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 983
 984 struct SOPC_instruction : public Instruction {
 985 };
 986 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 987
 988 struct SOP1_instruction : public Instruction {
 989 };
 990 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 991
 992 struct SOP2_instruction : public Instruction {
 993 };
 994 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 995
 996 /**
 997  * Scalar Memory Format:
 998  * For s_(buffer_)load_dword*:
 999  * Operand(0): SBASE - SGPR-pair which provides base address
1000  * Operand(1): Offset - immediate (un)signed offset or SGPR
1001  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1002  * Operand(n-1): SOffset - SGPR offset (Vega only)
1003  *
1004  * Having no operands is also valid for instructions such as s_dcache_inv.
1005  *
1006  */
1007 struct SMEM_instruction : public Instruction {
1008    memory_sync_info sync;
1009    bool glc : 1; /* VI+: globally coherent */
1010    bool dlc : 1; /* NAVI: device level coherent */
1011    bool nv : 1; /* VEGA only: Non-volatile */
1012    bool disable_wqm : 1;
1013    bool prevent_overflow : 1; /* avoid overflow when combining additions */
1014    uint32_t padding: 3;
1015 };
1016 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1017
1018 struct VOP1_instruction : public Instruction {
1019 };
1020 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1021
1022 struct VOP2_instruction : public Instruction {
1023 };
1024 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1025
1026 struct VOPC_instruction : public Instruction {
1027 };
1028 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1029
1030 struct VOP3A_instruction : public Instruction {
1031    bool abs[3];
1032    bool neg[3];
1033    uint8_t opsel : 4;
1034    uint8_t omod : 2;
1035    bool clamp : 1;
1036    uint32_t padding : 9;
1037 };
1038 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1039
1040 struct VOP3P_instruction : public Instruction {
1041    bool neg_lo[3];
1042    bool neg_hi[3];
1043    uint8_t opsel_lo : 3;
1044    uint8_t opsel_hi : 3;
1045    bool clamp : 1;
1046    uint32_t padding : 9;
1047 };
1048 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1049
1050 /**
1051  * Data Parallel Primitives Format:
1052  * This format can be used for VOP1, VOP2 or VOPC instructions.
1053  * The swizzle applies to the src0 operand.
1054  *
1055  */
1056 struct DPP_instruction : public Instruction {
1057    bool abs[2];
1058    bool neg[2];
1059    uint16_t dpp_ctrl;
1060    uint8_t row_mask : 4;
1061    uint8_t bank_mask : 4;
1062    bool bound_ctrl : 1;
1063    uint32_t padding : 7;
1064 };
1065 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1066
1067 enum sdwa_sel : uint8_t {
1068     /* masks */
1069     sdwa_wordnum = 0x1,
1070     sdwa_bytenum = 0x3,
1071     sdwa_asuint = 0x7 | 0x10,
1072     sdwa_rasize = 0x3,
1073
1074     /* flags */
1075     sdwa_isword = 0x4,
1076     sdwa_sext = 0x8,
1077     sdwa_isra = 0x10,
1078
1079     /* specific values */
1080     sdwa_ubyte0 = 0,
1081     sdwa_ubyte1 = 1,
1082     sdwa_ubyte2 = 2,
1083     sdwa_ubyte3 = 3,
1084     sdwa_uword0 = sdwa_isword | 0,
1085     sdwa_uword1 = sdwa_isword | 1,
1086     sdwa_udword = 6,
1087
1088     sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
1089     sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
1090     sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
1091     sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
1092     sdwa_sword0 = sdwa_uword0 | sdwa_sext,
1093     sdwa_sword1 = sdwa_uword1 | sdwa_sext,
1094     sdwa_sdword = sdwa_udword | sdwa_sext,
1095
1096     /* register-allocated */
1097     sdwa_ubyte = 1 | sdwa_isra,
1098     sdwa_uword = 2 | sdwa_isra,
1099     sdwa_sbyte = sdwa_ubyte | sdwa_sext,
1100     sdwa_sword = sdwa_uword | sdwa_sext,
1101 };
1102
1103 /**
1104  * Sub-Dword Addressing Format:
1105  * This format can be used for VOP1, VOP2 or VOPC instructions.
1106  *
1107  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1108  * the definition doesn't have to be VCC on GFX9+.
1109  *
1110  */
1111 struct SDWA_instruction : public Instruction {
1112    /* these destination modifiers aren't available with VOPC except for
1113     * clamp on GFX8 */
1114    uint8_t sel[2];
1115    uint8_t dst_sel;
1116    bool neg[2];
1117    bool abs[2];
1118    bool dst_preserve : 1;
1119    bool clamp : 1;
1120    uint8_t omod : 2; /* GFX9+ */
1121    uint32_t padding : 4;
1122 };
1123 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1124
1125 struct Interp_instruction : public Instruction {
1126    uint8_t attribute;
1127    uint8_t component;
1128    uint16_t padding;
1129 };
1130 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1131
1132 /**
1133  * Local and Global Data Sharing instructions
1134  * Operand(0): ADDR - VGPR which supplies the address.
1135  * Operand(1): DATA0 - First data VGPR.
1136  * Operand(2): DATA1 - Second data VGPR.
1137  * Operand(n-1): M0 - LDS size.
1138  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1139  *
1140  */
1141 struct DS_instruction : public Instruction {
1142    memory_sync_info sync;
1143    bool gds;
1144    int16_t offset0;
1145    int8_t offset1;
1146    uint8_t padding;
1147 };
1148 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1149
1150 /**
1151  * Vector Memory Untyped-buffer Instructions
1152  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1153  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1154  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1155  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1156  *
1157  */
1158 struct MUBUF_instruction : public Instruction {
1159    memory_sync_info sync;
1160    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1161    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1162    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1163    bool glc : 1; /* globally coherent */
1164    bool dlc : 1; /* NAVI: device level coherent */
1165    bool slc : 1; /* system level coherent */
1166    bool tfe : 1; /* texture fail enable */
1167    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1168    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1169    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1170    bool swizzled : 1;
1171    uint32_t padding1 : 18;
1172 };
1173 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1174
1175 /**
1176  * Vector Memory Typed-buffer Instructions
1177  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1178  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1179  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1180  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1181  *
1182  */
1183 struct MTBUF_instruction : public Instruction {
1184    memory_sync_info sync;
1185    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1186    uint8_t nfmt : 3; /* Numeric format of data in memory */
1187    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1188    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1189    bool glc : 1; /* globally coherent */
1190    bool dlc : 1; /* NAVI: device level coherent */
1191    bool slc : 1; /* system level coherent */
1192    bool tfe : 1; /* texture fail enable */
1193    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1194    uint32_t padding : 10;
1195    uint16_t offset; /* Unsigned byte offset - 12 bit */
1196 };
1197 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1198
1199 /**
1200  * Vector Memory Image Instructions
1201  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1202  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1203  *             or VDATA - Vector GPR for write data.
1204  * Operand(2): VADDR - Address source. Can carry an offset or an index.
1205  * Definition(0): VDATA - Vector GPR for read result.
1206  *
1207  */
1208 struct MIMG_instruction : public Instruction {
1209    memory_sync_info sync;
1210    uint8_t dmask; /* Data VGPR enable mask */
1211    uint8_t dim : 3; /* NAVI: dimensionality */
1212    bool unrm : 1; /* Force address to be un-normalized */
1213    bool dlc : 1; /* NAVI: device level coherent */
1214    bool glc : 1; /* globally coherent */
1215    bool slc : 1; /* system level coherent */
1216    bool tfe : 1; /* texture fail enable */
1217    bool da : 1; /* declare an array */
1218    bool lwe : 1; /* Force data to be un-normalized */
1219    bool r128 : 1; /* NAVI: Texture resource size */
1220    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1221    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1222    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1223    uint32_t padding : 18;
1224 };
1225 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1226
1227 /**
1228  * Flat/Scratch/Global Instructions
1229  * Operand(0): ADDR
1230  * Operand(1): SADDR
1231  * Operand(2) / Definition(0): DATA/VDST
1232  *
1233  */
1234 struct FLAT_instruction : public Instruction {
1235    memory_sync_info sync;
1236    bool slc : 1; /* system level coherent */
1237    bool glc : 1; /* globally coherent */
1238    bool dlc : 1; /* NAVI: device level coherent */
1239    bool lds : 1;
1240    bool nv : 1;
1241    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1242    uint32_t padding0 : 2;
1243    uint16_t offset; /* Vega/Navi only */
1244    uint16_t padding1;
1245 };
1246 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1247
1248 struct Export_instruction : public Instruction {
1249    uint8_t enabled_mask;
1250    uint8_t dest;
1251    bool compressed : 1;
1252    bool done : 1;
1253    bool valid_mask : 1;
1254    uint32_t padding : 13;
1255 };
1256 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1257
1258 struct Pseudo_instruction : public Instruction {
1259    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1260    bool tmp_in_scc;
1261    uint8_t padding;
1262 };
1263 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1264
1265 struct Pseudo_branch_instruction : public Instruction {
1266    /* target[0] is the block index of the branch target.
1267     * For conditional branches, target[1] contains the fall-through alternative.
1268     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1269     */
1270    uint32_t target[2];
1271 };
1272 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1273
1274 struct Pseudo_barrier_instruction : public Instruction {
1275    memory_sync_info sync;
1276    sync_scope exec_scope;
1277 };
1278 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1279
1280 enum ReduceOp : uint16_t {
1281    iadd8, iadd16, iadd32, iadd64,
1282    imul8, imul16, imul32, imul64,
1283           fadd16, fadd32, fadd64,
1284           fmul16, fmul32, fmul64,
1285    imin8, imin16, imin32, imin64,
1286    imax8, imax16, imax32, imax64,
1287    umin8, umin16, umin32, umin64,
1288    umax8, umax16, umax32, umax64,
1289           fmin16, fmin32, fmin64,
1290           fmax16, fmax32, fmax64,
1291    iand8, iand16, iand32, iand64,
1292    ior8, ior16, ior32, ior64,
1293    ixor8, ixor16, ixor32, ixor64,
1294 };
1295
1296 /**
1297  * Subgroup Reduction Instructions, everything except for the data to be
1298  * reduced and the result as inserted by setup_reduce_temp().
1299  * Operand(0): data to be reduced
1300  * Operand(1): reduce temporary
1301  * Operand(2): vector temporary
1302  * Definition(0): result
1303  * Definition(1): scalar temporary
1304  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1305  * Definition(3): scc clobber
1306  * Definition(4): vcc clobber
1307  *
1308  */
1309 struct Pseudo_reduction_instruction : public Instruction {
1310    ReduceOp reduce_op;
1311    uint16_t cluster_size; // must be 0 for scans
1312 };
1313 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1314
1315 struct instr_deleter_functor {
1316    void operator()(void* p) {
1317       free(p);
1318    }
1319 };
1320
1321 template<typename T>
1322 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1323
1324 template<typename T>
1325 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1326 {
1327    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1328    char *data = (char*) calloc(1, size);
1329    T* inst = (T*) data;
1330
1331    inst->opcode = opcode;
1332    inst->format = format;
1333
1334    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1335    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1336    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1337    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1338
1339    return inst;
1340 }
1341
1342 constexpr bool Instruction::usesModifiers() const noexcept
1343 {
1344    if (isDPP() || isSDWA())
1345       return true;
1346
1347    if (format == Format::VOP3P) {
1348       const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1349       for (unsigned i = 0; i < operands.size(); i++) {
1350          if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1351             return true;
1352       }
1353       return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1354    } else if (isVOP3()) {
1355       const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1356       for (unsigned i = 0; i < operands.size(); i++) {
1357          if (vop3->abs[i] || vop3->neg[i])
1358             return true;
1359       }
1360       return vop3->opsel || vop3->clamp || vop3->omod;
1361    }
1362    return false;
1363 }
1364
1365 constexpr bool is_phi(Instruction* instr)
1366 {
1367    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1368 }
1369
1370 static inline bool is_phi(aco_ptr<Instruction>& instr)
1371 {
1372    return is_phi(instr.get());
1373 }
1374
1375 memory_sync_info get_sync_info(const Instruction* instr);
1376
1377 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1378
1379 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1380 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1381 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1382 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1383
1384 enum block_kind {
1385    /* uniform indicates that leaving this block,
1386     * all actives lanes stay active */
1387    block_kind_uniform = 1 << 0,
1388    block_kind_top_level = 1 << 1,
1389    block_kind_loop_preheader = 1 << 2,
1390    block_kind_loop_header = 1 << 3,
1391    block_kind_loop_exit = 1 << 4,
1392    block_kind_continue = 1 << 5,
1393    block_kind_break = 1 << 6,
1394    block_kind_continue_or_break = 1 << 7,
1395    block_kind_discard = 1 << 8,
1396    block_kind_branch = 1 << 9,
1397    block_kind_merge = 1 << 10,
1398    block_kind_invert = 1 << 11,
1399    block_kind_uses_discard_if = 1 << 12,
1400    block_kind_needs_lowering = 1 << 13,
1401    block_kind_uses_demote = 1 << 14,
1402    block_kind_export_end = 1 << 15,
1403 };
1404
1405
1406 struct RegisterDemand {
1407    constexpr RegisterDemand() = default;
1408    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1409       : vgpr{v}, sgpr{s} {}
1410    int16_t vgpr = 0;
1411    int16_t sgpr = 0;
1412
1413    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1414       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1415    }
1416
1417    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1418       return vgpr > other.vgpr || sgpr > other.sgpr;
1419    }
1420
1421    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1422       if (t.type() == RegType::sgpr)
1423          return RegisterDemand( vgpr, sgpr + t.size() );
1424       else
1425          return RegisterDemand( vgpr + t.size(), sgpr );
1426    }
1427
1428    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1429       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1430    }
1431
1432    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1433       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1434    }
1435
1436    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1437       vgpr += other.vgpr;
1438       sgpr += other.sgpr;
1439       return *this;
1440    }
1441
1442    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1443       vgpr -= other.vgpr;
1444       sgpr -= other.sgpr;
1445       return *this;
1446    }
1447
1448    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1449       if (t.type() == RegType::sgpr)
1450          sgpr += t.size();
1451       else
1452          vgpr += t.size();
1453       return *this;
1454    }
1455
1456    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1457       if (t.type() == RegType::sgpr)
1458          sgpr -= t.size();
1459       else
1460          vgpr -= t.size();
1461       return *this;
1462    }
1463
1464    constexpr void update(const RegisterDemand other) noexcept {
1465       vgpr = std::max(vgpr, other.vgpr);
1466       sgpr = std::max(sgpr, other.sgpr);
1467    }
1468
1469 };
1470
1471 /* CFG */
1472 struct Block {
1473    float_mode fp_mode;
1474    unsigned index;
1475    unsigned offset = 0;
1476    std::vector<aco_ptr<Instruction>> instructions;
1477    std::vector<unsigned> logical_preds;
1478    std::vector<unsigned> linear_preds;
1479    std::vector<unsigned> logical_succs;
1480    std::vector<unsigned> linear_succs;
1481    RegisterDemand register_demand = RegisterDemand();
1482    uint16_t loop_nest_depth = 0;
1483    uint16_t kind = 0;
1484    int logical_idom = -1;
1485    int linear_idom = -1;
1486    Temp live_out_exec = Temp();
1487
1488    /* this information is needed for predecessors to blocks with phis when
1489     * moving out of ssa */
1490    bool scc_live_out = false;
1491    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1492
1493    Block(unsigned idx) : index(idx) {}
1494    Block() : index(0) {}
1495 };
1496
1497 using Stage = uint16_t;
1498
1499 /* software stages */
1500 static constexpr Stage sw_vs = 1 << 0;
1501 static constexpr Stage sw_gs = 1 << 1;
1502 static constexpr Stage sw_tcs = 1 << 2;
1503 static constexpr Stage sw_tes = 1 << 3;
1504 static constexpr Stage sw_fs = 1 << 4;
1505 static constexpr Stage sw_cs = 1 << 5;
1506 static constexpr Stage sw_gs_copy = 1 << 6;
1507 static constexpr Stage sw_mask = 0x7f;
1508
1509 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1510 static constexpr Stage hw_vs = 1 << 7;
1511 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1512 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
1513 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
1514 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1515 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1516 static constexpr Stage hw_fs = 1 << 13;
1517 static constexpr Stage hw_cs = 1 << 14;
1518 static constexpr Stage hw_mask = 0xff << 7;
1519
1520 /* possible settings of Program::stage */
1521 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1522 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1523 static constexpr Stage compute_cs = sw_cs | hw_cs;
1524 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1525 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1526 /* GFX10/NGG */
1527 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
1528 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
1529 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
1530 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
1531 /* GFX9 (and GFX10 if NGG isn't used) */
1532 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1533 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1534 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1535 /* pre-GFX9 */
1536 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1537 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1538 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1539 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1540 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1541
1542 enum statistic {
1543    statistic_hash,
1544    statistic_instructions,
1545    statistic_copies,
1546    statistic_branches,
1547    statistic_cycles,
1548    statistic_vmem_clauses,
1549    statistic_smem_clauses,
1550    statistic_vmem_score,
1551    statistic_smem_score,
1552    statistic_sgpr_presched,
1553    statistic_vgpr_presched,
1554    num_statistics
1555 };
1556
1557 class Program final {
1558 public:
1559    float_mode next_fp_mode;
1560    std::vector<Block> blocks;
1561    RegisterDemand max_reg_demand = RegisterDemand();
1562    uint16_t num_waves = 0;
1563    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1564    ac_shader_config* config;
1565    struct radv_shader_info *info;
1566    enum chip_class chip_class;
1567    enum radeon_family family;
1568    unsigned wave_size;
1569    RegClass lane_mask;
1570    Stage stage; /* Stage */
1571    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1572    bool needs_wqm = false; /* there exists a p_wqm instruction */
1573    bool wb_smem_l1_on_end = false;
1574
1575    std::vector<uint8_t> constant_data;
1576    Temp private_segment_buffer;
1577    Temp scratch_offset;
1578
1579    uint16_t min_waves = 0;
1580    uint16_t lds_alloc_granule;
1581    uint32_t lds_limit; /* in bytes */
1582    bool has_16bank_lds;
1583    uint16_t vgpr_limit;
1584    uint16_t sgpr_limit;
1585    uint16_t physical_sgprs;
1586    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1587    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1588    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1589
1590    bool xnack_enabled = false;
1591    bool sram_ecc_enabled = false;
1592    bool has_fast_fma32 = false;
1593
1594    bool needs_vcc = false;
1595    bool needs_flat_scr = false;
1596
1597    bool collect_statistics = false;
1598    uint32_t statistics[num_statistics];
1599
1600    struct {
1601       void (*func)(void *private_data,
1602                    enum radv_compiler_debug_level level,
1603                    const char *message);
1604       void *private_data;
1605    } debug;
1606
1607    uint32_t allocateId()
1608    {
1609       assert(allocationID <= 16777215);
1610       return allocationID++;
1611    }
1612
1613    uint32_t peekAllocationId()
1614    {
1615       return allocationID;
1616    }
1617
1618    void setAllocationId(uint32_t id)
1619    {
1620       allocationID = id;
1621    }
1622
1623    Block* create_and_insert_block() {
1624       blocks.emplace_back(blocks.size());
1625       blocks.back().fp_mode = next_fp_mode;
1626       return &blocks.back();
1627    }
1628
1629    Block* insert_block(Block&& block) {
1630       block.index = blocks.size();
1631       block.fp_mode = next_fp_mode;
1632       blocks.emplace_back(std::move(block));
1633       return &blocks.back();
1634    }
1635
1636 private:
1637    uint32_t allocationID = 1;
1638 };
1639
1640 struct TempHash {
1641    std::size_t operator()(Temp t) const {
1642       return t.id();
1643    }
1644 };
1645 using TempSet = std::unordered_set<Temp, TempHash>;
1646
1647 struct live {
1648    /* live temps out per block */
1649    std::vector<TempSet> live_out;
1650    /* register demand (sgpr/vgpr) per instruction per block */
1651    std::vector<std::vector<RegisterDemand>> register_demand;
1652 };
1653
1654 void init();
1655
1656 void init_program(Program *program, Stage stage, struct radv_shader_info *info,
1657                   enum chip_class chip_class, enum radeon_family family,
1658                   ac_shader_config *config);
1659
1660 void select_program(Program *program,
1661                     unsigned shader_count,
1662                     struct nir_shader *const *shaders,
1663                     ac_shader_config* config,
1664                     struct radv_shader_args *args);
1665 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1666                            ac_shader_config* config,
1667                            struct radv_shader_args *args);
1668
1669 void lower_wqm(Program* program, live& live_vars,
1670                const struct radv_nir_compiler_options *options);
1671 void lower_phis(Program* program);
1672 void calc_min_waves(Program* program);
1673 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1674 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1675 std::vector<uint16_t> dead_code_analysis(Program *program);
1676 void dominator_tree(Program* program);
1677 void insert_exec_mask(Program *program);
1678 void value_numbering(Program* program);
1679 void optimize(Program* program);
1680 void setup_reduce_temp(Program* program);
1681 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1682 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1683 void ssa_elimination(Program* program);
1684 void lower_to_hw_instr(Program* program);
1685 void schedule_program(Program* program, live& live_vars);
1686 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1687 void insert_wait_states(Program* program);
1688 void insert_NOPs(Program* program);
1689 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1690 void print_asm(Program *program, std::vector<uint32_t>& binary,
1691                unsigned exec_size, std::ostream& out);
1692 bool validate_ir(Program* program);
1693 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
1694 #ifndef NDEBUG
1695 void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
1696 #else
1697 #define perfwarn(program, cond, msg, ...) do {} while(0)
1698 #endif
1699
1700 void collect_presched_stats(Program *program);
1701 void collect_preasm_stats(Program *program);
1702 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1703
1704 void aco_print_instr(const Instruction *instr, FILE *output);
1705 void aco_print_program(const Program *program, FILE *output);
1706
1707 void _aco_perfwarn(Program *program, const char *file, unsigned line,
1708                    const char *fmt, ...);
1709 void _aco_err(Program *program, const char *file, unsigned line,
1710               const char *fmt, ...);
1711
1712 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1713 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
1714
1715 /* utilities for dealing with register demand */
1716 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1717 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1718 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1719
1720 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1721 uint16_t get_extra_sgprs(Program *program);
1722
1723 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1724 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1725 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1726
1727 /* return number of addressable sgprs/vgprs for max_waves */
1728 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1729 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1730
1731 typedef struct {
1732    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1733    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1734    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1735    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1736    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1737    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1738    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1739    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1740    /* sizes used for input/output modifiers and constants */
1741    const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
1742    const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
1743 } Info;
1744
1745 extern const Info instr_info;
1746
1747 }
1748
1749 #endif /* ACO_IR_H */
1750