aco: make validate() usable in tests
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Bitmask of debug options enabled for the compiler (DEBUG_* values below). */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* run the IR validator */
   DEBUG_VALIDATE_RA = 0x2, /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,    /* emit performance warnings */
};
53
/**
 * Microcode encoding format of an instruction.
 *
 * The Vector ALU formats VOP1/VOP2/VOPC are single-bit values so they can be
 * combined with an encoding modifier:
 * - VOP2 | VOP3A : a VOP2 instruction in the VOP3A encoding
 * - VOP2 | DPP   : a VOP2 instruction with data parallel primitives
 * - VOP2 | SDWA  : a VOP2 instruction with sub-dword addressing
 * (likewise for VOP1 and VOPC).
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3 */
   VOP3B = 1 << 11, /* alias of VOP3 */
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   /* encoding modifiers, OR'ed into the VOP1/VOP2/VOPC bit */
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory a barrier/instruction interacts
 * with; barrier_count is the number of distinct flag bits. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   barrier_count = 7, /* number of flag bits above */
};
121
/* Float rounding modes, matching the hardware MODE register encoding:
 * nearest-even, toward +inf, toward -inf, toward zero. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
128
/* Denormal handling modes (2-bit fields in the MODE register). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
136 struct float_mode {
137 /* matches encoding of the MODE register */
138 union {
139 struct {
140 fp_round round32:2;
141 fp_round round16_64:2;
142 unsigned denorm32:2;
143 unsigned denorm16_64:2;
144 };
145 struct {
146 uint8_t round:4;
147 uint8_t denorm:4;
148 };
149 uint8_t val = 0;
150 };
151 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
152 bool preserve_signed_zero_inf_nan32:1;
153 bool preserve_signed_zero_inf_nan16_64:1;
154 /* if false, optimizations which may remove denormal flushing can be done */
155 bool must_flush_denorms32:1;
156 bool must_flush_denorms16_64:1;
157 bool care_about_round32:1;
158 bool care_about_round16_64:1;
159
160 /* Returns true if instructions using the mode "other" can safely use the
161 * current one instead. */
162 bool canReplace(float_mode other) const noexcept {
163 return val == other.val &&
164 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
165 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
166 (must_flush_denorms32 || !other.must_flush_denorms32) &&
167 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
168 (care_about_round32 || !other.care_about_round32) &&
169 (care_about_round16_64 || !other.care_about_round16_64);
170 }
171 };
172
173 constexpr Format asVOP3(Format format) {
174 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
175 };
176
177 constexpr Format asSDWA(Format format) {
178 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
179 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
180 }
181
/* Which register file a value lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
188
189 struct RegClass {
190
191 enum RC : uint8_t {
192 s1 = 1,
193 s2 = 2,
194 s3 = 3,
195 s4 = 4,
196 s6 = 6,
197 s8 = 8,
198 s16 = 16,
199 v1 = s1 | (1 << 5),
200 v2 = s2 | (1 << 5),
201 v3 = s3 | (1 << 5),
202 v4 = s4 | (1 << 5),
203 v5 = 5 | (1 << 5),
204 v6 = 6 | (1 << 5),
205 v7 = 7 | (1 << 5),
206 v8 = 8 | (1 << 5),
207 /* byte-sized register class */
208 v1b = v1 | (1 << 7),
209 v2b = v2 | (1 << 7),
210 v3b = v3 | (1 << 7),
211 v4b = v4 | (1 << 7),
212 v6b = v6 | (1 << 7),
213 v8b = v8 | (1 << 7),
214 /* these are used for WWM and spills to vgpr */
215 v1_linear = v1 | (1 << 6),
216 v2_linear = v2 | (1 << 6),
217 };
218
219 RegClass() = default;
220 constexpr RegClass(RC rc)
221 : rc(rc) {}
222 constexpr RegClass(RegType type, unsigned size)
223 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
224
225 constexpr operator RC() const { return rc; }
226 explicit operator bool() = delete;
227
228 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
229 constexpr bool is_subdword() const { return rc & (1 << 7); }
230 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
231 //TODO: use size() less in favor of bytes()
232 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
233 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
234 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
235 constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
236
237 static constexpr RegClass get(RegType type, unsigned bytes) {
238 if (type == RegType::sgpr) {
239 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
240 } else {
241 return bytes % 4u ? RegClass(type, bytes).as_subdword() :
242 RegClass(type, bytes / 4u);
243 }
244 }
245
246 private:
247 RC rc;
248 };
249
250 /* transitional helper expressions */
251 static constexpr RegClass s1{RegClass::s1};
252 static constexpr RegClass s2{RegClass::s2};
253 static constexpr RegClass s3{RegClass::s3};
254 static constexpr RegClass s4{RegClass::s4};
255 static constexpr RegClass s8{RegClass::s8};
256 static constexpr RegClass s16{RegClass::s16};
257 static constexpr RegClass v1{RegClass::v1};
258 static constexpr RegClass v2{RegClass::v2};
259 static constexpr RegClass v3{RegClass::v3};
260 static constexpr RegClass v4{RegClass::v4};
261 static constexpr RegClass v5{RegClass::v5};
262 static constexpr RegClass v6{RegClass::v6};
263 static constexpr RegClass v7{RegClass::v7};
264 static constexpr RegClass v8{RegClass::v8};
265 static constexpr RegClass v1b{RegClass::v1b};
266 static constexpr RegClass v2b{RegClass::v2b};
267 static constexpr RegClass v3b{RegClass::v3b};
268 static constexpr RegClass v4b{RegClass::v4b};
269 static constexpr RegClass v6b{RegClass::v6b};
270 static constexpr RegClass v8b{RegClass::v8b};
271
272 /**
273 * Temp Class
274 * Each temporary virtual register has a
275 * register class (i.e. size and type)
276 * and SSA id.
277 */
278 struct Temp {
279 Temp() noexcept : id_(0), reg_class(0) {}
280 constexpr Temp(uint32_t id, RegClass cls) noexcept
281 : id_(id), reg_class(uint8_t(cls)) {}
282
283 constexpr uint32_t id() const noexcept { return id_; }
284 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
285
286 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
287 constexpr unsigned size() const noexcept { return regClass().size(); }
288 constexpr RegType type() const noexcept { return regClass().type(); }
289 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
290
291 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
292 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
293 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
294
295 private:
296 uint32_t id_: 24;
297 uint32_t reg_class : 8;
298 };
299
/**
 * PhysReg
 * The physical register of an Operand or Definition, stored as a byte
 * address: reg_b = register index * 4 + byte offset (for sub-dword access).
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* return a copy advanced by the given number of bytes */
   constexpr PhysReg advance(int bytes) const { PhysReg copy = *this; copy.reg_b += bytes; return copy; }

   uint16_t reg_b = 0;
};
318
319 /* helper expressions for special registers */
320 static constexpr PhysReg m0{124};
321 static constexpr PhysReg vcc{106};
322 static constexpr PhysReg vcc_hi{107};
323 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
324 static constexpr PhysReg exec{126};
325 static constexpr PhysReg exec_lo{126};
326 static constexpr PhysReg exec_hi{127};
327 static constexpr PhysReg vccz{251};
328 static constexpr PhysReg execz{252};
329 static constexpr PhysReg scc{253};
330
331 /**
332 * Operand Class
333 * Initially, each Operand refers to either
334 * a temporary virtual register
335 * or to a constant value
336 * Temporary registers get mapped to physical register during RA
337 * Constant values are inlined into the instruction sequence.
338 */
339 class Operand final
340 {
341 public:
342 constexpr Operand()
343 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
344 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
345 isLateKill_(false) {}
346
347 explicit Operand(Temp r) noexcept
348 {
349 data_.temp = r;
350 if (r.id()) {
351 isTemp_ = true;
352 } else {
353 isUndef_ = true;
354 setFixed(PhysReg{128});
355 }
356 };
357 explicit Operand(uint8_t v) noexcept
358 {
359 /* 8-bit constants are only used for copies and copies from any 8-bit
360 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
361 * to be inline constants. */
362 data_.i = v;
363 isConstant_ = true;
364 constSize = 0;
365 setFixed(PhysReg{0u});
366 };
367 explicit Operand(uint16_t v) noexcept
368 {
369 data_.i = v;
370 isConstant_ = true;
371 constSize = 1;
372 if (v <= 64)
373 setFixed(PhysReg{128u + v});
374 else if (v >= 0xFFF0) /* [-16 .. -1] */
375 setFixed(PhysReg{192u + (0xFFFF - v)});
376 else if (v == 0x3800) /* 0.5 */
377 setFixed(PhysReg{240});
378 else if (v == 0xB800) /* -0.5 */
379 setFixed(PhysReg{241});
380 else if (v == 0x3C00) /* 1.0 */
381 setFixed(PhysReg{242});
382 else if (v == 0xBC00) /* -1.0 */
383 setFixed(PhysReg{243});
384 else if (v == 0x4000) /* 2.0 */
385 setFixed(PhysReg{244});
386 else if (v == 0xC000) /* -2.0 */
387 setFixed(PhysReg{245});
388 else if (v == 0x4400) /* 4.0 */
389 setFixed(PhysReg{246});
390 else if (v == 0xC400) /* -4.0 */
391 setFixed(PhysReg{247});
392 else if (v == 0x3118) /* 1/2 PI */
393 setFixed(PhysReg{248});
394 else /* Literal Constant */
395 setFixed(PhysReg{255});
396 };
397 explicit Operand(uint32_t v, bool is64bit = false) noexcept
398 {
399 data_.i = v;
400 isConstant_ = true;
401 constSize = is64bit ? 3 : 2;
402 if (v <= 64)
403 setFixed(PhysReg{128 + v});
404 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
405 setFixed(PhysReg{192 - v});
406 else if (v == 0x3f000000) /* 0.5 */
407 setFixed(PhysReg{240});
408 else if (v == 0xbf000000) /* -0.5 */
409 setFixed(PhysReg{241});
410 else if (v == 0x3f800000) /* 1.0 */
411 setFixed(PhysReg{242});
412 else if (v == 0xbf800000) /* -1.0 */
413 setFixed(PhysReg{243});
414 else if (v == 0x40000000) /* 2.0 */
415 setFixed(PhysReg{244});
416 else if (v == 0xc0000000) /* -2.0 */
417 setFixed(PhysReg{245});
418 else if (v == 0x40800000) /* 4.0 */
419 setFixed(PhysReg{246});
420 else if (v == 0xc0800000) /* -4.0 */
421 setFixed(PhysReg{247});
422 else { /* Literal Constant */
423 assert(!is64bit && "attempt to create a 64-bit literal constant");
424 setFixed(PhysReg{255});
425 }
426 };
427 explicit Operand(uint64_t v) noexcept
428 {
429 isConstant_ = true;
430 constSize = 3;
431 if (v <= 64) {
432 data_.i = (uint32_t) v;
433 setFixed(PhysReg{128 + (uint32_t) v});
434 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
435 data_.i = (uint32_t) v;
436 setFixed(PhysReg{192 - (uint32_t) v});
437 } else if (v == 0x3FE0000000000000) { /* 0.5 */
438 data_.i = 0x3f000000;
439 setFixed(PhysReg{240});
440 } else if (v == 0xBFE0000000000000) { /* -0.5 */
441 data_.i = 0xbf000000;
442 setFixed(PhysReg{241});
443 } else if (v == 0x3FF0000000000000) { /* 1.0 */
444 data_.i = 0x3f800000;
445 setFixed(PhysReg{242});
446 } else if (v == 0xBFF0000000000000) { /* -1.0 */
447 data_.i = 0xbf800000;
448 setFixed(PhysReg{243});
449 } else if (v == 0x4000000000000000) { /* 2.0 */
450 data_.i = 0x40000000;
451 setFixed(PhysReg{244});
452 } else if (v == 0xC000000000000000) { /* -2.0 */
453 data_.i = 0xc0000000;
454 setFixed(PhysReg{245});
455 } else if (v == 0x4010000000000000) { /* 4.0 */
456 data_.i = 0x40800000;
457 setFixed(PhysReg{246});
458 } else if (v == 0xC010000000000000) { /* -4.0 */
459 data_.i = 0xc0800000;
460 setFixed(PhysReg{247});
461 } else { /* Literal Constant: we don't know if it is a long or double.*/
462 isConstant_ = 0;
463 assert(false && "attempt to create a 64-bit literal constant");
464 }
465 };
466 explicit Operand(RegClass type) noexcept
467 {
468 isUndef_ = true;
469 data_.temp = Temp(0, type);
470 setFixed(PhysReg{128});
471 };
472 explicit Operand(PhysReg reg, RegClass type) noexcept
473 {
474 data_.temp = Temp(0, type);
475 setFixed(reg);
476 }
477
478 constexpr bool isTemp() const noexcept
479 {
480 return isTemp_;
481 }
482
483 constexpr void setTemp(Temp t) noexcept {
484 assert(!isConstant_);
485 isTemp_ = true;
486 data_.temp = t;
487 }
488
489 constexpr Temp getTemp() const noexcept
490 {
491 return data_.temp;
492 }
493
494 constexpr uint32_t tempId() const noexcept
495 {
496 return data_.temp.id();
497 }
498
499 constexpr bool hasRegClass() const noexcept
500 {
501 return isTemp() || isUndefined();
502 }
503
504 constexpr RegClass regClass() const noexcept
505 {
506 return data_.temp.regClass();
507 }
508
509 constexpr unsigned bytes() const noexcept
510 {
511 if (isConstant())
512 return 1 << constSize;
513 else
514 return data_.temp.bytes();
515 }
516
517 constexpr unsigned size() const noexcept
518 {
519 if (isConstant())
520 return constSize > 2 ? 2 : 1;
521 else
522 return data_.temp.size();
523 }
524
525 constexpr bool isFixed() const noexcept
526 {
527 return isFixed_;
528 }
529
530 constexpr PhysReg physReg() const noexcept
531 {
532 return reg_;
533 }
534
535 constexpr void setFixed(PhysReg reg) noexcept
536 {
537 isFixed_ = reg != unsigned(-1);
538 reg_ = reg;
539 }
540
541 constexpr bool isConstant() const noexcept
542 {
543 return isConstant_;
544 }
545
546 constexpr bool isLiteral() const noexcept
547 {
548 return isConstant() && reg_ == 255;
549 }
550
551 constexpr bool isUndefined() const noexcept
552 {
553 return isUndef_;
554 }
555
556 constexpr uint32_t constantValue() const noexcept
557 {
558 return data_.i;
559 }
560
561 constexpr bool constantEquals(uint32_t cmp) const noexcept
562 {
563 return isConstant() && constantValue() == cmp;
564 }
565
566 constexpr uint64_t constantValue64(bool signext=false) const noexcept
567 {
568 if (constSize == 3) {
569 if (reg_ <= 192)
570 return reg_ - 128;
571 else if (reg_ <= 208)
572 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
573
574 switch (reg_) {
575 case 240:
576 return 0x3FE0000000000000;
577 case 241:
578 return 0xBFE0000000000000;
579 case 242:
580 return 0x3FF0000000000000;
581 case 243:
582 return 0xBFF0000000000000;
583 case 244:
584 return 0x4000000000000000;
585 case 245:
586 return 0xC000000000000000;
587 case 246:
588 return 0x4010000000000000;
589 case 247:
590 return 0xC010000000000000;
591 }
592 } else if (constSize == 1) {
593 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
594 } else if (constSize == 0) {
595 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
596 }
597 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
598 }
599
600 constexpr bool isOfType(RegType type) const noexcept
601 {
602 return hasRegClass() && regClass().type() == type;
603 }
604
605 /* Indicates that the killed operand's live range intersects with the
606 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
607 * not set by liveness analysis. */
608 constexpr void setLateKill(bool flag) noexcept
609 {
610 isLateKill_ = flag;
611 }
612
613 constexpr bool isLateKill() const noexcept
614 {
615 return isLateKill_;
616 }
617
618 constexpr void setKill(bool flag) noexcept
619 {
620 isKill_ = flag;
621 if (!flag)
622 setFirstKill(false);
623 }
624
625 constexpr bool isKill() const noexcept
626 {
627 return isKill_ || isFirstKill();
628 }
629
630 constexpr void setFirstKill(bool flag) noexcept
631 {
632 isFirstKill_ = flag;
633 if (flag)
634 setKill(flag);
635 }
636
637 /* When there are multiple operands killing the same temporary,
638 * isFirstKill() is only returns true for the first one. */
639 constexpr bool isFirstKill() const noexcept
640 {
641 return isFirstKill_;
642 }
643
644 constexpr bool isKillBeforeDef() const noexcept
645 {
646 return isKill() && !isLateKill();
647 }
648
649 constexpr bool isFirstKillBeforeDef() const noexcept
650 {
651 return isFirstKill() && !isLateKill();
652 }
653
654 constexpr bool operator == (Operand other) const noexcept
655 {
656 if (other.size() != size())
657 return false;
658 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
659 return false;
660 if (isFixed() && other.isFixed() && physReg() != other.physReg())
661 return false;
662 if (isLiteral())
663 return other.isLiteral() && other.constantValue() == constantValue();
664 else if (isConstant())
665 return other.isConstant() && other.physReg() == physReg();
666 else if (isUndefined())
667 return other.isUndefined() && other.regClass() == regClass();
668 else
669 return other.isTemp() && other.getTemp() == getTemp();
670 }
671 private:
672 union {
673 uint32_t i;
674 float f;
675 Temp temp = Temp(0, s1);
676 } data_;
677 PhysReg reg_;
678 union {
679 struct {
680 uint8_t isTemp_:1;
681 uint8_t isFixed_:1;
682 uint8_t isConstant_:1;
683 uint8_t isKill_:1;
684 uint8_t isUndef_:1;
685 uint8_t isFirstKill_:1;
686 uint8_t constSize:2;
687 uint8_t isLateKill_:1;
688 };
689 /* can't initialize bit-fields in c++11, so work around using a union */
690 uint16_t control_ = 0;
691 };
692 };
693
694 /**
695 * Definition Class
696 * Definitions are the results of Instructions
697 * and refer to temporary virtual registers
698 * which are later mapped to physical registers
699 */
700 class Definition final
701 {
702 public:
703 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
704 isKill_(0), isPrecise_(0), isNUW_(0) {}
705 Definition(uint32_t index, RegClass type) noexcept
706 : temp(index, type) {}
707 explicit Definition(Temp tmp) noexcept
708 : temp(tmp) {}
709 Definition(PhysReg reg, RegClass type) noexcept
710 : temp(Temp(0, type))
711 {
712 setFixed(reg);
713 }
714 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
715 : temp(Temp(tmpId, type))
716 {
717 setFixed(reg);
718 }
719
720 constexpr bool isTemp() const noexcept
721 {
722 return tempId() > 0;
723 }
724
725 constexpr Temp getTemp() const noexcept
726 {
727 return temp;
728 }
729
730 constexpr uint32_t tempId() const noexcept
731 {
732 return temp.id();
733 }
734
735 constexpr void setTemp(Temp t) noexcept {
736 temp = t;
737 }
738
739 constexpr RegClass regClass() const noexcept
740 {
741 return temp.regClass();
742 }
743
744 constexpr unsigned bytes() const noexcept
745 {
746 return temp.bytes();
747 }
748
749 constexpr unsigned size() const noexcept
750 {
751 return temp.size();
752 }
753
754 constexpr bool isFixed() const noexcept
755 {
756 return isFixed_;
757 }
758
759 constexpr PhysReg physReg() const noexcept
760 {
761 return reg_;
762 }
763
764 constexpr void setFixed(PhysReg reg) noexcept
765 {
766 isFixed_ = 1;
767 reg_ = reg;
768 }
769
770 constexpr void setHint(PhysReg reg) noexcept
771 {
772 hasHint_ = 1;
773 reg_ = reg;
774 }
775
776 constexpr bool hasHint() const noexcept
777 {
778 return hasHint_;
779 }
780
781 constexpr void setKill(bool flag) noexcept
782 {
783 isKill_ = flag;
784 }
785
786 constexpr bool isKill() const noexcept
787 {
788 return isKill_;
789 }
790
791 constexpr void setPrecise(bool precise) noexcept
792 {
793 isPrecise_ = precise;
794 }
795
796 constexpr bool isPrecise() const noexcept
797 {
798 return isPrecise_;
799 }
800
801 /* No Unsigned Wrap */
802 constexpr void setNUW(bool nuw) noexcept
803 {
804 isNUW_ = nuw;
805 }
806
807 constexpr bool isNUW() const noexcept
808 {
809 return isNUW_;
810 }
811
812 private:
813 Temp temp = Temp(0, s1);
814 PhysReg reg_;
815 union {
816 struct {
817 uint8_t isFixed_:1;
818 uint8_t hasHint_:1;
819 uint8_t isKill_:1;
820 uint8_t isPrecise_:1;
821 uint8_t isNUW_:1;
822 };
823 /* can't initialize bit-fields in c++11, so work around using a union */
824 uint8_t control_ = 0;
825 };
826 };
827
828 struct Block;
829
830 struct Instruction {
831 aco_opcode opcode;
832 Format format;
833 uint32_t pass_flags;
834
835 aco::span<Operand> operands;
836 aco::span<Definition> definitions;
837
838 constexpr bool isVALU() const noexcept
839 {
840 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
841 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
842 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
843 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
844 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
845 || format == Format::VOP3P;
846 }
847
848 constexpr bool isSALU() const noexcept
849 {
850 return format == Format::SOP1 ||
851 format == Format::SOP2 ||
852 format == Format::SOPC ||
853 format == Format::SOPK ||
854 format == Format::SOPP;
855 }
856
857 constexpr bool isVMEM() const noexcept
858 {
859 return format == Format::MTBUF ||
860 format == Format::MUBUF ||
861 format == Format::MIMG;
862 }
863
864 constexpr bool isDPP() const noexcept
865 {
866 return (uint16_t) format & (uint16_t) Format::DPP;
867 }
868
869 constexpr bool isVOP3() const noexcept
870 {
871 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
872 ((uint16_t) format & (uint16_t) Format::VOP3B);
873 }
874
875 constexpr bool isSDWA() const noexcept
876 {
877 return (uint16_t) format & (uint16_t) Format::SDWA;
878 }
879
880 constexpr bool isFlatOrGlobal() const noexcept
881 {
882 return format == Format::FLAT || format == Format::GLOBAL;
883 }
884
885 constexpr bool usesModifiers() const noexcept;
886
887 constexpr bool reads_exec() const noexcept
888 {
889 for (const Operand& op : operands) {
890 if (op.isFixed() && op.physReg() == exec)
891 return true;
892 }
893 return false;
894 }
895 };
896 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
897
898 struct SOPK_instruction : public Instruction {
899 uint16_t imm;
900 uint16_t padding;
901 };
902 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
903
904 struct SOPP_instruction : public Instruction {
905 uint32_t imm;
906 int block;
907 };
908 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
909
910 struct SOPC_instruction : public Instruction {
911 };
912 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
913
914 struct SOP1_instruction : public Instruction {
915 };
916 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
917
918 struct SOP2_instruction : public Instruction {
919 };
920 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
921
922 /**
923 * Scalar Memory Format:
924 * For s_(buffer_)load_dword*:
925 * Operand(0): SBASE - SGPR-pair which provides base address
926 * Operand(1): Offset - immediate (un)signed offset or SGPR
927 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
928 * Operand(n-1): SOffset - SGPR offset (Vega only)
929 *
930 * Having no operands is also valid for instructions such as s_dcache_inv.
931 *
932 */
933 struct SMEM_instruction : public Instruction {
934 barrier_interaction barrier;
935 bool glc : 1; /* VI+: globally coherent */
936 bool dlc : 1; /* NAVI: device level coherent */
937 bool nv : 1; /* VEGA only: Non-volatile */
938 bool can_reorder : 1;
939 bool disable_wqm : 1;
940 bool prevent_overflow : 1; /* avoid overflow when combining additions */
941 uint32_t padding: 18;
942 };
943 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
944
945 struct VOP1_instruction : public Instruction {
946 };
947 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
948
949 struct VOP2_instruction : public Instruction {
950 };
951 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
952
953 struct VOPC_instruction : public Instruction {
954 };
955 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
956
957 struct VOP3A_instruction : public Instruction {
958 bool abs[3];
959 bool neg[3];
960 uint8_t opsel : 4;
961 uint8_t omod : 2;
962 bool clamp : 1;
963 uint32_t padding : 9;
964 };
965 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
966
967 struct VOP3P_instruction : public Instruction {
968 bool neg_lo[3];
969 bool neg_hi[3];
970 uint8_t opsel_lo : 3;
971 uint8_t opsel_hi : 3;
972 bool clamp : 1;
973 uint32_t padding : 9;
974 };
975 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
976
977 /**
978 * Data Parallel Primitives Format:
979 * This format can be used for VOP1, VOP2 or VOPC instructions.
980 * The swizzle applies to the src0 operand.
981 *
982 */
983 struct DPP_instruction : public Instruction {
984 bool abs[2];
985 bool neg[2];
986 uint16_t dpp_ctrl;
987 uint8_t row_mask : 4;
988 uint8_t bank_mask : 4;
989 bool bound_ctrl : 1;
990 uint32_t padding : 7;
991 };
992 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
993
/* SDWA sub-dword selection: which byte/word of a dword-sized source or
 * destination an instruction reads/writes, plus sign-extension. The value
 * packs a position (masked by sdwa_wordnum/sdwa_bytenum), size flags
 * (sdwa_isword/sdwa_isra) and the sign-extension flag (sdwa_sext). */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
1029
1030 /**
1031 * Sub-Dword Addressing Format:
1032 * This format can be used for VOP1, VOP2 or VOPC instructions.
1033 *
1034 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1035 * the definition doesn't have to be VCC on GFX9+.
1036 *
1037 */
1038 struct SDWA_instruction : public Instruction {
1039 /* these destination modifiers aren't available with VOPC except for
1040 * clamp on GFX8 */
1041 uint8_t sel[2];
1042 uint8_t dst_sel;
1043 bool neg[2];
1044 bool abs[2];
1045 bool dst_preserve : 1;
1046 bool clamp : 1;
1047 uint8_t omod : 2; /* GFX9+ */
1048 uint32_t padding : 4;
1049 };
1050 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1051
1052 struct Interp_instruction : public Instruction {
1053 uint8_t attribute;
1054 uint8_t component;
1055 uint16_t padding;
1056 };
1057 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1058
1059 /**
1060 * Local and Global Data Sharing instructions
1061 * Operand(0): ADDR - VGPR which supplies the address.
1062 * Operand(1): DATA0 - First data VGPR.
1063 * Operand(2): DATA1 - Second data VGPR.
1064 * Operand(n-1): M0 - LDS size.
1065 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1066 *
1067 */
1068 struct DS_instruction : public Instruction {
1069 int16_t offset0;
1070 int8_t offset1;
1071 bool gds;
1072 };
1073 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1074
1075 /**
1076 * Vector Memory Untyped-buffer Instructions
1077 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1078 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1079 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1080 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1081 *
1082 */
1083 struct MUBUF_instruction : public Instruction {
1084 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1085 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1086 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1087 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1088 bool glc : 1; /* globally coherent */
1089 bool dlc : 1; /* NAVI: device level coherent */
1090 bool slc : 1; /* system level coherent */
1091 bool tfe : 1; /* texture fail enable */
1092 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1093 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1094 bool can_reorder : 1;
1095 bool swizzled:1;
1096 uint8_t padding : 1;
1097 barrier_interaction barrier;
1098 };
1099 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1100
1101 /**
1102 * Vector Memory Typed-buffer Instructions
1103 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1104 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1105 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1106 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1107 *
1108 */
1109 struct MTBUF_instruction : public Instruction {
1110 uint16_t offset; /* Unsigned byte offset - 12 bit */
1111 barrier_interaction barrier;
1112 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1113 uint8_t nfmt : 3; /* Numeric format of data in memory */
1114 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1115 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1116 bool glc : 1; /* globally coherent */
1117 bool dlc : 1; /* NAVI: device level coherent */
1118 bool slc : 1; /* system level coherent */
1119 bool tfe : 1; /* texture fail enable */
1120 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1121 bool can_reorder : 1;
1122 uint32_t padding : 25;
1123 };
1124 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1125
1126 /**
1127 * Vector Memory Image Instructions
1128 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1129 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1130 * or VDATA - Vector GPR for write data.
1131 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1132 * Definition(0): VDATA - Vector GPR for read result.
1133 *
1134 */
1135 struct MIMG_instruction : public Instruction {
1136 uint8_t dmask; /* Data VGPR enable mask */
1137 uint8_t dim : 3; /* NAVI: dimensionality */
1138 bool unrm : 1; /* Force address to be un-normalized */
1139 bool dlc : 1; /* NAVI: device level coherent */
1140 bool glc : 1; /* globally coherent */
1141 bool slc : 1; /* system level coherent */
1142 bool tfe : 1; /* texture fail enable */
1143 bool da : 1; /* declare an array */
1144 bool lwe : 1; /* Force data to be un-normalized */
1145 bool r128 : 1; /* NAVI: Texture resource size */
1146 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1147 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1148 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1149 bool can_reorder : 1;
1150 uint8_t padding : 1;
1151 barrier_interaction barrier;
1152 };
1153 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1154
1155 /**
1156 * Flat/Scratch/Global Instructions
1157 * Operand(0): ADDR
1158 * Operand(1): SADDR
1159 * Operand(2) / Definition(0): DATA/VDST
1160 *
1161 */
1162 struct FLAT_instruction : public Instruction {
1163 uint16_t offset; /* Vega/Navi only */
1164 bool slc : 1; /* system level coherent */
1165 bool glc : 1; /* globally coherent */
1166 bool dlc : 1; /* NAVI: device level coherent */
1167 bool lds : 1;
1168 bool nv : 1;
1169 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1170 bool can_reorder : 1;
1171 uint8_t padding : 1;
1172 barrier_interaction barrier;
1173 };
1174 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1175
/* EXP: exports vertex position/parameter or fragment color data. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written */
   uint8_t dest;         /* export target (MRT / Z / position / parameter) */
   bool compressed : 1;  /* data is 16-bit compressed (two components per VGPR) */
   bool done : 1;        /* this is the final export of the shader */
   bool valid_mask : 1;
   uint32_t padding : 13; /* explicit padding to keep the size assertion below */
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1185
/* Generic pseudo instruction, lowered to hardware instructions late. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* whether scc holds a live value the lowered sequence must preserve — see lower_to_hw_instr; TODO confirm */
   uint8_t padding; /* explicit padding to keep the size assertion below */
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1192
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1201
/* Barrier pseudo instruction; carries no fields beyond the base Instruction. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1205
/* Operations for Pseudo_reduction_instruction; the numeric suffix is the
 * bit width the reduction operates on. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1221
1222 /**
1223 * Subgroup Reduction Instructions, everything except for the data to be
1224 * reduced and the result as inserted by setup_reduce_temp().
1225 * Operand(0): data to be reduced
1226 * Operand(1): reduce temporary
1227 * Operand(2): vector temporary
1228 * Definition(0): result
1229 * Definition(1): scalar temporary
1230 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1231 * Definition(3): scc clobber
1232 * Definition(4): vcc clobber
1233 *
1234 */
1235 struct Pseudo_reduction_instruction : public Instruction {
1236 ReduceOp reduce_op;
1237 uint16_t cluster_size; // must be 0 for scans
1238 };
1239 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1240
/* Deleter for instructions: they are allocated with calloc() in
 * create_instruction(), so they must be released with free() rather
 * than operator delete. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning pointer to an instruction, using the calloc/free deleter above. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1249
/* Allocates a zero-initialized instruction of type T with trailing storage
 * laid out as:  [ T | num_operands * Operand | num_definitions * Definition ]
 * The operand/definition spans store 16-bit offsets relative to the span
 * members themselves, pointing into that trailing storage.
 * The result must be freed with free() (see instr_deleter_functor/aco_ptr). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   /* calloc zero-initializes, so all flag/padding fields start in a known state */
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* offsets are measured from the address of each span member itself */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1267
1268 constexpr bool Instruction::usesModifiers() const noexcept
1269 {
1270 if (isDPP() || isSDWA())
1271 return true;
1272
1273 if (format == Format::VOP3P) {
1274 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1275 for (unsigned i = 0; i < operands.size(); i++) {
1276 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1277 return true;
1278 }
1279 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1280 } else if (isVOP3()) {
1281 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1282 for (unsigned i = 0; i < operands.size(); i++) {
1283 if (vop3->abs[i] || vop3->neg[i])
1284 return true;
1285 }
1286 return vop3->opsel || vop3->clamp || vop3->omod;
1287 }
1288 return false;
1289 }
1290
1291 constexpr bool is_phi(Instruction* instr)
1292 {
1293 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1294 }
1295
1296 static inline bool is_phi(aco_ptr<Instruction>& instr)
1297 {
1298 return is_phi(instr.get());
1299 }
1300
/* Returns the barrier classes this instruction interacts with. */
barrier_interaction get_barrier_interaction(const Instruction* instr);
/* Whether the instruction can be removed given per-temporary use counts. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

/* Whether opsel can select the high half of operand/definition "idx" on this chip. */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1308
/* Flags describing a basic block's role in the CFG; combined as a
 * bitmask in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1329
1330
1331 struct RegisterDemand {
1332 constexpr RegisterDemand() = default;
1333 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1334 : vgpr{v}, sgpr{s} {}
1335 int16_t vgpr = 0;
1336 int16_t sgpr = 0;
1337
1338 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1339 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1340 }
1341
1342 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1343 return vgpr > other.vgpr || sgpr > other.sgpr;
1344 }
1345
1346 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1347 if (t.type() == RegType::sgpr)
1348 return RegisterDemand( vgpr, sgpr + t.size() );
1349 else
1350 return RegisterDemand( vgpr + t.size(), sgpr );
1351 }
1352
1353 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1354 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1355 }
1356
1357 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1358 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1359 }
1360
1361 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1362 vgpr += other.vgpr;
1363 sgpr += other.sgpr;
1364 return *this;
1365 }
1366
1367 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1368 vgpr -= other.vgpr;
1369 sgpr -= other.sgpr;
1370 return *this;
1371 }
1372
1373 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1374 if (t.type() == RegType::sgpr)
1375 sgpr += t.size();
1376 else
1377 vgpr += t.size();
1378 return *this;
1379 }
1380
1381 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1382 if (t.type() == RegType::sgpr)
1383 sgpr -= t.size();
1384 else
1385 vgpr -= t.size();
1386 return *this;
1387 }
1388
1389 constexpr void update(const RegisterDemand other) noexcept {
1390 vgpr = std::max(vgpr, other.vgpr);
1391 sgpr = std::max(sgpr, other.sgpr);
1392 }
1393
1394 };
1395
1396 /* CFG */
1397 struct Block {
1398 float_mode fp_mode;
1399 unsigned index;
1400 unsigned offset = 0;
1401 std::vector<aco_ptr<Instruction>> instructions;
1402 std::vector<unsigned> logical_preds;
1403 std::vector<unsigned> linear_preds;
1404 std::vector<unsigned> logical_succs;
1405 std::vector<unsigned> linear_succs;
1406 RegisterDemand register_demand = RegisterDemand();
1407 uint16_t loop_nest_depth = 0;
1408 uint16_t kind = 0;
1409 int logical_idom = -1;
1410 int linear_idom = -1;
1411 Temp live_out_exec = Temp();
1412
1413 /* this information is needed for predecessors to blocks with phis when
1414 * moving out of ssa */
1415 bool scc_live_out = false;
1416 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1417
1418 Block(unsigned idx) : index(idx) {}
1419 Block() : index(0) {}
1420 };
1421
/* Bitmask combining one or more software stages with one hardware stage. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f; /* all software-stage bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7; /* all hardware-stage bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1466
/* Indices into Program::statistics, filled by collect_*_stats(). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1481
/* A complete shader program: the CFG plus all target/limit information
 * needed by the compilation passes. */
class Program final {
public:
   float_mode next_fp_mode; /* fp mode assigned to blocks created from now on */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand(); /* maximum demand over all blocks */
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;  /* 32 or 64 lanes per wave */
   RegClass lane_mask;  /* register class of an exec-sized mask (s1 or s2) */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   /* hardware limits and allocation granularities for this target */
   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* whether collect_*_stats() should fill statistics[] */
   uint32_t statistics[num_statistics];

   /* Returns a fresh SSA id. Ids are limited to 24 bits (16777215 = 2^24 - 1). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would produce, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new block, inheriting next_fp_mode. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Appends an existing block, re-indexing it and inheriting next_fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* 0 is reserved (ids start at 1) */
};
1557
1558 struct TempHash {
1559 std::size_t operator()(Temp t) const {
1560 return t.id();
1561 }
1562 };
1563 using TempSet = std::unordered_set<Temp, TempHash>;
1564
/* Result of live_var_analysis(). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1571
/* One-time global initialization of the compiler. */
void init();

void init_program(Program *program, Stage stage, struct radv_shader_info *info,
                  enum chip_class chip_class, enum radeon_family family,
                  ac_shader_config *config);

/* Instruction selection: NIR -> ACO IR. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* Compilation passes, in roughly pipeline order. */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR validation; diagnostics go to "output", returns false on failure. */
bool validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
/* NOTE(review): the release-build macro below takes a leading "program"
 * argument that this debug function does not — a call site can only match
 * one of the two signatures; confirm which is intended. */
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* Fill Program::statistics at the corresponding pipeline points. */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs that need to be allocated to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1640
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware encodings per generation; presumably a negative value marks
    * "not available on this generation" — TODO confirm against the generator */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* The single instance of the tables above (generated code). */
extern const Info instr_info;
1656
1657 }
1658
1659 #endif /* ACO_IR_H */
1660