src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <bitset>
  31 #include <memory>
  32
  33 #include "nir.h"
  34 #include "ac_binary.h"
  35 #include "amd_family.h"
  36 #include "aco_opcodes.h"
  37 #include "aco_util.h"
  38
  39 struct radv_nir_compiler_options;
  40 struct radv_shader_args;
  41 struct radv_shader_info;
  42
  43 namespace aco {
  44
  45 extern uint64_t debug_flags;
  46
  47 enum {
  48    DEBUG_VALIDATE = 0x1,
  49    DEBUG_VALIDATE_RA = 0x2,
  50    DEBUG_PERFWARN = 0x4,
  51 };
  52
  53 /**
  54  * Representation of the instruction's microcode encoding format
  55  * Note: Some Vector ALU Formats can be combined, such that:
  56  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  57  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  58  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  59  *
  60  * (*) The same is applicable for VOP1 and VOPC instructions.
  61  */
  62 enum class Format : std::uint16_t {
  63    /* Pseudo Instruction Format */
  64    PSEUDO = 0,
  65    /* Scalar ALU & Control Formats */
  66    SOP1 = 1,
  67    SOP2 = 2,
  68    SOPK = 3,
  69    SOPP = 4,
  70    SOPC = 5,
  71    /* Scalar Memory Format */
  72    SMEM = 6,
  73    /* LDS/GDS Format */
  74    DS = 8,
  75    /* Vector Memory Buffer Formats */
  76    MTBUF = 9,
  77    MUBUF = 10,
  78    /* Vector Memory Image Format */
  79    MIMG = 11,
  80    /* Export Format */
  81    EXP = 12,
  82    /* Flat Formats */
  83    FLAT = 13,
  84    GLOBAL = 14,
  85    SCRATCH = 15,
  86
  87    PSEUDO_BRANCH = 16,
  88    PSEUDO_BARRIER = 17,
  89    PSEUDO_REDUCTION = 18,
  90
  91    /* Vector ALU Formats */
  92    VOP1 = 1 << 8,
  93    VOP2 = 1 << 9,
  94    VOPC = 1 << 10,
  95    VOP3 = 1 << 11,
  96    VOP3A = 1 << 11,
  97    VOP3B = 1 << 11,
  98    VOP3P = 1 << 12,
  99    /* Vector Parameter Interpolation Format */
 100    VINTRP = 1 << 13,
 101    DPP = 1 << 14,
 102    SDWA = 1 << 15,
 103 };
 104
 105 enum barrier_interaction : uint8_t {
 106    barrier_none = 0,
 107    barrier_buffer = 0x1,
 108    barrier_image = 0x2,
 109    barrier_atomic = 0x4,
 110    barrier_shared = 0x8,
 111    /* used for geometry shaders to ensure vertex data writes are before the
 112     * GS_DONE s_sendmsg. */
 113    barrier_gs_data = 0x10,
 114    /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
 115    barrier_gs_sendmsg = 0x20,
 116    /* used by barriers. created by s_barrier */
 117    barrier_barrier = 0x40,
 118    barrier_count = 6,
 119 };
 120
 121 enum fp_round {
 122    fp_round_ne = 0,
 123    fp_round_pi = 1,
 124    fp_round_ni = 2,
 125    fp_round_tz = 3,
 126 };
 127
 128 enum fp_denorm {
 129    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
 130     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
 131    fp_denorm_flush = 0x0,
 132    fp_denorm_keep = 0x3,
 133 };
 134
 135 struct float_mode {
 136    /* matches encoding of the MODE register */
 137    union {
 138       struct {
 139           fp_round round32:2;
 140           fp_round round16_64:2;
 141           unsigned denorm32:2;
 142           unsigned denorm16_64:2;
 143       };
 144       uint8_t val = 0;
 145    };
 146    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
 147    bool preserve_signed_zero_inf_nan32:1;
 148    bool preserve_signed_zero_inf_nan16_64:1;
 149    /* if false, optimizations which may remove denormal flushing can be done */
 150    bool must_flush_denorms32:1;
 151    bool must_flush_denorms16_64:1;
 152    bool care_about_round32:1;
 153    bool care_about_round16_64:1;
 154
 155    /* Returns true if instructions using the mode "other" can safely use the
 156     * current one instead. */
 157    bool canReplace(float_mode other) const noexcept {
 158       return val == other.val &&
 159              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
 160              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
 161              (must_flush_denorms32  || !other.must_flush_denorms32) &&
 162              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
 163              (care_about_round32 || !other.care_about_round32) &&
 164              (care_about_round16_64 || !other.care_about_round16_64);
 165    }
 166 };
 167
 168 constexpr Format asVOP3(Format format) {
 169    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 170 };
 171
 172 constexpr Format asSDWA(Format format) {
 173    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
 174    return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
 175 }
 176
 177 enum class RegType {
 178    none = 0,
 179    sgpr,
 180    vgpr,
 181    linear_vgpr,
 182 };
 183
 184 struct RegClass {
 185
 186    enum RC : uint8_t {
 187       s1 = 1,
 188       s2 = 2,
 189       s3 = 3,
 190       s4 = 4,
 191       s6 = 6,
 192       s8 = 8,
 193       s16 = 16,
 194       v1 = s1 | (1 << 5),
 195       v2 = s2 | (1 << 5),
 196       v3 = s3 | (1 << 5),
 197       v4 = s4 | (1 << 5),
 198       v5 = 5  | (1 << 5),
 199       v6 = 6  | (1 << 5),
 200       v7 = 7  | (1 << 5),
 201       v8 = 8  | (1 << 5),
 202       /* byte-sized register class */
 203       v1b = v1 | (1 << 7),
 204       v2b = v2 | (1 << 7),
 205       v3b = v3 | (1 << 7),
 206       v4b = v4 | (1 << 7),
 207       v6b = v6 | (1 << 7),
 208       v8b = v8 | (1 << 7),
 209       /* these are used for WWM and spills to vgpr */
 210       v1_linear = v1 | (1 << 6),
 211       v2_linear = v2 | (1 << 6),
 212    };
 213
 214    RegClass() = default;
 215    constexpr RegClass(RC rc)
 216       : rc(rc) {}
 217    constexpr RegClass(RegType type, unsigned size)
 218       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 219
 220    constexpr operator RC() const { return rc; }
 221    explicit operator bool() = delete;
 222
 223    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 224    constexpr bool is_subdword() const { return rc & (1 << 7); }
 225    constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
 226    //TODO: use size() less in favor of bytes()
 227    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
 228    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 229    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 230    constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
 231
 232 private:
 233    RC rc;
 234 };
 235
 236 /* transitional helper expressions */
 237 static constexpr RegClass s1{RegClass::s1};
 238 static constexpr RegClass s2{RegClass::s2};
 239 static constexpr RegClass s3{RegClass::s3};
 240 static constexpr RegClass s4{RegClass::s4};
 241 static constexpr RegClass s8{RegClass::s8};
 242 static constexpr RegClass s16{RegClass::s16};
 243 static constexpr RegClass v1{RegClass::v1};
 244 static constexpr RegClass v2{RegClass::v2};
 245 static constexpr RegClass v3{RegClass::v3};
 246 static constexpr RegClass v4{RegClass::v4};
 247 static constexpr RegClass v5{RegClass::v5};
 248 static constexpr RegClass v6{RegClass::v6};
 249 static constexpr RegClass v7{RegClass::v7};
 250 static constexpr RegClass v8{RegClass::v8};
 251 static constexpr RegClass v1b{RegClass::v1b};
 252 static constexpr RegClass v2b{RegClass::v2b};
 253 static constexpr RegClass v3b{RegClass::v3b};
 254 static constexpr RegClass v4b{RegClass::v4b};
 255 static constexpr RegClass v6b{RegClass::v6b};
 256 static constexpr RegClass v8b{RegClass::v8b};
 257
 258 /**
 259  * Temp Class
 260  * Each temporary virtual register has a
 261  * register class (i.e. size and type)
 262  * and SSA id.
 263  */
 264 struct Temp {
 265    Temp() noexcept : id_(0), reg_class(0) {}
 266    constexpr Temp(uint32_t id, RegClass cls) noexcept
 267       : id_(id), reg_class(uint8_t(cls)) {}
 268
 269    constexpr uint32_t id() const noexcept { return id_; }
 270    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
 271
 272    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
 273    constexpr unsigned size() const noexcept { return regClass().size(); }
 274    constexpr RegType type() const noexcept { return regClass().type(); }
 275    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
 276
 277    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 278    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 279    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 280
 281 private:
 282    uint32_t id_: 24;
 283    uint32_t reg_class : 8;
 284 };
 285
 286 /**
 287  * PhysReg
 288  * Represents the physical register for each
 289  * Operand and Definition.
 290  */
 291 struct PhysReg {
 292    constexpr PhysReg() = default;
 293    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
 294    constexpr unsigned reg() const { return reg_b >> 2; }
 295    constexpr unsigned byte() const { return reg_b & 0x3; }
 296    constexpr operator unsigned() const { return reg(); }
 297    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
 298    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
 299    constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
 300
 301    uint16_t reg_b = 0;
 302 };
 303
 304 /* helper expressions for special registers */
 305 static constexpr PhysReg m0{124};
 306 static constexpr PhysReg vcc{106};
 307 static constexpr PhysReg vcc_hi{107};
 308 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 309 static constexpr PhysReg exec{126};
 310 static constexpr PhysReg exec_lo{126};
 311 static constexpr PhysReg exec_hi{127};
 312 static constexpr PhysReg vccz{251};
 313 static constexpr PhysReg execz{252};
 314 static constexpr PhysReg scc{253};
 315
 316 /**
 317  * Operand Class
 318  * Initially, each Operand refers to either
 319  * a temporary virtual register
 320  * or to a constant value
 321  * Temporary registers get mapped to physical register during RA
 322  * Constant values are inlined into the instruction sequence.
 323  */
 324 class Operand final
 325 {
 326 public:
 327    constexpr Operand()
 328       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 329         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
 330         isLateKill_(false) {}
 331
 332    explicit Operand(Temp r) noexcept
 333    {
 334       data_.temp = r;
 335       if (r.id()) {
 336          isTemp_ = true;
 337       } else {
 338          isUndef_ = true;
 339          setFixed(PhysReg{128});
 340       }
 341    };
 342    explicit Operand(uint32_t v, bool is64bit = false) noexcept
 343    {
 344       data_.i = v;
 345       isConstant_ = true;
 346       is64BitConst_ = is64bit;
 347       if (v <= 64)
 348          setFixed(PhysReg{128 + v});
 349       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 350          setFixed(PhysReg{192 - v});
 351       else if (v == 0x3f000000) /* 0.5 */
 352          setFixed(PhysReg{240});
 353       else if (v == 0xbf000000) /* -0.5 */
 354          setFixed(PhysReg{241});
 355       else if (v == 0x3f800000) /* 1.0 */
 356          setFixed(PhysReg{242});
 357       else if (v == 0xbf800000) /* -1.0 */
 358          setFixed(PhysReg{243});
 359       else if (v == 0x40000000) /* 2.0 */
 360          setFixed(PhysReg{244});
 361       else if (v == 0xc0000000) /* -2.0 */
 362          setFixed(PhysReg{245});
 363       else if (v == 0x40800000) /* 4.0 */
 364          setFixed(PhysReg{246});
 365       else if (v == 0xc0800000) /* -4.0 */
 366          setFixed(PhysReg{247});
 367       else { /* Literal Constant */
 368          assert(!is64bit && "attempt to create a 64-bit literal constant");
 369          setFixed(PhysReg{255});
 370       }
 371    };
 372    explicit Operand(uint64_t v) noexcept
 373    {
 374       isConstant_ = true;
 375       is64BitConst_ = true;
 376       if (v <= 64) {
 377          data_.i = (uint32_t) v;
 378          setFixed(PhysReg{128 + (uint32_t) v});
 379       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
 380          data_.i = (uint32_t) v;
 381          setFixed(PhysReg{192 - (uint32_t) v});
 382       } else if (v == 0x3FE0000000000000) { /* 0.5 */
 383          data_.i = 0x3f000000;
 384          setFixed(PhysReg{240});
 385       } else if (v == 0xBFE0000000000000) { /* -0.5 */
 386          data_.i = 0xbf000000;
 387          setFixed(PhysReg{241});
 388       } else if (v == 0x3FF0000000000000) { /* 1.0 */
 389          data_.i = 0x3f800000;
 390          setFixed(PhysReg{242});
 391       } else if (v == 0xBFF0000000000000) { /* -1.0 */
 392          data_.i = 0xbf800000;
 393          setFixed(PhysReg{243});
 394       } else if (v == 0x4000000000000000) { /* 2.0 */
 395          data_.i = 0x40000000;
 396          setFixed(PhysReg{244});
 397       } else if (v == 0xC000000000000000) { /* -2.0 */
 398          data_.i = 0xc0000000;
 399          setFixed(PhysReg{245});
 400       } else if (v == 0x4010000000000000) { /* 4.0 */
 401          data_.i = 0x40800000;
 402          setFixed(PhysReg{246});
 403       } else if (v == 0xC010000000000000) { /* -4.0 */
 404          data_.i = 0xc0800000;
 405          setFixed(PhysReg{247});
 406       } else { /* Literal Constant: we don't know if it is a long or double.*/
 407          isConstant_ = 0;
 408          assert(false && "attempt to create a 64-bit literal constant");
 409       }
 410    };
 411    explicit Operand(RegClass type) noexcept
 412    {
 413       isUndef_ = true;
 414       data_.temp = Temp(0, type);
 415       setFixed(PhysReg{128});
 416    };
 417    explicit Operand(PhysReg reg, RegClass type) noexcept
 418    {
 419       data_.temp = Temp(0, type);
 420       setFixed(reg);
 421    }
 422
 423    constexpr bool isTemp() const noexcept
 424    {
 425       return isTemp_;
 426    }
 427
 428    constexpr void setTemp(Temp t) noexcept {
 429       assert(!isConstant_);
 430       isTemp_ = true;
 431       data_.temp = t;
 432    }
 433
 434    constexpr Temp getTemp() const noexcept
 435    {
 436       return data_.temp;
 437    }
 438
 439    constexpr uint32_t tempId() const noexcept
 440    {
 441       return data_.temp.id();
 442    }
 443
 444    constexpr bool hasRegClass() const noexcept
 445    {
 446       return isTemp() || isUndefined();
 447    }
 448
 449    constexpr RegClass regClass() const noexcept
 450    {
 451       return data_.temp.regClass();
 452    }
 453
 454    constexpr unsigned bytes() const noexcept
 455    {
 456       if (isConstant())
 457          return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
 458       else
 459          return data_.temp.bytes();
 460    }
 461
 462    constexpr unsigned size() const noexcept
 463    {
 464       if (isConstant())
 465          return is64BitConst_ ? 2 : 1;
 466       else
 467          return data_.temp.size();
 468    }
 469
 470    constexpr bool isFixed() const noexcept
 471    {
 472       return isFixed_;
 473    }
 474
 475    constexpr PhysReg physReg() const noexcept
 476    {
 477       return reg_;
 478    }
 479
 480    constexpr void setFixed(PhysReg reg) noexcept
 481    {
 482       isFixed_ = reg != unsigned(-1);
 483       reg_ = reg;
 484    }
 485
 486    constexpr bool isConstant() const noexcept
 487    {
 488       return isConstant_;
 489    }
 490
 491    constexpr bool isLiteral() const noexcept
 492    {
 493       return isConstant() && reg_ == 255;
 494    }
 495
 496    constexpr bool isUndefined() const noexcept
 497    {
 498       return isUndef_;
 499    }
 500
 501    constexpr uint32_t constantValue() const noexcept
 502    {
 503       return data_.i;
 504    }
 505
 506    constexpr bool constantEquals(uint32_t cmp) const noexcept
 507    {
 508       return isConstant() && constantValue() == cmp;
 509    }
 510
 511    constexpr uint64_t constantValue64(bool signext=false) const noexcept
 512    {
 513       if (is64BitConst_) {
 514          if (reg_ <= 192)
 515             return reg_ - 128;
 516          else if (reg_ <= 208)
 517             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
 518
 519          switch (reg_) {
 520          case 240:
 521             return 0x3FE0000000000000;
 522          case 241:
 523             return 0xBFE0000000000000;
 524          case 242:
 525             return 0x3FF0000000000000;
 526          case 243:
 527             return 0xBFF0000000000000;
 528          case 244:
 529             return 0x4000000000000000;
 530          case 245:
 531             return 0xC000000000000000;
 532          case 246:
 533             return 0x4010000000000000;
 534          case 247:
 535             return 0xC010000000000000;
 536          }
 537       }
 538       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
 539    }
 540
 541    /* Indicates that the killed operand's live range intersects with the
 542     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
 543     * not set by liveness analysis. */
 544    constexpr void setLateKill(bool flag) noexcept
 545    {
 546       isLateKill_ = flag;
 547    }
 548
 549    constexpr bool isLateKill() const noexcept
 550    {
 551       return isLateKill_;
 552    }
 553
 554    constexpr void setKill(bool flag) noexcept
 555    {
 556       isKill_ = flag;
 557       if (!flag)
 558          setFirstKill(false);
 559    }
 560
 561    constexpr bool isKill() const noexcept
 562    {
 563       return isKill_ || isFirstKill();
 564    }
 565
 566    constexpr void setFirstKill(bool flag) noexcept
 567    {
 568       isFirstKill_ = flag;
 569       if (flag)
 570          setKill(flag);
 571    }
 572
 573    /* When there are multiple operands killing the same temporary,
 574     * isFirstKill() is only returns true for the first one. */
 575    constexpr bool isFirstKill() const noexcept
 576    {
 577       return isFirstKill_;
 578    }
 579
 580    constexpr bool isKillBeforeDef() const noexcept
 581    {
 582       return isKill() && !isLateKill();
 583    }
 584
 585    constexpr bool isFirstKillBeforeDef() const noexcept
 586    {
 587       return isFirstKill() && !isLateKill();
 588    }
 589
 590    constexpr bool operator == (Operand other) const noexcept
 591    {
 592       if (other.size() != size())
 593          return false;
 594       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
 595          return false;
 596       if (isFixed() && other.isFixed() && physReg() != other.physReg())
 597          return false;
 598       if (isLiteral())
 599          return other.isLiteral() && other.constantValue() == constantValue();
 600       else if (isConstant())
 601          return other.isConstant() && other.physReg() == physReg();
 602       else if (isUndefined())
 603          return other.isUndefined() && other.regClass() == regClass();
 604       else
 605          return other.isTemp() && other.getTemp() == getTemp();
 606    }
 607 private:
 608    union {
 609       uint32_t i;
 610       float f;
 611       Temp temp = Temp(0, s1);
 612    } data_;
 613    PhysReg reg_;
 614    union {
 615       struct {
 616          uint8_t isTemp_:1;
 617          uint8_t isFixed_:1;
 618          uint8_t isConstant_:1;
 619          uint8_t isKill_:1;
 620          uint8_t isUndef_:1;
 621          uint8_t isFirstKill_:1;
 622          uint8_t is64BitConst_:1;
 623          uint8_t isLateKill_:1;
 624       };
 625       /* can't initialize bit-fields in c++11, so work around using a union */
 626       uint8_t control_ = 0;
 627    };
 628 };
 629
 630 /**
 631  * Definition Class
 632  * Definitions are the results of Instructions
 633  * and refer to temporary virtual registers
 634  * which are later mapped to physical registers
 635  */
 636 class Definition final
 637 {
 638 public:
 639    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
 640    Definition(uint32_t index, RegClass type) noexcept
 641       : temp(index, type) {}
 642    explicit Definition(Temp tmp) noexcept
 643       : temp(tmp) {}
 644    Definition(PhysReg reg, RegClass type) noexcept
 645       : temp(Temp(0, type))
 646    {
 647       setFixed(reg);
 648    }
 649    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 650       : temp(Temp(tmpId, type))
 651    {
 652       setFixed(reg);
 653    }
 654
 655    constexpr bool isTemp() const noexcept
 656    {
 657       return tempId() > 0;
 658    }
 659
 660    constexpr Temp getTemp() const noexcept
 661    {
 662       return temp;
 663    }
 664
 665    constexpr uint32_t tempId() const noexcept
 666    {
 667       return temp.id();
 668    }
 669
 670    constexpr void setTemp(Temp t) noexcept {
 671       temp = t;
 672    }
 673
 674    constexpr RegClass regClass() const noexcept
 675    {
 676       return temp.regClass();
 677    }
 678
 679    constexpr unsigned bytes() const noexcept
 680    {
 681       return temp.bytes();
 682    }
 683
 684    constexpr unsigned size() const noexcept
 685    {
 686       return temp.size();
 687    }
 688
 689    constexpr bool isFixed() const noexcept
 690    {
 691       return isFixed_;
 692    }
 693
 694    constexpr PhysReg physReg() const noexcept
 695    {
 696       return reg_;
 697    }
 698
 699    constexpr void setFixed(PhysReg reg) noexcept
 700    {
 701       isFixed_ = 1;
 702       reg_ = reg;
 703    }
 704
 705    constexpr void setHint(PhysReg reg) noexcept
 706    {
 707       hasHint_ = 1;
 708       reg_ = reg;
 709    }
 710
 711    constexpr bool hasHint() const noexcept
 712    {
 713       return hasHint_;
 714    }
 715
 716    constexpr void setKill(bool flag) noexcept
 717    {
 718       isKill_ = flag;
 719    }
 720
 721    constexpr bool isKill() const noexcept
 722    {
 723       return isKill_;
 724    }
 725
 726 private:
 727    Temp temp = Temp(0, s1);
 728    PhysReg reg_;
 729    union {
 730       struct {
 731          uint8_t isFixed_:1;
 732          uint8_t hasHint_:1;
 733          uint8_t isKill_:1;
 734       };
 735       /* can't initialize bit-fields in c++11, so work around using a union */
 736       uint8_t control_ = 0;
 737    };
 738 };
 739
 740 class Block;
 741
 742 struct Instruction {
 743    aco_opcode opcode;
 744    Format format;
 745    uint32_t pass_flags;
 746
 747    aco::span<Operand> operands;
 748    aco::span<Definition> definitions;
 749
 750    constexpr bool isVALU() const noexcept
 751    {
 752       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 753           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 754           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 755           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 756           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 757           || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
 758    }
 759
 760    constexpr bool isSALU() const noexcept
 761    {
 762       return format == Format::SOP1 ||
 763              format == Format::SOP2 ||
 764              format == Format::SOPC ||
 765              format == Format::SOPK ||
 766              format == Format::SOPP;
 767    }
 768
 769    constexpr bool isVMEM() const noexcept
 770    {
 771       return format == Format::MTBUF ||
 772              format == Format::MUBUF ||
 773              format == Format::MIMG;
 774    }
 775
 776    constexpr bool isDPP() const noexcept
 777    {
 778       return (uint16_t) format & (uint16_t) Format::DPP;
 779    }
 780
 781    constexpr bool isVOP3() const noexcept
 782    {
 783       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 784              ((uint16_t) format & (uint16_t) Format::VOP3B) ||
 785              format == Format::VOP3P;
 786    }
 787
 788    constexpr bool isSDWA() const noexcept
 789    {
 790       return (uint16_t) format & (uint16_t) Format::SDWA;
 791    }
 792
 793    constexpr bool isFlatOrGlobal() const noexcept
 794    {
 795       return format == Format::FLAT || format == Format::GLOBAL;
 796    }
 797
 798    constexpr bool usesModifiers() const noexcept;
 799
 800    constexpr bool reads_exec() const noexcept
 801    {
 802       for (const Operand& op : operands) {
 803          if (op.isFixed() && op.physReg() == exec)
 804             return true;
 805       }
 806       return false;
 807    }
 808 };
 809 static_assert(sizeof(Instruction) == 16);
 810
 811 struct SOPK_instruction : public Instruction {
 812    uint16_t imm;
 813    uint16_t padding;
 814 };
 815 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4);
 816
 817 struct SOPP_instruction : public Instruction {
 818    uint32_t imm;
 819    int block;
 820 };
 821 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8);
 822
 823 struct SOPC_instruction : public Instruction {
 824 };
 825 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0);
 826
 827 struct SOP1_instruction : public Instruction {
 828 };
 829 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0);
 830
 831 struct SOP2_instruction : public Instruction {
 832 };
 833 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0);
 834
 835 /**
 836  * Scalar Memory Format:
 837  * For s_(buffer_)load_dword*:
 838  * Operand(0): SBASE - SGPR-pair which provides base address
 839  * Operand(1): Offset - immediate (un)signed offset or SGPR
 840  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 841  * Operand(n-1): SOffset - SGPR offset (Vega only)
 842  *
 843  * Having no operands is also valid for instructions such as s_dcache_inv.
 844  *
 845  */
 846 struct SMEM_instruction : public Instruction {
 847    barrier_interaction barrier;
 848    bool glc : 1; /* VI+: globally coherent */
 849    bool dlc : 1; /* NAVI: device level coherent */
 850    bool nv : 1; /* VEGA only: Non-volatile */
 851    bool can_reorder : 1;
 852    bool disable_wqm : 1;
 853    uint32_t padding: 19;
 854 };
 855 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4);
 856
 857 struct VOP1_instruction : public Instruction {
 858 };
 859 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0);
 860
 861 struct VOP2_instruction : public Instruction {
 862 };
 863 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0);
 864
 865 struct VOPC_instruction : public Instruction {
 866 };
 867 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0);
 868
 869 struct VOP3A_instruction : public Instruction {
 870    bool abs[3];
 871    bool neg[3];
 872    uint8_t opsel : 4;
 873    uint8_t omod : 2;
 874    bool clamp : 1;
 875    uint32_t padding : 9;
 876 };
 877 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8);
 878
 879 /**
 880  * Data Parallel Primitives Format:
 881  * This format can be used for VOP1, VOP2 or VOPC instructions.
 882  * The swizzle applies to the src0 operand.
 883  *
 884  */
 885 struct DPP_instruction : public Instruction {
 886    bool abs[2];
 887    bool neg[2];
 888    uint16_t dpp_ctrl;
 889    uint8_t row_mask : 4;
 890    uint8_t bank_mask : 4;
 891    bool bound_ctrl : 1;
 892    uint32_t padding : 7;
 893 };
 894 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8);
 895
 896 enum sdwa_sel : uint8_t {
 897     /* masks */
 898     sdwa_wordnum = 0x1,
 899     sdwa_bytenum = 0x3,
 900     sdwa_asuint = 0x7 | 0x10,
 901     sdwa_rasize = 0x3,
 902
 903     /* flags */
 904     sdwa_isword = 0x4,
 905     sdwa_sext = 0x8,
 906     sdwa_isra = 0x10,
 907
 908     /* specific values */
 909     sdwa_ubyte0 = 0,
 910     sdwa_ubyte1 = 1,
 911     sdwa_ubyte2 = 2,
 912     sdwa_ubyte3 = 3,
 913     sdwa_uword0 = sdwa_isword | 0,
 914     sdwa_uword1 = sdwa_isword | 1,
 915     sdwa_udword = 6,
 916
 917     sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
 918     sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
 919     sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
 920     sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
 921     sdwa_sword0 = sdwa_uword0 | sdwa_sext,
 922     sdwa_sword1 = sdwa_uword1 | sdwa_sext,
 923     sdwa_sdword = sdwa_udword | sdwa_sext,
 924
 925     /* register-allocated */
 926     sdwa_ubyte = 1 | sdwa_isra,
 927     sdwa_uword = 2 | sdwa_isra,
 928     sdwa_sbyte = sdwa_ubyte | sdwa_sext,
 929     sdwa_sword = sdwa_uword | sdwa_sext,
 930 };
 931
 932 /**
 933  * Sub-Dword Addressing Format:
 934  * This format can be used for VOP1, VOP2 or VOPC instructions.
 935  *
 936  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 937  * the definition doesn't have to be VCC on GFX9+.
 938  *
 939  */
 940 struct SDWA_instruction : public Instruction {
 941    /* these destination modifiers aren't available with VOPC except for
 942     * clamp on GFX8 */
 943    uint8_t sel[2];
 944    uint8_t dst_sel;
 945    bool neg[2];
 946    bool abs[2];
 947    bool dst_preserve : 1;
 948    bool clamp : 1;
 949    uint8_t omod : 2; /* GFX9+ */
 950    uint32_t padding : 4;
 951 };
 952 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8);
 953
 954 struct Interp_instruction : public Instruction {
 955    uint8_t attribute;
 956    uint8_t component;
 957    uint16_t padding;
 958 };
 959 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4);
 960
 961 /**
 962  * Local and Global Data Sharing instructions
 963  * Operand(0): ADDR - VGPR which supplies the address.
 964  * Operand(1): DATA0 - First data VGPR.
 965  * Operand(2): DATA1 - Second data VGPR.
 966  * Operand(n-1): M0 - LDS size.
 967  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 968  *
 969  */
 970 struct DS_instruction : public Instruction {
 971    int16_t offset0;
 972    int8_t offset1;
 973    bool gds;
 974 };
 975 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4);
 976
 977 /**
 978  * Vector Memory Untyped-buffer Instructions
 979  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 980  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 981  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 982  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 983  *
 984  */
 985 struct MUBUF_instruction : public Instruction {
 986    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
 987    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 988    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 989    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
 990    bool glc : 1; /* globally coherent */
 991    bool dlc : 1; /* NAVI: device level coherent */
 992    bool slc : 1; /* system level coherent */
 993    bool tfe : 1; /* texture fail enable */
 994    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
 995    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 996    bool can_reorder : 1;
 997    uint8_t padding : 2;
 998    barrier_interaction barrier;
 999 };
1000 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4);
1001
1002 /**
1003  * Vector Memory Typed-buffer Instructions
1004  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1005  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1006  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1007  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1008  *
1009  */
1010 struct MTBUF_instruction : public Instruction {
1011    uint16_t offset; /* Unsigned byte offset - 12 bit */
1012    barrier_interaction barrier;
1013    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1014    uint8_t nfmt : 3; /* Numeric format of data in memory */
1015    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1016    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1017    bool glc : 1; /* globally coherent */
1018    bool dlc : 1; /* NAVI: device level coherent */
1019    bool slc : 1; /* system level coherent */
1020    bool tfe : 1; /* texture fail enable */
1021    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1022    bool can_reorder : 1;
1023    uint32_t padding : 25;
1024 };
1025 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8);
1026
1027 /**
1028  * Vector Memory Image Instructions
1029  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1030  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1031  *             or VDATA - Vector GPR for write data.
1032  * Operand(2): VADDR - Address source. Can carry an offset or an index.
1033  * Definition(0): VDATA - Vector GPR for read result.
1034  *
1035  */
1036 struct MIMG_instruction : public Instruction {
1037    uint8_t dmask; /* Data VGPR enable mask */
1038    uint8_t dim : 3; /* NAVI: dimensionality */
1039    bool unrm : 1; /* Force address to be un-normalized */
1040    bool dlc : 1; /* NAVI: device level coherent */
1041    bool glc : 1; /* globally coherent */
1042    bool slc : 1; /* system level coherent */
1043    bool tfe : 1; /* texture fail enable */
1044    bool da : 1; /* declare an array */
1045    bool lwe : 1; /* Force data to be un-normalized */
1046    bool r128 : 1; /* NAVI: Texture resource size */
1047    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1048    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1049    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1050    bool can_reorder : 1;
1051    uint8_t padding : 1;
1052    barrier_interaction barrier;
1053 };
1054 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4);
1055
1056 /**
1057  * Flat/Scratch/Global Instructions
1058  * Operand(0): ADDR
1059  * Operand(1): SADDR
1060  * Operand(2) / Definition(0): DATA/VDST
1061  *
1062  */
1063 struct FLAT_instruction : public Instruction {
1064    uint16_t offset; /* Vega/Navi only */
1065    bool slc : 1; /* system level coherent */
1066    bool glc : 1; /* globally coherent */
1067    bool dlc : 1; /* NAVI: device level coherent */
1068    bool lds : 1;
1069    bool nv : 1;
1070    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1071    bool can_reorder : 1;
1072    uint8_t padding : 1;
1073    barrier_interaction barrier;
1074 };
1075 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4);
1076
1077 struct Export_instruction : public Instruction {
1078    uint8_t enabled_mask;
1079    uint8_t dest;
1080    bool compressed : 1;
1081    bool done : 1;
1082    bool valid_mask : 1;
1083    uint32_t padding : 13;
1084 };
1085 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4);
1086
1087 struct Pseudo_instruction : public Instruction {
1088    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1089    bool tmp_in_scc;
1090    uint8_t padding;
1091 };
1092 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4);
1093
1094 struct Pseudo_branch_instruction : public Instruction {
1095    /* target[0] is the block index of the branch target.
1096     * For conditional branches, target[1] contains the fall-through alternative.
1097     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1098     */
1099    uint32_t target[2];
1100 };
1101 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8);
1102
1103 struct Pseudo_barrier_instruction : public Instruction {
1104 };
1105 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0);
1106
1107 enum ReduceOp : uint16_t {
1108    iadd32, iadd64,
1109    imul32, imul64,
1110    fadd32, fadd64,
1111    fmul32, fmul64,
1112    imin32, imin64,
1113    imax32, imax64,
1114    umin32, umin64,
1115    umax32, umax64,
1116    fmin32, fmin64,
1117    fmax32, fmax64,
1118    iand32, iand64,
1119    ior32, ior64,
1120    ixor32, ixor64,
1121    gfx10_wave64_bpermute
1122 };
1123
1124 /**
1125  * Subgroup Reduction Instructions, everything except for the data to be
1126  * reduced and the result as inserted by setup_reduce_temp().
1127  * Operand(0): data to be reduced
1128  * Operand(1): reduce temporary
1129  * Operand(2): vector temporary
1130  * Definition(0): result
1131  * Definition(1): scalar temporary
1132  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1133  * Definition(3): scc clobber
1134  * Definition(4): vcc clobber
1135  *
1136  */
1137 struct Pseudo_reduction_instruction : public Instruction {
1138    ReduceOp reduce_op;
1139    uint16_t cluster_size; // must be 0 for scans
1140 };
1141 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4);
1142
1143 struct instr_deleter_functor {
1144    void operator()(void* p) {
1145       free(p);
1146    }
1147 };
1148
1149 template<typename T>
1150 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1151
1152 template<typename T>
1153 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1154 {
1155    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1156    char *data = (char*) calloc(1, size);
1157    T* inst = (T*) data;
1158
1159    inst->opcode = opcode;
1160    inst->format = format;
1161
1162    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1163    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1164    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1165    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1166
1167    return inst;
1168 }
1169
1170 constexpr bool Instruction::usesModifiers() const noexcept
1171 {
1172    if (isDPP() || isSDWA())
1173       return true;
1174    if (!isVOP3())
1175       return false;
1176    const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1177    for (unsigned i = 0; i < operands.size(); i++) {
1178       if (vop3->abs[i] || vop3->neg[i])
1179          return true;
1180    }
1181    return vop3->opsel || vop3->clamp || vop3->omod;
1182 }
1183
1184 constexpr bool is_phi(Instruction* instr)
1185 {
1186    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1187 }
1188
1189 static inline bool is_phi(aco_ptr<Instruction>& instr)
1190 {
1191    return is_phi(instr.get());
1192 }
1193
1194 barrier_interaction get_barrier_interaction(Instruction* instr);
1195
1196 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1197
1198 enum block_kind {
1199    /* uniform indicates that leaving this block,
1200     * all actives lanes stay active */
1201    block_kind_uniform = 1 << 0,
1202    block_kind_top_level = 1 << 1,
1203    block_kind_loop_preheader = 1 << 2,
1204    block_kind_loop_header = 1 << 3,
1205    block_kind_loop_exit = 1 << 4,
1206    block_kind_continue = 1 << 5,
1207    block_kind_break = 1 << 6,
1208    block_kind_continue_or_break = 1 << 7,
1209    block_kind_discard = 1 << 8,
1210    block_kind_branch = 1 << 9,
1211    block_kind_merge = 1 << 10,
1212    block_kind_invert = 1 << 11,
1213    block_kind_uses_discard_if = 1 << 12,
1214    block_kind_needs_lowering = 1 << 13,
1215    block_kind_uses_demote = 1 << 14,
1216    block_kind_export_end = 1 << 15,
1217 };
1218
1219
1220 struct RegisterDemand {
1221    constexpr RegisterDemand() = default;
1222    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1223       : vgpr{v}, sgpr{s} {}
1224    int16_t vgpr = 0;
1225    int16_t sgpr = 0;
1226
1227    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1228       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1229    }
1230
1231    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1232       return vgpr > other.vgpr || sgpr > other.sgpr;
1233    }
1234
1235    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1236       if (t.type() == RegType::sgpr)
1237          return RegisterDemand( vgpr, sgpr + t.size() );
1238       else
1239          return RegisterDemand( vgpr + t.size(), sgpr );
1240    }
1241
1242    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1243       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1244    }
1245
1246    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1247       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1248    }
1249
1250    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1251       vgpr += other.vgpr;
1252       sgpr += other.sgpr;
1253       return *this;
1254    }
1255
1256    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1257       vgpr -= other.vgpr;
1258       sgpr -= other.sgpr;
1259       return *this;
1260    }
1261
1262    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1263       if (t.type() == RegType::sgpr)
1264          sgpr += t.size();
1265       else
1266          vgpr += t.size();
1267       return *this;
1268    }
1269
1270    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1271       if (t.type() == RegType::sgpr)
1272          sgpr -= t.size();
1273       else
1274          vgpr -= t.size();
1275       return *this;
1276    }
1277
1278    constexpr void update(const RegisterDemand other) noexcept {
1279       vgpr = std::max(vgpr, other.vgpr);
1280       sgpr = std::max(sgpr, other.sgpr);
1281    }
1282
1283 };
1284
1285 /* CFG */
1286 struct Block {
1287    float_mode fp_mode;
1288    unsigned index;
1289    unsigned offset = 0;
1290    std::vector<aco_ptr<Instruction>> instructions;
1291    std::vector<unsigned> logical_preds;
1292    std::vector<unsigned> linear_preds;
1293    std::vector<unsigned> logical_succs;
1294    std::vector<unsigned> linear_succs;
1295    RegisterDemand register_demand = RegisterDemand();
1296    uint16_t loop_nest_depth = 0;
1297    uint16_t kind = 0;
1298    int logical_idom = -1;
1299    int linear_idom = -1;
1300    Temp live_out_exec = Temp();
1301
1302    /* this information is needed for predecessors to blocks with phis when
1303     * moving out of ssa */
1304    bool scc_live_out = false;
1305    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1306
1307    Block(unsigned idx) : index(idx) {}
1308    Block() : index(0) {}
1309 };
1310
1311 using Stage = uint16_t;
1312
1313 /* software stages */
1314 static constexpr Stage sw_vs = 1 << 0;
1315 static constexpr Stage sw_gs = 1 << 1;
1316 static constexpr Stage sw_tcs = 1 << 2;
1317 static constexpr Stage sw_tes = 1 << 3;
1318 static constexpr Stage sw_fs = 1 << 4;
1319 static constexpr Stage sw_cs = 1 << 5;
1320 static constexpr Stage sw_gs_copy = 1 << 6;
1321 static constexpr Stage sw_mask = 0x7f;
1322
1323 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1324 static constexpr Stage hw_vs = 1 << 7;
1325 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1326 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
1327 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
1328 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1329 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1330 static constexpr Stage hw_fs = 1 << 13;
1331 static constexpr Stage hw_cs = 1 << 14;
1332 static constexpr Stage hw_mask = 0xff << 7;
1333
1334 /* possible settings of Program::stage */
1335 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1336 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1337 static constexpr Stage compute_cs = sw_cs | hw_cs;
1338 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1339 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1340 /* GFX10/NGG */
1341 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
1342 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
1343 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
1344 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
1345 /* GFX9 (and GFX10 if NGG isn't used) */
1346 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1347 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1348 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1349 /* pre-GFX9 */
1350 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1351 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1352 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1353 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1354 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1355
1356 enum statistic {
1357    statistic_hash,
1358    statistic_instructions,
1359    statistic_copies,
1360    statistic_branches,
1361    statistic_cycles,
1362    statistic_vmem_clauses,
1363    statistic_smem_clauses,
1364    statistic_vmem_score,
1365    statistic_smem_score,
1366    statistic_sgpr_presched,
1367    statistic_vgpr_presched,
1368    num_statistics
1369 };
1370
1371 class Program final {
1372 public:
1373    float_mode next_fp_mode;
1374    std::vector<Block> blocks;
1375    RegisterDemand max_reg_demand = RegisterDemand();
1376    uint16_t num_waves = 0;
1377    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1378    ac_shader_config* config;
1379    struct radv_shader_info *info;
1380    enum chip_class chip_class;
1381    enum radeon_family family;
1382    unsigned wave_size;
1383    RegClass lane_mask;
1384    Stage stage; /* Stage */
1385    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1386    bool needs_wqm = false; /* there exists a p_wqm instruction */
1387    bool wb_smem_l1_on_end = false;
1388
1389    std::vector<uint8_t> constant_data;
1390    Temp private_segment_buffer;
1391    Temp scratch_offset;
1392
1393    uint16_t min_waves = 0;
1394    uint16_t lds_alloc_granule;
1395    uint32_t lds_limit; /* in bytes */
1396    bool has_16bank_lds;
1397    uint16_t vgpr_limit;
1398    uint16_t sgpr_limit;
1399    uint16_t physical_sgprs;
1400    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1401    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1402    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1403
1404    bool xnack_enabled = false;
1405
1406    bool needs_vcc = false;
1407    bool needs_flat_scr = false;
1408
1409    bool collect_statistics = false;
1410    uint32_t statistics[num_statistics];
1411
1412    uint32_t allocateId()
1413    {
1414       assert(allocationID <= 16777215);
1415       return allocationID++;
1416    }
1417
1418    uint32_t peekAllocationId()
1419    {
1420       return allocationID;
1421    }
1422
1423    void setAllocationId(uint32_t id)
1424    {
1425       allocationID = id;
1426    }
1427
1428    Block* create_and_insert_block() {
1429       blocks.emplace_back(blocks.size());
1430       blocks.back().fp_mode = next_fp_mode;
1431       return &blocks.back();
1432    }
1433
1434    Block* insert_block(Block&& block) {
1435       block.index = blocks.size();
1436       block.fp_mode = next_fp_mode;
1437       blocks.emplace_back(std::move(block));
1438       return &blocks.back();
1439    }
1440
1441 private:
1442    uint32_t allocationID = 1;
1443 };
1444
1445 struct live {
1446    /* live temps out per block */
1447    std::vector<std::set<Temp>> live_out;
1448    /* register demand (sgpr/vgpr) per instruction per block */
1449    std::vector<std::vector<RegisterDemand>> register_demand;
1450 };
1451
1452 void select_program(Program *program,
1453                     unsigned shader_count,
1454                     struct nir_shader *const *shaders,
1455                     ac_shader_config* config,
1456                     struct radv_shader_args *args);
1457 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1458                            ac_shader_config* config,
1459                            struct radv_shader_args *args);
1460
1461 void lower_wqm(Program* program, live& live_vars,
1462                const struct radv_nir_compiler_options *options);
1463 void lower_bool_phis(Program* program);
1464 void calc_min_waves(Program* program);
1465 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1466 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1467 std::vector<uint16_t> dead_code_analysis(Program *program);
1468 void dominator_tree(Program* program);
1469 void insert_exec_mask(Program *program);
1470 void value_numbering(Program* program);
1471 void optimize(Program* program);
1472 void setup_reduce_temp(Program* program);
1473 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1474 void register_allocation(Program *program, std::vector<std::set<Temp>>& live_out_per_block);
1475 void ssa_elimination(Program* program);
1476 void lower_to_hw_instr(Program* program);
1477 void schedule_program(Program* program, live& live_vars);
1478 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1479 void insert_wait_states(Program* program);
1480 void insert_NOPs(Program* program);
1481 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1482 void print_asm(Program *program, std::vector<uint32_t>& binary,
1483                unsigned exec_size, std::ostream& out);
1484 void validate(Program* program, FILE *output);
1485 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1486 #ifndef NDEBUG
1487 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1488 #else
1489 #define perfwarn(program, cond, msg, ...) do {} while(0)
1490 #endif
1491
1492 void collect_presched_stats(Program *program);
1493 void collect_preasm_stats(Program *program);
1494 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1495
1496 void aco_print_instr(Instruction *instr, FILE *output);
1497 void aco_print_program(Program *program, FILE *output);
1498
1499 /* utilities for dealing with register demand */
1500 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1501 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1502 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1503
1504 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1505 uint16_t get_extra_sgprs(Program *program);
1506
1507 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1508 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1509 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1510
1511 /* return number of addressable sgprs/vgprs for max_waves */
1512 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1513 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1514
1515 typedef struct {
1516    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1517    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1518    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1519    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1520    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1521    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1522    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1523    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1524 } Info;
1525
1526 extern const Info instr_info;
1527
1528 }
1529
1530 #endif /* ACO_IR_H */
1531