aco: guarantee that Temp fits in 4 bytes
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
/* Global bitmask of the DEBUG_* flags below, controlling extra checks/output. */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* validate the IR between passes */
   DEBUG_VALIDATE_RA = 0x2, /* validate the register assignment */
   DEBUG_PERFWARN = 0x4,    /* emit performance warnings */
};
52
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats: single bits (>= 1 << 8) so they can be OR'd with a
    * base VOP1/VOP2/VOPC format, see asVOP3()/asSDWA() below. */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3: the A/B variants share one encoding bit */
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Classes of memory access an instruction interacts with; used by the
 * scheduler/insert_waitcnt passes to decide what may be reordered. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of distinct barrier bits above. Must be 7 so that loops iterating
    * bit indices [0, barrier_count) include barrier_barrier (== 1 << 6);
    * the previous value of 6 silently skipped it. */
   barrier_count = 7,
};
120
/* Rounding modes as encoded in the hardware MODE register (2 bits each). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero */
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE register bits must match exactly, and every
    * restriction requested by "other" must also be requested here. */
   bool canReplace(float_mode other) const noexcept {
      if (val != other.val)
         return false;
      /* each flag is an implication: other requires it -> we require it */
      return (!other.preserve_signed_zero_inf_nan32 || preserve_signed_zero_inf_nan32) &&
             (!other.preserve_signed_zero_inf_nan16_64 || preserve_signed_zero_inf_nan16_64) &&
             (!other.must_flush_denorms32 || must_flush_denorms32) &&
             (!other.must_flush_denorms16_64 || must_flush_denorms16_64) &&
             (!other.care_about_round32 || care_about_round32) &&
             (!other.care_about_round16_64 || care_about_round16_64);
   }
};
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
172 constexpr Format asSDWA(Format format) {
173 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
174 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
175 }
176
enum class RegType {
   none = 0,
   sgpr,        /* scalar general-purpose register */
   vgpr,        /* vector general-purpose register */
   linear_vgpr, /* vgpr with a linear live range (used for WWM/spills,
                 * see RegClass::v1_linear/v2_linear) */
};
183
struct RegClass {

   /* A register class is encoded in one byte:
    *   bits 0-4: size (in dwords, or in bytes when bit 7 is set)
    *   bit 5:    vgpr (otherwise sgpr)
    *   bit 6:    linear live range (sgprs are implicitly linear)
    *   bit 7:    sub-dword class (size field counts bytes) */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* Builds a dword-sized class from type + size-in-dwords. */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* All sgpr classes compare <= s16; everything above has the vgpr bit set. */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* Size in bytes: the low 5 bits count bytes for sub-dword classes,
    * dwords otherwise. */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* Size in dwords, rounding sub-dword classes up. */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

private:
   RC rc;
};
235
236 /* transitional helper expressions */
237 static constexpr RegClass s1{RegClass::s1};
238 static constexpr RegClass s2{RegClass::s2};
239 static constexpr RegClass s3{RegClass::s3};
240 static constexpr RegClass s4{RegClass::s4};
241 static constexpr RegClass s8{RegClass::s8};
242 static constexpr RegClass s16{RegClass::s16};
243 static constexpr RegClass v1{RegClass::v1};
244 static constexpr RegClass v2{RegClass::v2};
245 static constexpr RegClass v3{RegClass::v3};
246 static constexpr RegClass v4{RegClass::v4};
247 static constexpr RegClass v5{RegClass::v5};
248 static constexpr RegClass v6{RegClass::v6};
249 static constexpr RegClass v7{RegClass::v7};
250 static constexpr RegClass v8{RegClass::v8};
251 static constexpr RegClass v1b{RegClass::v1b};
252 static constexpr RegClass v2b{RegClass::v2b};
253 static constexpr RegClass v3b{RegClass::v3b};
254 static constexpr RegClass v4b{RegClass::v4b};
255 static constexpr RegClass v6b{RegClass::v6b};
256 static constexpr RegClass v8b{RegClass::v8b};
257
258 /**
259 * Temp Class
260 * Each temporary virtual register has a
261 * register class (i.e. size and type)
262 * and SSA id.
263 */
264 struct Temp {
265 Temp() noexcept : id_(0), reg_class(0) {}
266 constexpr Temp(uint32_t id, RegClass cls) noexcept
267 : id_(id), reg_class(uint8_t(cls)) {}
268
269 constexpr uint32_t id() const noexcept { return id_; }
270 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
271
272 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
273 constexpr unsigned size() const noexcept { return regClass().size(); }
274 constexpr RegType type() const noexcept { return regClass().type(); }
275 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
276
277 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
278 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
279 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
280
281 private:
282 uint32_t id_: 24;
283 uint32_t reg_class : 8;
284 };
285
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition. Stored with byte granularity:
 * reg_b == register index * 4 + byte offset.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4) {}
   /* dword-aligned register index */
   constexpr unsigned reg() const { return reg_b / 4; }
   /* byte offset within the register (0..3) */
   constexpr unsigned byte() const { return reg_b % 4; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
303
/* helper expressions for special registers (scalar source/dest encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};       /* low half of the VCC pair */
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};      /* same encoding as exec_lo */
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
315
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   /* Creates an undefined operand, fixed to PhysReg 128 (the encoding of
    * inline constant 0). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* Operand referencing the temporary 'r'. An SSA id of 0 produces an
    * undefined operand of r's register class instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant. Values representable as hardware inline constants are
    * fixed to the matching scalar-source encoding:
    *   128..192 -> integers 0..64
    *   193..208 -> integers -1..-16 (192 - v relies on unsigned wraparound)
    *   240..247 -> +/-0.5, +/-1.0, +/-2.0, +/-4.0
    * everything else becomes a literal (PhysReg 255), which is not valid
    * for 64-bit constants. */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant; only values with a hardware inline-constant encoding
    * are supported (64-bit literals do not exist). data_.i is set to the
    * 32-bit equivalent so constantValue() remains meaningful. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand carrying only a register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a physical register; the id-0 temp carries the class. */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* True for temps and undefined operands; constants have no register class. */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* Fixes the operand to a physical register; reg == -1 clears the flag. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* A constant without an inline encoding (emitted as a trailing dword). */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   /* Low 32 bits of the constant (see the constructors above). */
   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full 64-bit value: 64-bit inline constants are decoded from the fixed
    * register encoding; 32-bit constants are zero- or sign-extended. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* Marks the operand as the last use of its temporary; clearing the flag
    * also clears isFirstKill(). */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setFirstKill() implies setKill(). */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* Structural equality: compares size, fixed-ness, kill-before-def, and
    * then the constant/undef/temp identity as appropriate. */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
629
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   /* Definition of a fresh temporary with SSA id 'index'. */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* Definition fixed to a physical register; the id-0 temp carries the class. */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* An id of 0 means "no temporary" (fixed-register-only definition). */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Note: the hint shares reg_ with setFixed(), so a later setFixed()
    * overwrites the hinted register (hasHint_ stays set). */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* Set by liveness analysis when the definition is unused afterwards. */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
739
740 class Block;
741
/* Base class of all IR instructions. Operand/definition arrays live in
 * trailing storage of the same allocation (see create_instruction()). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* per-pass scratch data — meaning depends on the pass */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Vector ALU: any of the VOP* encoding bits is set (VOP3A/VOP3B alias). */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   /* Scalar ALU: exact format match (these formats are not bit flags). */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Buffer/image memory; FLAT/GLOBAL/SCRATCH are tested separately. */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   /* Defined after VOP3A_instruction below. */
   constexpr bool usesModifiers() const noexcept;

   /* True if any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
809
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit inline immediate (SIMM16 field) */
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* branch-target block index — TODO confirm value for non-branch SOPP */
};

/* The remaining scalar formats carry no extra encoding fields. */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
827
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   bool disable_wqm : 1; /* require an exec mask without helper invocations */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
847
/* Plain VOP1/VOP2/VOPC encodings carry no extra fields; modifiers require
 * the VOP3/DPP/SDWA variants below. */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
856
struct VOP3A_instruction : public Instruction {
   bool abs[3];       /* per-source absolute-value modifier */
   bool neg[3];       /* per-source negate modifier */
   uint8_t opsel : 4; /* 16-bit operand half selection — GFX9+ */
   uint8_t omod : 2;  /* output modifier (*2, *4, /2) — confirm encoding against emitter */
   bool clamp : 1;    /* clamp result to [0, 1] */
};
864
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];            /* per-source absolute-value modifier */
   bool neg[2];            /* per-source negate modifier */
   uint16_t dpp_ctrl;      /* swizzle/shift control field */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;    /* behavior for reads of invalid/inactive lanes */
};
879
/* SDWA source/destination selection values: which byte(s)/word of a dword
 * an operand reads or a result writes, plus sign-extension. */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated: the selection (byte/word position) is decided
     * during RA; only the access size is encoded here */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
915
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   unsigned dst_sel:8;   /* destination selection (sdwa_sel value) */
   bool dst_preserve:1;  /* keep the unselected destination bits intact */
   bool clamp:1;
   unsigned omod:2; /* GFX9+ */

   unsigned sel[2]; /* per-source selection — presumably sdwa_sel values, confirm at uses */
   bool neg[2];
   bool abs[2];
};
936
/* VINTRP (parameter interpolation) encoding fields. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index to interpolate */
   uint8_t component; /* component (x/y/z/w) of the attribute */
};
941
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   /* NOTE(review): DS offsets are unsigned in the ISA (16-bit offset0,
    * 8-bit offset1); the signed types here look suspicious — confirm no
    * negative values are ever stored before changing them. */
   int16_t offset0;
   int8_t offset1;
   bool gds; /* operate on GDS instead of LDS */
};
956
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
979
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
1002
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (was mislabeled as "Force data to be un-normalized") */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
1029
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* return read-data to LDS instead of VGPRs */
   bool nv : 1;  /* non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* may be moved across other memory ops — confirm exact pass semantics */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
1048
/* EXP encoding: writes shader outputs (color/position/parameter targets). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;  /* which components are written */
   uint8_t dest;          /* export target — see the EXP target encoding in the ISA */
   bool compressed : 1;   /* operands contain packed 16-bit data */
   bool done : 1;         /* this is the shader's final export of this kind */
   bool valid_mask : 1;
};
1056
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;      /* scc holds a live value across this instruction — TODO confirm against lowering */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
1061
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
1069
/* Barrier pseudo instruction; carries no extra fields beyond the format. */
struct Pseudo_barrier_instruction : public Instruction {
};
1072
/* Reduction operations, each in a 32-bit and a 64-bit variant. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: helper op for bpermute on GFX10 wave64 — see its lowering */
};
1089
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
1107
/* Instructions are allocated with calloc() (see create_instruction), so the
 * matching deleter releases the raw storage with free() — no destructor runs. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning smart pointer for calloc()-allocated instruction objects. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1116
/**
 * Allocates a zero-initialized instruction of type T together with its
 * trailing Operand and Definition arrays in one calloc'd block (released
 * with free() via instr_deleter_functor).
 *
 * The operands/definitions spans store 16-bit offsets relative to the
 * address of the span member itself, so the whole allocation must be small
 * enough for both offsets to fit in a uint16_t.
 */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   /* operands are laid out directly after T, definitions after the operands */
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* NOTE(review): allocation failure is not checked */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* byte distance from the span member to the start of its array */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1134
/* Returns true if this instruction uses any VALU modifier: DPP, SDWA, or a
 * VOP3 input modifier (abs/neg) or output modifier (opsel/clamp/omod). */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   /* only VOP3A carries per-operand abs/neg and the opsel/clamp/omod bits */
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel || vop3->clamp || vop3->omod;
}
1148
1149 constexpr bool is_phi(Instruction* instr)
1150 {
1151 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1152 }
1153
1154 static inline bool is_phi(aco_ptr<Instruction>& instr)
1155 {
1156 return is_phi(instr.get());
1157 }
1158
1159 barrier_interaction get_barrier_interaction(Instruction* instr);
1160
1161 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1162
/* Bit flags describing the role of a block in the CFG; OR'd into Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1183
1184
1185 struct RegisterDemand {
1186 constexpr RegisterDemand() = default;
1187 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1188 : vgpr{v}, sgpr{s} {}
1189 int16_t vgpr = 0;
1190 int16_t sgpr = 0;
1191
1192 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1193 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1194 }
1195
1196 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1197 return vgpr > other.vgpr || sgpr > other.sgpr;
1198 }
1199
1200 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1201 if (t.type() == RegType::sgpr)
1202 return RegisterDemand( vgpr, sgpr + t.size() );
1203 else
1204 return RegisterDemand( vgpr + t.size(), sgpr );
1205 }
1206
1207 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1208 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1209 }
1210
1211 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1212 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1213 }
1214
1215 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1216 vgpr += other.vgpr;
1217 sgpr += other.sgpr;
1218 return *this;
1219 }
1220
1221 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1222 vgpr -= other.vgpr;
1223 sgpr -= other.sgpr;
1224 return *this;
1225 }
1226
1227 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1228 if (t.type() == RegType::sgpr)
1229 sgpr += t.size();
1230 else
1231 vgpr += t.size();
1232 return *this;
1233 }
1234
1235 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1236 if (t.type() == RegType::sgpr)
1237 sgpr -= t.size();
1238 else
1239 vgpr -= t.size();
1240 return *this;
1241 }
1242
1243 constexpr void update(const RegisterDemand other) noexcept {
1244 vgpr = std::max(vgpr, other.vgpr);
1245 sgpr = std::max(sgpr, other.sgpr);
1246 }
1247
1248 };
1249
/* CFG */
struct Block {
   float_mode fp_mode; /* inherited from Program::next_fp_mode on creation */
   unsigned index; /* position of this block in Program::blocks */
   unsigned offset = 0; /* code offset — NOTE(review): presumably filled during assembly, confirm unit */
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices in the logical and linear CFGs */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* register demand of this block */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   /* immediate dominators (block indices); -1 if not computed */
   int logical_idom = -1;
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1275
/* A Stage combines exactly one hw_* hardware stage with the sw_* software
 * stage(s) merged into it; see the predefined combinations below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1320
/* Indices into Program::statistics, collected when Program::collect_statistics
 * is set (see collect_presched_stats/collect_preasm_stats/collect_postasm_stats). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1335
/* Top-level container for a shader being compiled: the CFG, target
 * information, limits and the temporary-id allocator. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to newly created/inserted blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* register class of a per-lane mask (depends on wave_size) */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by the statistic enum */

   /* Hands out a fresh temporary id. Ids are limited to 24 bits so that
    * Temp (id + register class) fits in 4 bytes. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would produce, without
    * consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Overrides the id counter. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block (inheriting next_fp_mode) and returns it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves an existing block to the end of the program, re-indexing it. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next temporary id; starts at 1 */
};
1409
/* Result of live_var_analysis(): per-block liveness and register demand. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1416
1417 void select_program(Program *program,
1418 unsigned shader_count,
1419 struct nir_shader *const *shaders,
1420 ac_shader_config* config,
1421 struct radv_shader_args *args);
1422 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1423 ac_shader_config* config,
1424 struct radv_shader_args *args);
1425
1426 void lower_wqm(Program* program, live& live_vars,
1427 const struct radv_nir_compiler_options *options);
1428 void lower_bool_phis(Program* program);
1429 void calc_min_waves(Program* program);
1430 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1431 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1432 std::vector<uint16_t> dead_code_analysis(Program *program);
1433 void dominator_tree(Program* program);
1434 void insert_exec_mask(Program *program);
1435 void value_numbering(Program* program);
1436 void optimize(Program* program);
1437 void setup_reduce_temp(Program* program);
1438 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1439 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1440 void ssa_elimination(Program* program);
1441 void lower_to_hw_instr(Program* program);
1442 void schedule_program(Program* program, live& live_vars);
1443 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1444 void insert_wait_states(Program* program);
1445 void insert_NOPs(Program* program);
1446 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1447 void print_asm(Program *program, std::vector<uint32_t>& binary,
1448 unsigned exec_size, std::ostream& out);
1449 void validate(Program* program, FILE *output);
1450 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1451 #ifndef NDEBUG
1452 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1453 #else
1454 #define perfwarn(program, cond, msg, ...) do {} while(0)
1455 #endif
1456
1457 void collect_presched_stats(Program *program);
1458 void collect_preasm_stats(Program *program);
1459 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1460
1461 void aco_print_instr(Instruction *instr, FILE *output);
1462 void aco_print_program(Program *program, FILE *output);
1463
1464 /* utilities for dealing with register demand */
1465 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1466 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1467 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1468
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1470 uint16_t get_extra_sgprs(Program *program);
1471
1472 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1473 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1474 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1475
1476 /* return number of addressable sgprs/vgprs for max_waves */
1477 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1478 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1479
/* Static per-opcode information tables, each indexed by aco_opcode
 * (see the instr_info singleton below). */
typedef struct {
   /* hardware encodings per chip generation — NOTE(review): presumably -1
    * marks opcodes that don't exist on that generation, confirm in emitter */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)]; /* human-readable opcode name */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; /* encoding format per opcode */
} Info;
1490
1491 extern const Info instr_info;
1492
1493 }
1494
1495 #endif /* ACO_IR_H */
1496