src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <bitset>
  31 #include <memory>
  32
  33 #include "nir.h"
  34 #include "ac_binary.h"
  35 #include "amd_family.h"
  36 #include "aco_opcodes.h"
  37 #include "aco_util.h"
  38
  39 struct radv_nir_compiler_options;
  40 struct radv_shader_args;
  41 struct radv_shader_info;
  42
  43 namespace aco {
  44
  45 extern uint64_t debug_flags;
  46
  47 enum {
  48    DEBUG_VALIDATE = 0x1,
  49    DEBUG_VALIDATE_RA = 0x2,
  50    DEBUG_PERFWARN = 0x4,
  51 };
  52
  53 /**
  54  * Representation of the instruction's microcode encoding format
  55  * Note: Some Vector ALU Formats can be combined, such that:
  56  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  57  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  58  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  59  *
  60  * (*) The same is applicable for VOP1 and VOPC instructions.
  61  */
  62 enum class Format : std::uint16_t {
  63    /* Pseudo Instruction Format */
  64    PSEUDO = 0,
  65    /* Scalar ALU & Control Formats */
  66    SOP1 = 1,
  67    SOP2 = 2,
  68    SOPK = 3,
  69    SOPP = 4,
  70    SOPC = 5,
  71    /* Scalar Memory Format */
  72    SMEM = 6,
  73    /* LDS/GDS Format */
  74    DS = 8,
  75    /* Vector Memory Buffer Formats */
  76    MTBUF = 9,
  77    MUBUF = 10,
  78    /* Vector Memory Image Format */
  79    MIMG = 11,
  80    /* Export Format */
  81    EXP = 12,
  82    /* Flat Formats */
  83    FLAT = 13,
  84    GLOBAL = 14,
  85    SCRATCH = 15,
  86
  87    PSEUDO_BRANCH = 16,
  88    PSEUDO_BARRIER = 17,
  89    PSEUDO_REDUCTION = 18,
  90
  91    /* Vector ALU Formats */
  92    VOP1 = 1 << 8,
  93    VOP2 = 1 << 9,
  94    VOPC = 1 << 10,
  95    VOP3 = 1 << 11,
  96    VOP3A = 1 << 11,
  97    VOP3B = 1 << 11,
  98    VOP3P = 1 << 12,
  99    /* Vector Parameter Interpolation Format */
 100    VINTRP = 1 << 13,
 101    DPP = 1 << 14,
 102    SDWA = 1 << 15,
 103 };
 104
 105 enum barrier_interaction : uint8_t {
 106    barrier_none = 0,
 107    barrier_buffer = 0x1,
 108    barrier_image = 0x2,
 109    barrier_atomic = 0x4,
 110    barrier_shared = 0x8,
 111    /* used for geometry shaders to ensure vertex data writes are before the
 112     * GS_DONE s_sendmsg. */
 113    barrier_gs_data = 0x10,
 114    /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
 115    barrier_gs_sendmsg = 0x20,
 116    /* used by barriers. created by s_barrier */
 117    barrier_barrier = 0x40,
 118    barrier_count = 6,
 119 };
 120
 121 enum fp_round {
 122    fp_round_ne = 0,
 123    fp_round_pi = 1,
 124    fp_round_ni = 2,
 125    fp_round_tz = 3,
 126 };
 127
 128 enum fp_denorm {
 129    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
 130     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
 131    fp_denorm_flush = 0x0,
 132    fp_denorm_keep = 0x3,
 133 };
 134
 135 struct float_mode {
 136    /* matches encoding of the MODE register */
 137    union {
 138       struct {
 139           fp_round round32:2;
 140           fp_round round16_64:2;
 141           unsigned denorm32:2;
 142           unsigned denorm16_64:2;
 143       };
 144       uint8_t val = 0;
 145    };
 146    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
 147    bool preserve_signed_zero_inf_nan32:1;
 148    bool preserve_signed_zero_inf_nan16_64:1;
 149    /* if false, optimizations which may remove denormal flushing can be done */
 150    bool must_flush_denorms32:1;
 151    bool must_flush_denorms16_64:1;
 152    bool care_about_round32:1;
 153    bool care_about_round16_64:1;
 154
 155    /* Returns true if instructions using the mode "other" can safely use the
 156     * current one instead. */
 157    bool canReplace(float_mode other) const noexcept {
 158       return val == other.val &&
 159              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
 160              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
 161              (must_flush_denorms32  || !other.must_flush_denorms32) &&
 162              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
 163              (care_about_round32 || !other.care_about_round32) &&
 164              (care_about_round16_64 || !other.care_about_round16_64);
 165    }
 166 };
 167
 168 constexpr Format asVOP3(Format format) {
 169    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 170 };
 171
 172 constexpr Format asSDWA(Format format) {
 173    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
 174    return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
 175 }
 176
 177 enum class RegType {
 178    none = 0,
 179    sgpr,
 180    vgpr,
 181    linear_vgpr,
 182 };
 183
 184 struct RegClass {
 185
 186    enum RC : uint8_t {
 187       s1 = 1,
 188       s2 = 2,
 189       s3 = 3,
 190       s4 = 4,
 191       s6 = 6,
 192       s8 = 8,
 193       s16 = 16,
 194       v1 = s1 | (1 << 5),
 195       v2 = s2 | (1 << 5),
 196       v3 = s3 | (1 << 5),
 197       v4 = s4 | (1 << 5),
 198       v5 = 5  | (1 << 5),
 199       v6 = 6  | (1 << 5),
 200       v7 = 7  | (1 << 5),
 201       v8 = 8  | (1 << 5),
 202       /* byte-sized register class */
 203       v1b = v1 | (1 << 7),
 204       v2b = v2 | (1 << 7),
 205       v3b = v3 | (1 << 7),
 206       v4b = v4 | (1 << 7),
 207       v6b = v6 | (1 << 7),
 208       v8b = v8 | (1 << 7),
 209       /* these are used for WWM and spills to vgpr */
 210       v1_linear = v1 | (1 << 6),
 211       v2_linear = v2 | (1 << 6),
 212    };
 213
 214    RegClass() = default;
 215    constexpr RegClass(RC rc)
 216       : rc(rc) {}
 217    constexpr RegClass(RegType type, unsigned size)
 218       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 219
 220    constexpr operator RC() const { return rc; }
 221    explicit operator bool() = delete;
 222
 223    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 224    constexpr bool is_subdword() const { return rc & (1 << 7); }
 225    constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
 226    //TODO: use size() less in favor of bytes()
 227    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
 228    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 229    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 230
 231 private:
 232    RC rc;
 233 };
 234
 235 /* transitional helper expressions */
 236 static constexpr RegClass s1{RegClass::s1};
 237 static constexpr RegClass s2{RegClass::s2};
 238 static constexpr RegClass s3{RegClass::s3};
 239 static constexpr RegClass s4{RegClass::s4};
 240 static constexpr RegClass s8{RegClass::s8};
 241 static constexpr RegClass s16{RegClass::s16};
 242 static constexpr RegClass v1{RegClass::v1};
 243 static constexpr RegClass v2{RegClass::v2};
 244 static constexpr RegClass v3{RegClass::v3};
 245 static constexpr RegClass v4{RegClass::v4};
 246 static constexpr RegClass v5{RegClass::v5};
 247 static constexpr RegClass v6{RegClass::v6};
 248 static constexpr RegClass v7{RegClass::v7};
 249 static constexpr RegClass v8{RegClass::v8};
 250 static constexpr RegClass v1b{RegClass::v1b};
 251 static constexpr RegClass v2b{RegClass::v2b};
 252 static constexpr RegClass v3b{RegClass::v3b};
 253 static constexpr RegClass v4b{RegClass::v4b};
 254 static constexpr RegClass v6b{RegClass::v6b};
 255 static constexpr RegClass v8b{RegClass::v8b};
 256
 257 /**
 258  * Temp Class
 259  * Each temporary virtual register has a
 260  * register class (i.e. size and type)
 261  * and SSA id.
 262  */
 263 struct Temp {
 264    Temp() = default;
 265    constexpr Temp(uint32_t id, RegClass cls) noexcept
 266       : id_(id), reg_class(cls) {}
 267
 268    constexpr uint32_t id() const noexcept { return id_; }
 269    constexpr RegClass regClass() const noexcept { return reg_class; }
 270
 271    constexpr unsigned bytes() const noexcept { return reg_class.bytes(); }
 272    constexpr unsigned size() const noexcept { return reg_class.size(); }
 273    constexpr RegType type() const noexcept { return reg_class.type(); }
 274    constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
 275
 276    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 277    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 278    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 279
 280 private:
 281    uint32_t id_:24;
 282    RegClass reg_class;
 283 };
 284
 285 /**
 286  * PhysReg
 287  * Represents the physical register for each
 288  * Operand and Definition.
 289  */
 290 struct PhysReg {
 291    constexpr PhysReg() = default;
 292    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
 293    constexpr unsigned reg() const { return reg_b >> 2; }
 294    constexpr unsigned byte() const { return reg_b & 0x3; }
 295    constexpr operator unsigned() const { return reg(); }
 296    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
 297    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
 298    constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
 299
 300    uint16_t reg_b = 0;
 301 };
 302
 303 /* helper expressions for special registers */
 304 static constexpr PhysReg m0{124};
 305 static constexpr PhysReg vcc{106};
 306 static constexpr PhysReg vcc_hi{107};
 307 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 308 static constexpr PhysReg exec{126};
 309 static constexpr PhysReg exec_lo{126};
 310 static constexpr PhysReg exec_hi{127};
 311 static constexpr PhysReg vccz{251};
 312 static constexpr PhysReg execz{252};
 313 static constexpr PhysReg scc{253};
 314
 315 /**
 316  * Operand Class
 317  * Initially, each Operand refers to either
 318  * a temporary virtual register
 319  * or to a constant value
 320  * Temporary registers get mapped to physical register during RA
 321  * Constant values are inlined into the instruction sequence.
 322  */
 323 class Operand final
 324 {
 325 public:
 326    constexpr Operand()
 327       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 328         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
 329         isLateKill_(false) {}
 330
 331    explicit Operand(Temp r) noexcept
 332    {
 333       data_.temp = r;
 334       if (r.id()) {
 335          isTemp_ = true;
 336       } else {
 337          isUndef_ = true;
 338          setFixed(PhysReg{128});
 339       }
 340    };
 341    explicit Operand(uint32_t v, bool is64bit = false) noexcept
 342    {
 343       data_.i = v;
 344       isConstant_ = true;
 345       is64BitConst_ = is64bit;
 346       if (v <= 64)
 347          setFixed(PhysReg{128 + v});
 348       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 349          setFixed(PhysReg{192 - v});
 350       else if (v == 0x3f000000) /* 0.5 */
 351          setFixed(PhysReg{240});
 352       else if (v == 0xbf000000) /* -0.5 */
 353          setFixed(PhysReg{241});
 354       else if (v == 0x3f800000) /* 1.0 */
 355          setFixed(PhysReg{242});
 356       else if (v == 0xbf800000) /* -1.0 */
 357          setFixed(PhysReg{243});
 358       else if (v == 0x40000000) /* 2.0 */
 359          setFixed(PhysReg{244});
 360       else if (v == 0xc0000000) /* -2.0 */
 361          setFixed(PhysReg{245});
 362       else if (v == 0x40800000) /* 4.0 */
 363          setFixed(PhysReg{246});
 364       else if (v == 0xc0800000) /* -4.0 */
 365          setFixed(PhysReg{247});
 366       else { /* Literal Constant */
 367          assert(!is64bit && "attempt to create a 64-bit literal constant");
 368          setFixed(PhysReg{255});
 369       }
 370    };
 371    explicit Operand(uint64_t v) noexcept
 372    {
 373       isConstant_ = true;
 374       is64BitConst_ = true;
 375       if (v <= 64) {
 376          data_.i = (uint32_t) v;
 377          setFixed(PhysReg{128 + (uint32_t) v});
 378       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
 379          data_.i = (uint32_t) v;
 380          setFixed(PhysReg{192 - (uint32_t) v});
 381       } else if (v == 0x3FE0000000000000) { /* 0.5 */
 382          data_.i = 0x3f000000;
 383          setFixed(PhysReg{240});
 384       } else if (v == 0xBFE0000000000000) { /* -0.5 */
 385          data_.i = 0xbf000000;
 386          setFixed(PhysReg{241});
 387       } else if (v == 0x3FF0000000000000) { /* 1.0 */
 388          data_.i = 0x3f800000;
 389          setFixed(PhysReg{242});
 390       } else if (v == 0xBFF0000000000000) { /* -1.0 */
 391          data_.i = 0xbf800000;
 392          setFixed(PhysReg{243});
 393       } else if (v == 0x4000000000000000) { /* 2.0 */
 394          data_.i = 0x40000000;
 395          setFixed(PhysReg{244});
 396       } else if (v == 0xC000000000000000) { /* -2.0 */
 397          data_.i = 0xc0000000;
 398          setFixed(PhysReg{245});
 399       } else if (v == 0x4010000000000000) { /* 4.0 */
 400          data_.i = 0x40800000;
 401          setFixed(PhysReg{246});
 402       } else if (v == 0xC010000000000000) { /* -4.0 */
 403          data_.i = 0xc0800000;
 404          setFixed(PhysReg{247});
 405       } else { /* Literal Constant: we don't know if it is a long or double.*/
 406          isConstant_ = 0;
 407          assert(false && "attempt to create a 64-bit literal constant");
 408       }
 409    };
 410    explicit Operand(RegClass type) noexcept
 411    {
 412       isUndef_ = true;
 413       data_.temp = Temp(0, type);
 414       setFixed(PhysReg{128});
 415    };
 416    explicit Operand(PhysReg reg, RegClass type) noexcept
 417    {
 418       data_.temp = Temp(0, type);
 419       setFixed(reg);
 420    }
 421
 422    constexpr bool isTemp() const noexcept
 423    {
 424       return isTemp_;
 425    }
 426
 427    constexpr void setTemp(Temp t) noexcept {
 428       assert(!isConstant_);
 429       isTemp_ = true;
 430       data_.temp = t;
 431    }
 432
 433    constexpr Temp getTemp() const noexcept
 434    {
 435       return data_.temp;
 436    }
 437
 438    constexpr uint32_t tempId() const noexcept
 439    {
 440       return data_.temp.id();
 441    }
 442
 443    constexpr bool hasRegClass() const noexcept
 444    {
 445       return isTemp() || isUndefined();
 446    }
 447
 448    constexpr RegClass regClass() const noexcept
 449    {
 450       return data_.temp.regClass();
 451    }
 452
 453    constexpr unsigned bytes() const noexcept
 454    {
 455       if (isConstant())
 456          return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
 457       else
 458          return data_.temp.bytes();
 459    }
 460
 461    constexpr unsigned size() const noexcept
 462    {
 463       if (isConstant())
 464          return is64BitConst_ ? 2 : 1;
 465       else
 466          return data_.temp.size();
 467    }
 468
 469    constexpr bool isFixed() const noexcept
 470    {
 471       return isFixed_;
 472    }
 473
 474    constexpr PhysReg physReg() const noexcept
 475    {
 476       return reg_;
 477    }
 478
 479    constexpr void setFixed(PhysReg reg) noexcept
 480    {
 481       isFixed_ = reg != unsigned(-1);
 482       reg_ = reg;
 483    }
 484
 485    constexpr bool isConstant() const noexcept
 486    {
 487       return isConstant_;
 488    }
 489
 490    constexpr bool isLiteral() const noexcept
 491    {
 492       return isConstant() && reg_ == 255;
 493    }
 494
 495    constexpr bool isUndefined() const noexcept
 496    {
 497       return isUndef_;
 498    }
 499
 500    constexpr uint32_t constantValue() const noexcept
 501    {
 502       return data_.i;
 503    }
 504
 505    constexpr bool constantEquals(uint32_t cmp) const noexcept
 506    {
 507       return isConstant() && constantValue() == cmp;
 508    }
 509
 510    constexpr uint64_t constantValue64(bool signext=false) const noexcept
 511    {
 512       if (is64BitConst_) {
 513          if (reg_ <= 192)
 514             return reg_ - 128;
 515          else if (reg_ <= 208)
 516             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
 517
 518          switch (reg_) {
 519          case 240:
 520             return 0x3FE0000000000000;
 521          case 241:
 522             return 0xBFE0000000000000;
 523          case 242:
 524             return 0x3FF0000000000000;
 525          case 243:
 526             return 0xBFF0000000000000;
 527          case 244:
 528             return 0x4000000000000000;
 529          case 245:
 530             return 0xC000000000000000;
 531          case 246:
 532             return 0x4010000000000000;
 533          case 247:
 534             return 0xC010000000000000;
 535          }
 536       }
 537       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
 538    }
 539
 540    /* Indicates that the killed operand's live range intersects with the
 541     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
 542     * not set by liveness analysis. */
 543    constexpr void setLateKill(bool flag) noexcept
 544    {
 545       isLateKill_ = flag;
 546    }
 547
 548    constexpr bool isLateKill() const noexcept
 549    {
 550       return isLateKill_;
 551    }
 552
 553    constexpr void setKill(bool flag) noexcept
 554    {
 555       isKill_ = flag;
 556       if (!flag)
 557          setFirstKill(false);
 558    }
 559
 560    constexpr bool isKill() const noexcept
 561    {
 562       return isKill_ || isFirstKill();
 563    }
 564
 565    constexpr void setFirstKill(bool flag) noexcept
 566    {
 567       isFirstKill_ = flag;
 568       if (flag)
 569          setKill(flag);
 570    }
 571
 572    /* When there are multiple operands killing the same temporary,
 573     * isFirstKill() is only returns true for the first one. */
 574    constexpr bool isFirstKill() const noexcept
 575    {
 576       return isFirstKill_;
 577    }
 578
 579    constexpr bool isKillBeforeDef() const noexcept
 580    {
 581       return isKill() && !isLateKill();
 582    }
 583
 584    constexpr bool isFirstKillBeforeDef() const noexcept
 585    {
 586       return isFirstKill() && !isLateKill();
 587    }
 588
 589    constexpr bool operator == (Operand other) const noexcept
 590    {
 591       if (other.size() != size())
 592          return false;
 593       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
 594          return false;
 595       if (isFixed() && other.isFixed() && physReg() != other.physReg())
 596          return false;
 597       if (isLiteral())
 598          return other.isLiteral() && other.constantValue() == constantValue();
 599       else if (isConstant())
 600          return other.isConstant() && other.physReg() == physReg();
 601       else if (isUndefined())
 602          return other.isUndefined() && other.regClass() == regClass();
 603       else
 604          return other.isTemp() && other.getTemp() == getTemp();
 605    }
 606 private:
 607    union {
 608       uint32_t i;
 609       float f;
 610       Temp temp = Temp(0, s1);
 611    } data_;
 612    PhysReg reg_;
 613    union {
 614       struct {
 615          uint8_t isTemp_:1;
 616          uint8_t isFixed_:1;
 617          uint8_t isConstant_:1;
 618          uint8_t isKill_:1;
 619          uint8_t isUndef_:1;
 620          uint8_t isFirstKill_:1;
 621          uint8_t is64BitConst_:1;
 622          uint8_t isLateKill_:1;
 623       };
 624       /* can't initialize bit-fields in c++11, so work around using a union */
 625       uint8_t control_ = 0;
 626    };
 627 };
 628
 629 /**
 630  * Definition Class
 631  * Definitions are the results of Instructions
 632  * and refer to temporary virtual registers
 633  * which are later mapped to physical registers
 634  */
 635 class Definition final
 636 {
 637 public:
 638    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
 639    Definition(uint32_t index, RegClass type) noexcept
 640       : temp(index, type) {}
 641    explicit Definition(Temp tmp) noexcept
 642       : temp(tmp) {}
 643    Definition(PhysReg reg, RegClass type) noexcept
 644       : temp(Temp(0, type))
 645    {
 646       setFixed(reg);
 647    }
 648    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 649       : temp(Temp(tmpId, type))
 650    {
 651       setFixed(reg);
 652    }
 653
 654    constexpr bool isTemp() const noexcept
 655    {
 656       return tempId() > 0;
 657    }
 658
 659    constexpr Temp getTemp() const noexcept
 660    {
 661       return temp;
 662    }
 663
 664    constexpr uint32_t tempId() const noexcept
 665    {
 666       return temp.id();
 667    }
 668
 669    constexpr void setTemp(Temp t) noexcept {
 670       temp = t;
 671    }
 672
 673    constexpr RegClass regClass() const noexcept
 674    {
 675       return temp.regClass();
 676    }
 677
 678    constexpr unsigned bytes() const noexcept
 679    {
 680       return temp.bytes();
 681    }
 682
 683    constexpr unsigned size() const noexcept
 684    {
 685       return temp.size();
 686    }
 687
 688    constexpr bool isFixed() const noexcept
 689    {
 690       return isFixed_;
 691    }
 692
 693    constexpr PhysReg physReg() const noexcept
 694    {
 695       return reg_;
 696    }
 697
 698    constexpr void setFixed(PhysReg reg) noexcept
 699    {
 700       isFixed_ = 1;
 701       reg_ = reg;
 702    }
 703
 704    constexpr void setHint(PhysReg reg) noexcept
 705    {
 706       hasHint_ = 1;
 707       reg_ = reg;
 708    }
 709
 710    constexpr bool hasHint() const noexcept
 711    {
 712       return hasHint_;
 713    }
 714
 715    constexpr void setKill(bool flag) noexcept
 716    {
 717       isKill_ = flag;
 718    }
 719
 720    constexpr bool isKill() const noexcept
 721    {
 722       return isKill_;
 723    }
 724
 725 private:
 726    Temp temp = Temp(0, s1);
 727    PhysReg reg_;
 728    union {
 729       struct {
 730          uint8_t isFixed_:1;
 731          uint8_t hasHint_:1;
 732          uint8_t isKill_:1;
 733       };
 734       /* can't initialize bit-fields in c++11, so work around using a union */
 735       uint8_t control_ = 0;
 736    };
 737 };
 738
 739 class Block;
 740
 741 struct Instruction {
 742    aco_opcode opcode;
 743    Format format;
 744    uint32_t pass_flags;
 745
 746    aco::span<Operand> operands;
 747    aco::span<Definition> definitions;
 748
 749    constexpr bool isVALU() const noexcept
 750    {
 751       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 752           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 753           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 754           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 755           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 756           || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
 757    }
 758
 759    constexpr bool isSALU() const noexcept
 760    {
 761       return format == Format::SOP1 ||
 762              format == Format::SOP2 ||
 763              format == Format::SOPC ||
 764              format == Format::SOPK ||
 765              format == Format::SOPP;
 766    }
 767
 768    constexpr bool isVMEM() const noexcept
 769    {
 770       return format == Format::MTBUF ||
 771              format == Format::MUBUF ||
 772              format == Format::MIMG;
 773    }
 774
 775    constexpr bool isDPP() const noexcept
 776    {
 777       return (uint16_t) format & (uint16_t) Format::DPP;
 778    }
 779
 780    constexpr bool isVOP3() const noexcept
 781    {
 782       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 783              ((uint16_t) format & (uint16_t) Format::VOP3B) ||
 784              format == Format::VOP3P;
 785    }
 786
 787    constexpr bool isSDWA() const noexcept
 788    {
 789       return (uint16_t) format & (uint16_t) Format::SDWA;
 790    }
 791
 792    constexpr bool isFlatOrGlobal() const noexcept
 793    {
 794       return format == Format::FLAT || format == Format::GLOBAL;
 795    }
 796
 797    constexpr bool usesModifiers() const noexcept;
 798
 799    constexpr bool reads_exec() const noexcept
 800    {
 801       for (const Operand& op : operands) {
 802          if (op.isFixed() && op.physReg() == exec)
 803             return true;
 804       }
 805       return false;
 806    }
 807 };
 808
 809 struct SOPK_instruction : public Instruction {
 810    uint16_t imm;
 811 };
 812
 813 struct SOPP_instruction : public Instruction {
 814    uint32_t imm;
 815    int block;
 816 };
 817
 818 struct SOPC_instruction : public Instruction {
 819 };
 820
 821 struct SOP1_instruction : public Instruction {
 822 };
 823
 824 struct SOP2_instruction : public Instruction {
 825 };
 826
 827 /**
 828  * Scalar Memory Format:
 829  * For s_(buffer_)load_dword*:
 830  * Operand(0): SBASE - SGPR-pair which provides base address
 831  * Operand(1): Offset - immediate (un)signed offset or SGPR
 832  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 833  * Operand(n-1): SOffset - SGPR offset (Vega only)
 834  *
 835  * Having no operands is also valid for instructions such as s_dcache_inv.
 836  *
 837  */
 838 struct SMEM_instruction : public Instruction {
 839    bool glc : 1; /* VI+: globally coherent */
 840    bool dlc : 1; /* NAVI: device level coherent */
 841    bool nv : 1; /* VEGA only: Non-volatile */
 842    bool can_reorder : 1;
 843    bool disable_wqm : 1;
 844    barrier_interaction barrier;
 845 };
 846
 847 struct VOP1_instruction : public Instruction {
 848 };
 849
 850 struct VOP2_instruction : public Instruction {
 851 };
 852
 853 struct VOPC_instruction : public Instruction {
 854 };
 855
 856 struct VOP3A_instruction : public Instruction {
 857    bool abs[3];
 858    bool neg[3];
 859    uint8_t opsel : 4;
 860    uint8_t omod : 2;
 861    bool clamp : 1;
 862 };
 863
 864 /**
 865  * Data Parallel Primitives Format:
 866  * This format can be used for VOP1, VOP2 or VOPC instructions.
 867  * The swizzle applies to the src0 operand.
 868  *
 869  */
 870 struct DPP_instruction : public Instruction {
 871    bool abs[2];
 872    bool neg[2];
 873    uint16_t dpp_ctrl;
 874    uint8_t row_mask : 4;
 875    uint8_t bank_mask : 4;
 876    bool bound_ctrl : 1;
 877 };
 878
 879 enum sdwa_sel : uint8_t {
 880     /* masks */
 881     sdwa_wordnum = 0x1,
 882     sdwa_bytenum = 0x3,
 883     sdwa_asuint = 0x7 | 0x10,
 884     sdwa_rasize = 0x3,
 885
 886     /* flags */
 887     sdwa_isword = 0x4,
 888     sdwa_sext = 0x8,
 889     sdwa_isra = 0x10,
 890
 891     /* specific values */
 892     sdwa_ubyte0 = 0,
 893     sdwa_ubyte1 = 1,
 894     sdwa_ubyte2 = 2,
 895     sdwa_ubyte3 = 3,
 896     sdwa_uword0 = sdwa_isword | 0,
 897     sdwa_uword1 = sdwa_isword | 1,
 898     sdwa_udword = 6,
 899
 900     sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
 901     sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
 902     sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
 903     sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
 904     sdwa_sword0 = sdwa_uword0 | sdwa_sext,
 905     sdwa_sword1 = sdwa_uword1 | sdwa_sext,
 906     sdwa_sdword = sdwa_udword | sdwa_sext,
 907
 908     /* register-allocated */
 909     sdwa_ubyte = 1 | sdwa_isra,
 910     sdwa_uword = 2 | sdwa_isra,
 911     sdwa_sbyte = sdwa_ubyte | sdwa_sext,
 912     sdwa_sword = sdwa_uword | sdwa_sext,
 913 };
 914
 915 /**
 916  * Sub-Dword Addressing Format:
 917  * This format can be used for VOP1, VOP2 or VOPC instructions.
 918  *
 919  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 920  * the definition doesn't have to be VCC on GFX9+.
 921  *
 922  */
 923 struct SDWA_instruction : public Instruction {
 924    /* these destination modifiers aren't available with VOPC except for
 925     * clamp on GFX8 */
 926    unsigned dst_sel:8;
 927    bool dst_preserve:1;
 928    bool clamp:1;
 929    unsigned omod:2; /* GFX9+ */
 930
 931    unsigned sel[2];
 932    bool neg[2];
 933    bool abs[2];
 934 };
 935
 936 struct Interp_instruction : public Instruction {
 937    uint8_t attribute;
 938    uint8_t component;
 939 };
 940
 941 /**
 942  * Local and Global Data Sharing instructions
 943  * Operand(0): ADDR - VGPR which supplies the address.
 944  * Operand(1): DATA0 - First data VGPR.
 945  * Operand(2): DATA1 - Second data VGPR.
 946  * Operand(n-1): M0 - LDS size.
 947  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 948  *
 949  */
 950 struct DS_instruction : public Instruction {
 951    int16_t offset0;
 952    int8_t offset1;
 953    bool gds;
 954 };
 955
 956 /**
 957  * Vector Memory Untyped-buffer Instructions
 958  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 959  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 960  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 961  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 962  *
 963  */
 964 struct MUBUF_instruction : public Instruction {
 965    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
 966    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 967    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 968    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
 969    bool glc : 1; /* globally coherent */
 970    bool dlc : 1; /* NAVI: device level coherent */
 971    bool slc : 1; /* system level coherent */
 972    bool tfe : 1; /* texture fail enable */
 973    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
 974    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 975    bool can_reorder : 1;
 976    barrier_interaction barrier;
 977 };
 978
 979 /**
 980  * Vector Memory Typed-buffer Instructions
 981  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 982  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 983  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 984  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 985  *
 986  */
 987 struct MTBUF_instruction : public Instruction {
 988    uint16_t offset; /* Unsigned byte offset - 12 bit */
 989    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
 990    uint8_t nfmt : 3; /* Numeric format of data in memory */
 991    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 992    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 993    bool glc : 1; /* globally coherent */
 994    bool dlc : 1; /* NAVI: device level coherent */
 995    bool slc : 1; /* system level coherent */
 996    bool tfe : 1; /* texture fail enable */
 997    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 998    bool can_reorder : 1;
 999    barrier_interaction barrier;
1000 };
1001
1002 /**
1003  * Vector Memory Image Instructions
1004  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1005  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1006  *             or VDATA - Vector GPR for write data.
1007  * Operand(2): VADDR - Address source. Can carry an offset or an index.
1008  * Definition(0): VDATA - Vector GPR for read result.
1009  *
1010  */
1011 struct MIMG_instruction : public Instruction {
1012    uint8_t dmask; /* Data VGPR enable mask */
1013    uint8_t dim : 3; /* NAVI: dimensionality */
1014    bool unrm : 1; /* Force address to be un-normalized */
1015    bool dlc : 1; /* NAVI: device level coherent */
1016    bool glc : 1; /* globally coherent */
1017    bool slc : 1; /* system level coherent */
1018    bool tfe : 1; /* texture fail enable */
1019    bool da : 1; /* declare an array */
1020    bool lwe : 1; /* Force data to be un-normalized */
1021    bool r128 : 1; /* NAVI: Texture resource size */
1022    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1023    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1024    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1025    bool can_reorder : 1;
1026    barrier_interaction barrier;
1027 };
1028
1029 /**
1030  * Flat/Scratch/Global Instructions
1031  * Operand(0): ADDR
1032  * Operand(1): SADDR
1033  * Operand(2) / Definition(0): DATA/VDST
1034  *
1035  */
1036 struct FLAT_instruction : public Instruction {
1037    uint16_t offset; /* Vega/Navi only */
1038    bool slc : 1; /* system level coherent */
1039    bool glc : 1; /* globally coherent */
1040    bool dlc : 1; /* NAVI: device level coherent */
1041    bool lds : 1;
1042    bool nv : 1;
1043    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1044    bool can_reorder : 1;
1045    barrier_interaction barrier;
1046 };
1047
1048 struct Export_instruction : public Instruction {
1049    uint8_t enabled_mask;
1050    uint8_t dest;
1051    bool compressed : 1;
1052    bool done : 1;
1053    bool valid_mask : 1;
1054 };
1055
1056 struct Pseudo_instruction : public Instruction {
1057    bool tmp_in_scc;
1058    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1059 };
1060
1061 struct Pseudo_branch_instruction : public Instruction {
1062    /* target[0] is the block index of the branch target.
1063     * For conditional branches, target[1] contains the fall-through alternative.
1064     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1065     */
1066    uint32_t target[2];
1067 };
1068
1069 struct Pseudo_barrier_instruction : public Instruction {
1070 };
1071
1072 enum ReduceOp {
1073    iadd32, iadd64,
1074    imul32, imul64,
1075    fadd32, fadd64,
1076    fmul32, fmul64,
1077    imin32, imin64,
1078    imax32, imax64,
1079    umin32, umin64,
1080    umax32, umax64,
1081    fmin32, fmin64,
1082    fmax32, fmax64,
1083    iand32, iand64,
1084    ior32, ior64,
1085    ixor32, ixor64,
1086    gfx10_wave64_bpermute
1087 };
1088
1089 /**
1090  * Subgroup Reduction Instructions, everything except for the data to be
1091  * reduced and the result as inserted by setup_reduce_temp().
1092  * Operand(0): data to be reduced
1093  * Operand(1): reduce temporary
1094  * Operand(2): vector temporary
1095  * Definition(0): result
1096  * Definition(1): scalar temporary
1097  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1098  * Definition(3): scc clobber
1099  * Definition(4): vcc clobber
1100  *
1101  */
1102 struct Pseudo_reduction_instruction : public Instruction {
1103    ReduceOp reduce_op;
1104    unsigned cluster_size; // must be 0 for scans
1105 };
1106
1107 struct instr_deleter_functor {
1108    void operator()(void* p) {
1109       free(p);
1110    }
1111 };
1112
1113 template<typename T>
1114 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1115
1116 template<typename T>
1117 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1118 {
1119    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1120    char *data = (char*) calloc(1, size);
1121    T* inst = (T*) data;
1122
1123    inst->opcode = opcode;
1124    inst->format = format;
1125
1126    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1127    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1128    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1129    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1130
1131    return inst;
1132 }
1133
1134 constexpr bool Instruction::usesModifiers() const noexcept
1135 {
1136    if (isDPP() || isSDWA())
1137       return true;
1138    if (!isVOP3())
1139       return false;
1140    const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1141    for (unsigned i = 0; i < operands.size(); i++) {
1142       if (vop3->abs[i] || vop3->neg[i])
1143          return true;
1144    }
1145    return vop3->opsel || vop3->clamp || vop3->omod;
1146 }
1147
1148 constexpr bool is_phi(Instruction* instr)
1149 {
1150    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1151 }
1152
1153 static inline bool is_phi(aco_ptr<Instruction>& instr)
1154 {
1155    return is_phi(instr.get());
1156 }
1157
1158 barrier_interaction get_barrier_interaction(Instruction* instr);
1159
1160 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1161
1162 enum block_kind {
1163    /* uniform indicates that leaving this block,
1164     * all actives lanes stay active */
1165    block_kind_uniform = 1 << 0,
1166    block_kind_top_level = 1 << 1,
1167    block_kind_loop_preheader = 1 << 2,
1168    block_kind_loop_header = 1 << 3,
1169    block_kind_loop_exit = 1 << 4,
1170    block_kind_continue = 1 << 5,
1171    block_kind_break = 1 << 6,
1172    block_kind_continue_or_break = 1 << 7,
1173    block_kind_discard = 1 << 8,
1174    block_kind_branch = 1 << 9,
1175    block_kind_merge = 1 << 10,
1176    block_kind_invert = 1 << 11,
1177    block_kind_uses_discard_if = 1 << 12,
1178    block_kind_needs_lowering = 1 << 13,
1179    block_kind_uses_demote = 1 << 14,
1180    block_kind_export_end = 1 << 15,
1181 };
1182
1183
1184 struct RegisterDemand {
1185    constexpr RegisterDemand() = default;
1186    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1187       : vgpr{v}, sgpr{s} {}
1188    int16_t vgpr = 0;
1189    int16_t sgpr = 0;
1190
1191    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1192       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1193    }
1194
1195    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1196       return vgpr > other.vgpr || sgpr > other.sgpr;
1197    }
1198
1199    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1200       if (t.type() == RegType::sgpr)
1201          return RegisterDemand( vgpr, sgpr + t.size() );
1202       else
1203          return RegisterDemand( vgpr + t.size(), sgpr );
1204    }
1205
1206    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1207       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1208    }
1209
1210    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1211       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1212    }
1213
1214    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1215       vgpr += other.vgpr;
1216       sgpr += other.sgpr;
1217       return *this;
1218    }
1219
1220    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1221       vgpr -= other.vgpr;
1222       sgpr -= other.sgpr;
1223       return *this;
1224    }
1225
1226    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1227       if (t.type() == RegType::sgpr)
1228          sgpr += t.size();
1229       else
1230          vgpr += t.size();
1231       return *this;
1232    }
1233
1234    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1235       if (t.type() == RegType::sgpr)
1236          sgpr -= t.size();
1237       else
1238          vgpr -= t.size();
1239       return *this;
1240    }
1241
1242    constexpr void update(const RegisterDemand other) noexcept {
1243       vgpr = std::max(vgpr, other.vgpr);
1244       sgpr = std::max(sgpr, other.sgpr);
1245    }
1246
1247 };
1248
1249 /* CFG */
1250 struct Block {
1251    float_mode fp_mode;
1252    unsigned index;
1253    unsigned offset = 0;
1254    std::vector<aco_ptr<Instruction>> instructions;
1255    std::vector<unsigned> logical_preds;
1256    std::vector<unsigned> linear_preds;
1257    std::vector<unsigned> logical_succs;
1258    std::vector<unsigned> linear_succs;
1259    RegisterDemand register_demand = RegisterDemand();
1260    uint16_t loop_nest_depth = 0;
1261    uint16_t kind = 0;
1262    int logical_idom = -1;
1263    int linear_idom = -1;
1264    Temp live_out_exec = Temp();
1265
1266    /* this information is needed for predecessors to blocks with phis when
1267     * moving out of ssa */
1268    bool scc_live_out = false;
1269    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1270
1271    Block(unsigned idx) : index(idx) {}
1272    Block() : index(0) {}
1273 };
1274
1275 using Stage = uint16_t;
1276
1277 /* software stages */
1278 static constexpr Stage sw_vs = 1 << 0;
1279 static constexpr Stage sw_gs = 1 << 1;
1280 static constexpr Stage sw_tcs = 1 << 2;
1281 static constexpr Stage sw_tes = 1 << 3;
1282 static constexpr Stage sw_fs = 1 << 4;
1283 static constexpr Stage sw_cs = 1 << 5;
1284 static constexpr Stage sw_gs_copy = 1 << 6;
1285 static constexpr Stage sw_mask = 0x7f;
1286
1287 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1288 static constexpr Stage hw_vs = 1 << 7;
1289 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1290 static constexpr Stage hw_gs = 1 << 9;
1291 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1292 static constexpr Stage hw_hs = 1 << 11;
1293 static constexpr Stage hw_fs = 1 << 12;
1294 static constexpr Stage hw_cs = 1 << 13;
1295 static constexpr Stage hw_mask = 0x7f << 7;
1296
1297 /* possible settings of Program::stage */
1298 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1299 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1300 static constexpr Stage compute_cs = sw_cs | hw_cs;
1301 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1302 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1303 /* GFX10/NGG */
1304 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1305 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1306 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1307 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1308 /* GFX9 (and GFX10 if NGG isn't used) */
1309 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1310 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1311 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1312 /* pre-GFX9 */
1313 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1314 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1315 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1316 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1317 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1318
1319 enum statistic {
1320    statistic_hash,
1321    statistic_instructions,
1322    statistic_copies,
1323    statistic_branches,
1324    statistic_cycles,
1325    statistic_vmem_clauses,
1326    statistic_smem_clauses,
1327    statistic_vmem_score,
1328    statistic_smem_score,
1329    statistic_sgpr_presched,
1330    statistic_vgpr_presched,
1331    num_statistics
1332 };
1333
1334 class Program final {
1335 public:
1336    float_mode next_fp_mode;
1337    std::vector<Block> blocks;
1338    RegisterDemand max_reg_demand = RegisterDemand();
1339    uint16_t num_waves = 0;
1340    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1341    ac_shader_config* config;
1342    struct radv_shader_info *info;
1343    enum chip_class chip_class;
1344    enum radeon_family family;
1345    unsigned wave_size;
1346    RegClass lane_mask;
1347    Stage stage; /* Stage */
1348    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1349    bool needs_wqm = false; /* there exists a p_wqm instruction */
1350    bool wb_smem_l1_on_end = false;
1351
1352    std::vector<uint8_t> constant_data;
1353    Temp private_segment_buffer;
1354    Temp scratch_offset;
1355
1356    uint16_t min_waves = 0;
1357    uint16_t lds_alloc_granule;
1358    uint32_t lds_limit; /* in bytes */
1359    bool has_16bank_lds;
1360    uint16_t vgpr_limit;
1361    uint16_t sgpr_limit;
1362    uint16_t physical_sgprs;
1363    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1364    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1365    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1366
1367    bool xnack_enabled = false;
1368
1369    bool needs_vcc = false;
1370    bool needs_flat_scr = false;
1371
1372    bool collect_statistics = false;
1373    uint32_t statistics[num_statistics];
1374
1375    uint32_t allocateId()
1376    {
1377       assert(allocationID <= 16777215);
1378       return allocationID++;
1379    }
1380
1381    uint32_t peekAllocationId()
1382    {
1383       return allocationID;
1384    }
1385
1386    void setAllocationId(uint32_t id)
1387    {
1388       allocationID = id;
1389    }
1390
1391    Block* create_and_insert_block() {
1392       blocks.emplace_back(blocks.size());
1393       blocks.back().fp_mode = next_fp_mode;
1394       return &blocks.back();
1395    }
1396
1397    Block* insert_block(Block&& block) {
1398       block.index = blocks.size();
1399       block.fp_mode = next_fp_mode;
1400       blocks.emplace_back(std::move(block));
1401       return &blocks.back();
1402    }
1403
1404 private:
1405    uint32_t allocationID = 1;
1406 };
1407
1408 struct live {
1409    /* live temps out per block */
1410    std::vector<std::set<Temp>> live_out;
1411    /* register demand (sgpr/vgpr) per instruction per block */
1412    std::vector<std::vector<RegisterDemand>> register_demand;
1413 };
1414
1415 void select_program(Program *program,
1416                     unsigned shader_count,
1417                     struct nir_shader *const *shaders,
1418                     ac_shader_config* config,
1419                     struct radv_shader_args *args);
1420 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1421                            ac_shader_config* config,
1422                            struct radv_shader_args *args);
1423
1424 void lower_wqm(Program* program, live& live_vars,
1425                const struct radv_nir_compiler_options *options);
1426 void lower_bool_phis(Program* program);
1427 void calc_min_waves(Program* program);
1428 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1429 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1430 std::vector<uint16_t> dead_code_analysis(Program *program);
1431 void dominator_tree(Program* program);
1432 void insert_exec_mask(Program *program);
1433 void value_numbering(Program* program);
1434 void optimize(Program* program);
1435 void setup_reduce_temp(Program* program);
1436 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1437 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1438 void ssa_elimination(Program* program);
1439 void lower_to_hw_instr(Program* program);
1440 void schedule_program(Program* program, live& live_vars);
1441 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1442 void insert_wait_states(Program* program);
1443 void insert_NOPs(Program* program);
1444 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1445 void print_asm(Program *program, std::vector<uint32_t>& binary,
1446                unsigned exec_size, std::ostream& out);
1447 void validate(Program* program, FILE *output);
1448 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1449 #ifndef NDEBUG
1450 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1451 #else
1452 #define perfwarn(program, cond, msg, ...) do {} while(0)
1453 #endif
1454
1455 void collect_presched_stats(Program *program);
1456 void collect_preasm_stats(Program *program);
1457 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1458
1459 void aco_print_instr(Instruction *instr, FILE *output);
1460 void aco_print_program(Program *program, FILE *output);
1461
1462 /* utilities for dealing with register demand */
1463 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1464 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1465 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1466
1467 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1468 uint16_t get_extra_sgprs(Program *program);
1469
1470 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1471 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1472 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1473
1474 /* return number of addressable sgprs/vgprs for max_waves */
1475 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1476 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1477
1478 typedef struct {
1479    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1480    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1481    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1482    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1483    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1484    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1485    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1486    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1487 } Info;
1488
1489 extern const Info instr_info;
1490
1491 }
1492
1493 #endif /* ACO_IR_H */
1494