src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <bitset>
  31 #include <memory>
  32
  33 #include "nir.h"
  34 #include "ac_binary.h"
  35 #include "amd_family.h"
  36 #include "aco_opcodes.h"
  37 #include "aco_util.h"
  38
  39 struct radv_nir_compiler_options;
  40 struct radv_shader_args;
  41 struct radv_shader_info;
  42
  43 namespace aco {
  44
  45 extern uint64_t debug_flags;
  46
  47 enum {
  48    DEBUG_VALIDATE = 0x1,
  49    DEBUG_VALIDATE_RA = 0x2,
  50    DEBUG_PERFWARN = 0x4,
  51 };
  52
  53 /**
  54  * Representation of the instruction's microcode encoding format
  55  * Note: Some Vector ALU Formats can be combined, such that:
  56  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  57  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  58  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  59  *
  60  * (*) The same is applicable for VOP1 and VOPC instructions.
  61  */
  62 enum class Format : std::uint16_t {
  63    /* Pseudo Instruction Format */
  64    PSEUDO = 0,
  65    /* Scalar ALU & Control Formats */
  66    SOP1 = 1,
  67    SOP2 = 2,
  68    SOPK = 3,
  69    SOPP = 4,
  70    SOPC = 5,
  71    /* Scalar Memory Format */
  72    SMEM = 6,
  73    /* LDS/GDS Format */
  74    DS = 8,
  75    /* Vector Memory Buffer Formats */
  76    MTBUF = 9,
  77    MUBUF = 10,
  78    /* Vector Memory Image Format */
  79    MIMG = 11,
  80    /* Export Format */
  81    EXP = 12,
  82    /* Flat Formats */
  83    FLAT = 13,
  84    GLOBAL = 14,
  85    SCRATCH = 15,
  86
  87    PSEUDO_BRANCH = 16,
  88    PSEUDO_BARRIER = 17,
  89    PSEUDO_REDUCTION = 18,
  90
  91    /* Vector ALU Formats */
  92    VOP1 = 1 << 8,
  93    VOP2 = 1 << 9,
  94    VOPC = 1 << 10,
  95    VOP3 = 1 << 11,
  96    VOP3A = 1 << 11,
  97    VOP3B = 1 << 11,
  98    VOP3P = 1 << 12,
  99    /* Vector Parameter Interpolation Format */
 100    VINTRP = 1 << 13,
 101    DPP = 1 << 14,
 102    SDWA = 1 << 15,
 103 };
 104
 105 enum barrier_interaction : uint8_t {
 106    barrier_none = 0,
 107    barrier_buffer = 0x1,
 108    barrier_image = 0x2,
 109    barrier_atomic = 0x4,
 110    barrier_shared = 0x8,
 111    /* used for geometry shaders to ensure vertex data writes are before the
 112     * GS_DONE s_sendmsg. */
 113    barrier_gs_data = 0x10,
 114    /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
 115    barrier_gs_sendmsg = 0x20,
 116    /* used by barriers. created by s_barrier */
 117    barrier_barrier = 0x40,
 118    barrier_count = 6,
 119 };
 120
 121 enum fp_round {
 122    fp_round_ne = 0,
 123    fp_round_pi = 1,
 124    fp_round_ni = 2,
 125    fp_round_tz = 3,
 126 };
 127
 128 enum fp_denorm {
 129    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
 130     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
 131    fp_denorm_flush = 0x0,
 132    fp_denorm_keep = 0x3,
 133 };
 134
 135 struct float_mode {
 136    /* matches encoding of the MODE register */
 137    union {
 138       struct {
 139           fp_round round32:2;
 140           fp_round round16_64:2;
 141           unsigned denorm32:2;
 142           unsigned denorm16_64:2;
 143       };
 144       uint8_t val = 0;
 145    };
 146    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
 147    bool preserve_signed_zero_inf_nan32:1;
 148    bool preserve_signed_zero_inf_nan16_64:1;
 149    /* if false, optimizations which may remove denormal flushing can be done */
 150    bool must_flush_denorms32:1;
 151    bool must_flush_denorms16_64:1;
 152    bool care_about_round32:1;
 153    bool care_about_round16_64:1;
 154
 155    /* Returns true if instructions using the mode "other" can safely use the
 156     * current one instead. */
 157    bool canReplace(float_mode other) const noexcept {
 158       return val == other.val &&
 159              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
 160              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
 161              (must_flush_denorms32  || !other.must_flush_denorms32) &&
 162              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
 163              (care_about_round32 || !other.care_about_round32) &&
 164              (care_about_round16_64 || !other.care_about_round16_64);
 165    }
 166 };
 167
 168 constexpr Format asVOP3(Format format) {
 169    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 170 };
 171
 172 constexpr Format asSDWA(Format format) {
 173    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
 174    return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
 175 }
 176
 177 enum class RegType {
 178    none = 0,
 179    sgpr,
 180    vgpr,
 181    linear_vgpr,
 182 };
 183
 184 struct RegClass {
 185
 186    enum RC : uint8_t {
 187       s1 = 1,
 188       s2 = 2,
 189       s3 = 3,
 190       s4 = 4,
 191       s6 = 6,
 192       s8 = 8,
 193       s16 = 16,
 194       v1 = s1 | (1 << 5),
 195       v2 = s2 | (1 << 5),
 196       v3 = s3 | (1 << 5),
 197       v4 = s4 | (1 << 5),
 198       v5 = 5  | (1 << 5),
 199       v6 = 6  | (1 << 5),
 200       v7 = 7  | (1 << 5),
 201       v8 = 8  | (1 << 5),
 202       /* these are used for WWM and spills to vgpr */
 203       v1_linear = v1 | (1 << 6),
 204       v2_linear = v2 | (1 << 6),
 205    };
 206
 207    RegClass() = default;
 208    constexpr RegClass(RC rc)
 209       : rc(rc) {}
 210    constexpr RegClass(RegType type, unsigned size)
 211       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 212
 213    constexpr operator RC() const { return rc; }
 214    explicit operator bool() = delete;
 215
 216    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 217    constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
 218    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 219    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 220
 221 private:
 222    RC rc;
 223 };
 224
 225 /* transitional helper expressions */
 226 static constexpr RegClass s1{RegClass::s1};
 227 static constexpr RegClass s2{RegClass::s2};
 228 static constexpr RegClass s3{RegClass::s3};
 229 static constexpr RegClass s4{RegClass::s4};
 230 static constexpr RegClass s8{RegClass::s8};
 231 static constexpr RegClass s16{RegClass::s16};
 232 static constexpr RegClass v1{RegClass::v1};
 233 static constexpr RegClass v2{RegClass::v2};
 234 static constexpr RegClass v3{RegClass::v3};
 235 static constexpr RegClass v4{RegClass::v4};
 236 static constexpr RegClass v5{RegClass::v5};
 237 static constexpr RegClass v6{RegClass::v6};
 238 static constexpr RegClass v7{RegClass::v7};
 239 static constexpr RegClass v8{RegClass::v8};
 240
 241 /**
 242  * Temp Class
 243  * Each temporary virtual register has a
 244  * register class (i.e. size and type)
 245  * and SSA id.
 246  */
 247 struct Temp {
 248    Temp() = default;
 249    constexpr Temp(uint32_t id, RegClass cls) noexcept
 250       : id_(id), reg_class(cls) {}
 251
 252    constexpr uint32_t id() const noexcept { return id_; }
 253    constexpr RegClass regClass() const noexcept { return reg_class; }
 254
 255    constexpr unsigned size() const noexcept { return reg_class.size(); }
 256    constexpr RegType type() const noexcept { return reg_class.type(); }
 257    constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
 258
 259    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 260    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 261    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 262
 263 private:
 264    uint32_t id_:24;
 265    RegClass reg_class;
 266 };
 267
 268 /**
 269  * PhysReg
 270  * Represents the physical register for each
 271  * Operand and Definition.
 272  */
 273 struct PhysReg {
 274    constexpr PhysReg() = default;
 275    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
 276    constexpr unsigned reg() const { return reg_b >> 2; }
 277    constexpr unsigned byte() const { return reg_b & 0x3; }
 278    constexpr operator unsigned() const { return reg(); }
 279    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
 280    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
 281    constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
 282
 283    uint16_t reg_b = 0;
 284 };
 285
 286 /* helper expressions for special registers */
 287 static constexpr PhysReg m0{124};
 288 static constexpr PhysReg vcc{106};
 289 static constexpr PhysReg vcc_hi{107};
 290 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 291 static constexpr PhysReg exec{126};
 292 static constexpr PhysReg exec_lo{126};
 293 static constexpr PhysReg exec_hi{127};
 294 static constexpr PhysReg vccz{251};
 295 static constexpr PhysReg execz{252};
 296 static constexpr PhysReg scc{253};
 297
 298 /**
 299  * Operand Class
 300  * Initially, each Operand refers to either
 301  * a temporary virtual register
 302  * or to a constant value
 303  * Temporary registers get mapped to physical register during RA
 304  * Constant values are inlined into the instruction sequence.
 305  */
 306 class Operand final
 307 {
 308 public:
 309    constexpr Operand()
 310       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 311         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
 312         isLateKill_(false) {}
 313
 314    explicit Operand(Temp r) noexcept
 315    {
 316       data_.temp = r;
 317       if (r.id()) {
 318          isTemp_ = true;
 319       } else {
 320          isUndef_ = true;
 321          setFixed(PhysReg{128});
 322       }
 323    };
 324    explicit Operand(uint32_t v, bool is64bit = false) noexcept
 325    {
 326       data_.i = v;
 327       isConstant_ = true;
 328       is64BitConst_ = is64bit;
 329       if (v <= 64)
 330          setFixed(PhysReg{128 + v});
 331       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 332          setFixed(PhysReg{192 - v});
 333       else if (v == 0x3f000000) /* 0.5 */
 334          setFixed(PhysReg{240});
 335       else if (v == 0xbf000000) /* -0.5 */
 336          setFixed(PhysReg{241});
 337       else if (v == 0x3f800000) /* 1.0 */
 338          setFixed(PhysReg{242});
 339       else if (v == 0xbf800000) /* -1.0 */
 340          setFixed(PhysReg{243});
 341       else if (v == 0x40000000) /* 2.0 */
 342          setFixed(PhysReg{244});
 343       else if (v == 0xc0000000) /* -2.0 */
 344          setFixed(PhysReg{245});
 345       else if (v == 0x40800000) /* 4.0 */
 346          setFixed(PhysReg{246});
 347       else if (v == 0xc0800000) /* -4.0 */
 348          setFixed(PhysReg{247});
 349       else { /* Literal Constant */
 350          assert(!is64bit && "attempt to create a 64-bit literal constant");
 351          setFixed(PhysReg{255});
 352       }
 353    };
 354    explicit Operand(uint64_t v) noexcept
 355    {
 356       isConstant_ = true;
 357       is64BitConst_ = true;
 358       if (v <= 64) {
 359          data_.i = (uint32_t) v;
 360          setFixed(PhysReg{128 + (uint32_t) v});
 361       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
 362          data_.i = (uint32_t) v;
 363          setFixed(PhysReg{192 - (uint32_t) v});
 364       } else if (v == 0x3FE0000000000000) { /* 0.5 */
 365          data_.i = 0x3f000000;
 366          setFixed(PhysReg{240});
 367       } else if (v == 0xBFE0000000000000) { /* -0.5 */
 368          data_.i = 0xbf000000;
 369          setFixed(PhysReg{241});
 370       } else if (v == 0x3FF0000000000000) { /* 1.0 */
 371          data_.i = 0x3f800000;
 372          setFixed(PhysReg{242});
 373       } else if (v == 0xBFF0000000000000) { /* -1.0 */
 374          data_.i = 0xbf800000;
 375          setFixed(PhysReg{243});
 376       } else if (v == 0x4000000000000000) { /* 2.0 */
 377          data_.i = 0x40000000;
 378          setFixed(PhysReg{244});
 379       } else if (v == 0xC000000000000000) { /* -2.0 */
 380          data_.i = 0xc0000000;
 381          setFixed(PhysReg{245});
 382       } else if (v == 0x4010000000000000) { /* 4.0 */
 383          data_.i = 0x40800000;
 384          setFixed(PhysReg{246});
 385       } else if (v == 0xC010000000000000) { /* -4.0 */
 386          data_.i = 0xc0800000;
 387          setFixed(PhysReg{247});
 388       } else { /* Literal Constant: we don't know if it is a long or double.*/
 389          isConstant_ = 0;
 390          assert(false && "attempt to create a 64-bit literal constant");
 391       }
 392    };
 393    explicit Operand(RegClass type) noexcept
 394    {
 395       isUndef_ = true;
 396       data_.temp = Temp(0, type);
 397       setFixed(PhysReg{128});
 398    };
 399    explicit Operand(PhysReg reg, RegClass type) noexcept
 400    {
 401       data_.temp = Temp(0, type);
 402       setFixed(reg);
 403    }
 404
 405    constexpr bool isTemp() const noexcept
 406    {
 407       return isTemp_;
 408    }
 409
 410    constexpr void setTemp(Temp t) noexcept {
 411       assert(!isConstant_);
 412       isTemp_ = true;
 413       data_.temp = t;
 414    }
 415
 416    constexpr Temp getTemp() const noexcept
 417    {
 418       return data_.temp;
 419    }
 420
 421    constexpr uint32_t tempId() const noexcept
 422    {
 423       return data_.temp.id();
 424    }
 425
 426    constexpr bool hasRegClass() const noexcept
 427    {
 428       return isTemp() || isUndefined();
 429    }
 430
 431    constexpr RegClass regClass() const noexcept
 432    {
 433       return data_.temp.regClass();
 434    }
 435
 436    constexpr unsigned size() const noexcept
 437    {
 438       if (isConstant())
 439          return is64BitConst_ ? 2 : 1;
 440       else
 441          return data_.temp.size();
 442    }
 443
 444    constexpr bool isFixed() const noexcept
 445    {
 446       return isFixed_;
 447    }
 448
 449    constexpr PhysReg physReg() const noexcept
 450    {
 451       return reg_;
 452    }
 453
 454    constexpr void setFixed(PhysReg reg) noexcept
 455    {
 456       isFixed_ = reg != unsigned(-1);
 457       reg_ = reg;
 458    }
 459
 460    constexpr bool isConstant() const noexcept
 461    {
 462       return isConstant_;
 463    }
 464
 465    constexpr bool isLiteral() const noexcept
 466    {
 467       return isConstant() && reg_ == 255;
 468    }
 469
 470    constexpr bool isUndefined() const noexcept
 471    {
 472       return isUndef_;
 473    }
 474
 475    constexpr uint32_t constantValue() const noexcept
 476    {
 477       return data_.i;
 478    }
 479
 480    constexpr bool constantEquals(uint32_t cmp) const noexcept
 481    {
 482       return isConstant() && constantValue() == cmp;
 483    }
 484
 485    constexpr uint64_t constantValue64(bool signext=false) const noexcept
 486    {
 487       if (is64BitConst_) {
 488          if (reg_ <= 192)
 489             return reg_ - 128;
 490          else if (reg_ <= 208)
 491             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
 492
 493          switch (reg_) {
 494          case 240:
 495             return 0x3FE0000000000000;
 496          case 241:
 497             return 0xBFE0000000000000;
 498          case 242:
 499             return 0x3FF0000000000000;
 500          case 243:
 501             return 0xBFF0000000000000;
 502          case 244:
 503             return 0x4000000000000000;
 504          case 245:
 505             return 0xC000000000000000;
 506          case 246:
 507             return 0x4010000000000000;
 508          case 247:
 509             return 0xC010000000000000;
 510          }
 511       }
 512       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
 513    }
 514
 515    /* Indicates that the killed operand's live range intersects with the
 516     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
 517     * not set by liveness analysis. */
 518    constexpr void setLateKill(bool flag) noexcept
 519    {
 520       isLateKill_ = flag;
 521    }
 522
 523    constexpr bool isLateKill() const noexcept
 524    {
 525       return isLateKill_;
 526    }
 527
 528    constexpr void setKill(bool flag) noexcept
 529    {
 530       isKill_ = flag;
 531       if (!flag)
 532          setFirstKill(false);
 533    }
 534
 535    constexpr bool isKill() const noexcept
 536    {
 537       return isKill_ || isFirstKill();
 538    }
 539
 540    constexpr void setFirstKill(bool flag) noexcept
 541    {
 542       isFirstKill_ = flag;
 543       if (flag)
 544          setKill(flag);
 545    }
 546
 547    /* When there are multiple operands killing the same temporary,
 548     * isFirstKill() is only returns true for the first one. */
 549    constexpr bool isFirstKill() const noexcept
 550    {
 551       return isFirstKill_;
 552    }
 553
 554    constexpr bool isKillBeforeDef() const noexcept
 555    {
 556       return isKill() && !isLateKill();
 557    }
 558
 559    constexpr bool isFirstKillBeforeDef() const noexcept
 560    {
 561       return isFirstKill() && !isLateKill();
 562    }
 563
 564    constexpr bool operator == (Operand other) const noexcept
 565    {
 566       if (other.size() != size())
 567          return false;
 568       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
 569          return false;
 570       if (isFixed() && other.isFixed() && physReg() != other.physReg())
 571          return false;
 572       if (isLiteral())
 573          return other.isLiteral() && other.constantValue() == constantValue();
 574       else if (isConstant())
 575          return other.isConstant() && other.physReg() == physReg();
 576       else if (isUndefined())
 577          return other.isUndefined() && other.regClass() == regClass();
 578       else
 579          return other.isTemp() && other.getTemp() == getTemp();
 580    }
 581 private:
 582    union {
 583       uint32_t i;
 584       float f;
 585       Temp temp = Temp(0, s1);
 586    } data_;
 587    PhysReg reg_;
 588    union {
 589       struct {
 590          uint8_t isTemp_:1;
 591          uint8_t isFixed_:1;
 592          uint8_t isConstant_:1;
 593          uint8_t isKill_:1;
 594          uint8_t isUndef_:1;
 595          uint8_t isFirstKill_:1;
 596          uint8_t is64BitConst_:1;
 597          uint8_t isLateKill_:1;
 598       };
 599       /* can't initialize bit-fields in c++11, so work around using a union */
 600       uint8_t control_ = 0;
 601    };
 602 };
 603
 604 /**
 605  * Definition Class
 606  * Definitions are the results of Instructions
 607  * and refer to temporary virtual registers
 608  * which are later mapped to physical registers
 609  */
 610 class Definition final
 611 {
 612 public:
 613    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
 614    Definition(uint32_t index, RegClass type) noexcept
 615       : temp(index, type) {}
 616    explicit Definition(Temp tmp) noexcept
 617       : temp(tmp) {}
 618    Definition(PhysReg reg, RegClass type) noexcept
 619       : temp(Temp(0, type))
 620    {
 621       setFixed(reg);
 622    }
 623    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 624       : temp(Temp(tmpId, type))
 625    {
 626       setFixed(reg);
 627    }
 628
 629    constexpr bool isTemp() const noexcept
 630    {
 631       return tempId() > 0;
 632    }
 633
 634    constexpr Temp getTemp() const noexcept
 635    {
 636       return temp;
 637    }
 638
 639    constexpr uint32_t tempId() const noexcept
 640    {
 641       return temp.id();
 642    }
 643
 644    constexpr void setTemp(Temp t) noexcept {
 645       temp = t;
 646    }
 647
 648    constexpr RegClass regClass() const noexcept
 649    {
 650       return temp.regClass();
 651    }
 652
 653    constexpr unsigned size() const noexcept
 654    {
 655       return temp.size();
 656    }
 657
 658    constexpr bool isFixed() const noexcept
 659    {
 660       return isFixed_;
 661    }
 662
 663    constexpr PhysReg physReg() const noexcept
 664    {
 665       return reg_;
 666    }
 667
 668    constexpr void setFixed(PhysReg reg) noexcept
 669    {
 670       isFixed_ = 1;
 671       reg_ = reg;
 672    }
 673
 674    constexpr void setHint(PhysReg reg) noexcept
 675    {
 676       hasHint_ = 1;
 677       reg_ = reg;
 678    }
 679
 680    constexpr bool hasHint() const noexcept
 681    {
 682       return hasHint_;
 683    }
 684
 685    constexpr void setKill(bool flag) noexcept
 686    {
 687       isKill_ = flag;
 688    }
 689
 690    constexpr bool isKill() const noexcept
 691    {
 692       return isKill_;
 693    }
 694
 695 private:
 696    Temp temp = Temp(0, s1);
 697    PhysReg reg_;
 698    union {
 699       struct {
 700          uint8_t isFixed_:1;
 701          uint8_t hasHint_:1;
 702          uint8_t isKill_:1;
 703       };
 704       /* can't initialize bit-fields in c++11, so work around using a union */
 705       uint8_t control_ = 0;
 706    };
 707 };
 708
 709 class Block;
 710
 711 struct Instruction {
 712    aco_opcode opcode;
 713    Format format;
 714    uint32_t pass_flags;
 715
 716    aco::span<Operand> operands;
 717    aco::span<Definition> definitions;
 718
 719    constexpr bool isVALU() const noexcept
 720    {
 721       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 722           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 723           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 724           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 725           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 726           || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
 727    }
 728
 729    constexpr bool isSALU() const noexcept
 730    {
 731       return format == Format::SOP1 ||
 732              format == Format::SOP2 ||
 733              format == Format::SOPC ||
 734              format == Format::SOPK ||
 735              format == Format::SOPP;
 736    }
 737
 738    constexpr bool isVMEM() const noexcept
 739    {
 740       return format == Format::MTBUF ||
 741              format == Format::MUBUF ||
 742              format == Format::MIMG;
 743    }
 744
 745    constexpr bool isDPP() const noexcept
 746    {
 747       return (uint16_t) format & (uint16_t) Format::DPP;
 748    }
 749
 750    constexpr bool isVOP3() const noexcept
 751    {
 752       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 753              ((uint16_t) format & (uint16_t) Format::VOP3B) ||
 754              format == Format::VOP3P;
 755    }
 756
 757    constexpr bool isSDWA() const noexcept
 758    {
 759       return (uint16_t) format & (uint16_t) Format::SDWA;
 760    }
 761
 762    constexpr bool isFlatOrGlobal() const noexcept
 763    {
 764       return format == Format::FLAT || format == Format::GLOBAL;
 765    }
 766
 767    constexpr bool usesModifiers() const noexcept;
 768
 769    constexpr bool reads_exec() const noexcept
 770    {
 771       for (const Operand& op : operands) {
 772          if (op.isFixed() && op.physReg() == exec)
 773             return true;
 774       }
 775       return false;
 776    }
 777 };
 778
 779 struct SOPK_instruction : public Instruction {
 780    uint16_t imm;
 781 };
 782
 783 struct SOPP_instruction : public Instruction {
 784    uint32_t imm;
 785    int block;
 786 };
 787
 788 struct SOPC_instruction : public Instruction {
 789 };
 790
 791 struct SOP1_instruction : public Instruction {
 792 };
 793
 794 struct SOP2_instruction : public Instruction {
 795 };
 796
 797 /**
 798  * Scalar Memory Format:
 799  * For s_(buffer_)load_dword*:
 800  * Operand(0): SBASE - SGPR-pair which provides base address
 801  * Operand(1): Offset - immediate (un)signed offset or SGPR
 802  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 803  * Operand(n-1): SOffset - SGPR offset (Vega only)
 804  *
 805  * Having no operands is also valid for instructions such as s_dcache_inv.
 806  *
 807  */
 808 struct SMEM_instruction : public Instruction {
 809    bool glc : 1; /* VI+: globally coherent */
 810    bool dlc : 1; /* NAVI: device level coherent */
 811    bool nv : 1; /* VEGA only: Non-volatile */
 812    bool can_reorder : 1;
 813    bool disable_wqm : 1;
 814    barrier_interaction barrier;
 815 };
 816
 817 struct VOP1_instruction : public Instruction {
 818 };
 819
 820 struct VOP2_instruction : public Instruction {
 821 };
 822
 823 struct VOPC_instruction : public Instruction {
 824 };
 825
 826 struct VOP3A_instruction : public Instruction {
 827    bool abs[3];
 828    bool neg[3];
 829    uint8_t opsel : 4;
 830    uint8_t omod : 2;
 831    bool clamp : 1;
 832 };
 833
 834 /**
 835  * Data Parallel Primitives Format:
 836  * This format can be used for VOP1, VOP2 or VOPC instructions.
 837  * The swizzle applies to the src0 operand.
 838  *
 839  */
 840 struct DPP_instruction : public Instruction {
 841    bool abs[2];
 842    bool neg[2];
 843    uint16_t dpp_ctrl;
 844    uint8_t row_mask : 4;
 845    uint8_t bank_mask : 4;
 846    bool bound_ctrl : 1;
 847 };
 848
 849 enum sdwa_sel : uint8_t {
 850     /* masks */
 851     sdwa_wordnum = 0x1,
 852     sdwa_bytenum = 0x3,
 853     sdwa_asuint = 0x7,
 854
 855     /* flags */
 856     sdwa_isword = 0x4,
 857     sdwa_sext = 0x8,
 858
 859     /* specific values */
 860     sdwa_ubyte0 = 0,
 861     sdwa_ubyte1 = 1,
 862     sdwa_ubyte2 = 2,
 863     sdwa_ubyte3 = 3,
 864     sdwa_uword0 = sdwa_isword | 0,
 865     sdwa_uword1 = sdwa_isword | 1,
 866     sdwa_udword = 6,
 867
 868     sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
 869     sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
 870     sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
 871     sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
 872     sdwa_sword0 = sdwa_uword0 | sdwa_sext,
 873     sdwa_sword1 = sdwa_uword1 | sdwa_sext,
 874     sdwa_sdword = sdwa_udword | sdwa_sext,
 875 };
 876
 877 /**
 878  * Sub-Dword Addressing Format:
 879  * This format can be used for VOP1, VOP2 or VOPC instructions.
 880  *
 881  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 882  * the definition doesn't have to be VCC on GFX9+.
 883  *
 884  */
 885 struct SDWA_instruction : public Instruction {
 886    /* these destination modifiers aren't available with VOPC except for
 887     * clamp on GFX8 */
 888    unsigned dst_sel:4;
 889    bool dst_preserve:1;
 890    bool clamp:1;
 891    unsigned omod:2; /* GFX9+ */
 892
 893    unsigned sel[2];
 894    bool neg[2];
 895    bool abs[2];
 896 };
 897
 898 struct Interp_instruction : public Instruction {
 899    uint8_t attribute;
 900    uint8_t component;
 901 };
 902
 903 /**
 904  * Local and Global Data Sharing instructions
 905  * Operand(0): ADDR - VGPR which supplies the address.
 906  * Operand(1): DATA0 - First data VGPR.
 907  * Operand(2): DATA1 - Second data VGPR.
 908  * Operand(n-1): M0 - LDS size.
 909  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 910  *
 911  */
 912 struct DS_instruction : public Instruction {
 913    int16_t offset0;
 914    int8_t offset1;
 915    bool gds;
 916 };
 917
 918 /**
 919  * Vector Memory Untyped-buffer Instructions
 920  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 921  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 922  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 923  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 924  *
 925  */
 926 struct MUBUF_instruction : public Instruction {
 927    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
 928    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 929    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 930    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
 931    bool glc : 1; /* globally coherent */
 932    bool dlc : 1; /* NAVI: device level coherent */
 933    bool slc : 1; /* system level coherent */
 934    bool tfe : 1; /* texture fail enable */
 935    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
 936    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 937    bool can_reorder : 1;
 938    barrier_interaction barrier;
 939 };
 940
 941 /**
 942  * Vector Memory Typed-buffer Instructions
 943  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 944  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 945  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 946  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 947  *
 948  */
 949 struct MTBUF_instruction : public Instruction {
 950    uint16_t offset; /* Unsigned byte offset - 12 bit */
 951    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
 952    uint8_t nfmt : 3; /* Numeric format of data in memory */
 953    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 954    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 955    bool glc : 1; /* globally coherent */
 956    bool dlc : 1; /* NAVI: device level coherent */
 957    bool slc : 1; /* system level coherent */
 958    bool tfe : 1; /* texture fail enable */
 959    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 960    bool can_reorder : 1;
 961    barrier_interaction barrier;
 962 };
 963
 964 /**
 965  * Vector Memory Image Instructions
 966  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 967  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 968  *             or VDATA - Vector GPR for write data.
 969  * Operand(2): VADDR - Address source. Can carry an offset or an index.
 970  * Definition(0): VDATA - Vector GPR for read result.
 971  *
 972  */
 973 struct MIMG_instruction : public Instruction {
 974    uint8_t dmask; /* Data VGPR enable mask */
 975    uint8_t dim : 3; /* NAVI: dimensionality */
 976    bool unrm : 1; /* Force address to be un-normalized */
 977    bool dlc : 1; /* NAVI: device level coherent */
 978    bool glc : 1; /* globally coherent */
 979    bool slc : 1; /* system level coherent */
 980    bool tfe : 1; /* texture fail enable */
 981    bool da : 1; /* declare an array */
 982    bool lwe : 1; /* Force data to be un-normalized */
 983    bool r128 : 1; /* NAVI: Texture resource size */
 984    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
 985    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
 986    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 987    bool can_reorder : 1;
 988    barrier_interaction barrier;
 989 };
 990
 991 /**
 992  * Flat/Scratch/Global Instructions
 993  * Operand(0): ADDR
 994  * Operand(1): SADDR
 995  * Operand(2) / Definition(0): DATA/VDST
 996  *
 997  */
 998 struct FLAT_instruction : public Instruction {
 999    uint16_t offset; /* Vega/Navi only */
1000    bool slc : 1; /* system level coherent */
1001    bool glc : 1; /* globally coherent */
1002    bool dlc : 1; /* NAVI: device level coherent */
1003    bool lds : 1;
1004    bool nv : 1;
1005    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1006    bool can_reorder : 1;
1007    barrier_interaction barrier;
1008 };
1009
1010 struct Export_instruction : public Instruction {
1011    uint8_t enabled_mask;
1012    uint8_t dest;
1013    bool compressed : 1;
1014    bool done : 1;
1015    bool valid_mask : 1;
1016 };
1017
1018 struct Pseudo_instruction : public Instruction {
1019    bool tmp_in_scc;
1020    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1021 };
1022
1023 struct Pseudo_branch_instruction : public Instruction {
1024    /* target[0] is the block index of the branch target.
1025     * For conditional branches, target[1] contains the fall-through alternative.
1026     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1027     */
1028    uint32_t target[2];
1029 };
1030
1031 struct Pseudo_barrier_instruction : public Instruction {
1032 };
1033
1034 enum ReduceOp {
1035    iadd32, iadd64,
1036    imul32, imul64,
1037    fadd32, fadd64,
1038    fmul32, fmul64,
1039    imin32, imin64,
1040    imax32, imax64,
1041    umin32, umin64,
1042    umax32, umax64,
1043    fmin32, fmin64,
1044    fmax32, fmax64,
1045    iand32, iand64,
1046    ior32, ior64,
1047    ixor32, ixor64,
1048    gfx10_wave64_bpermute
1049 };
1050
1051 /**
1052  * Subgroup Reduction Instructions, everything except for the data to be
1053  * reduced and the result as inserted by setup_reduce_temp().
1054  * Operand(0): data to be reduced
1055  * Operand(1): reduce temporary
1056  * Operand(2): vector temporary
1057  * Definition(0): result
1058  * Definition(1): scalar temporary
1059  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1060  * Definition(3): scc clobber
1061  * Definition(4): vcc clobber
1062  *
1063  */
1064 struct Pseudo_reduction_instruction : public Instruction {
1065    ReduceOp reduce_op;
1066    unsigned cluster_size; // must be 0 for scans
1067 };
1068
1069 struct instr_deleter_functor {
1070    void operator()(void* p) {
1071       free(p);
1072    }
1073 };
1074
1075 template<typename T>
1076 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1077
1078 template<typename T>
1079 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1080 {
1081    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1082    char *data = (char*) calloc(1, size);
1083    T* inst = (T*) data;
1084
1085    inst->opcode = opcode;
1086    inst->format = format;
1087
1088    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1089    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1090    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1091    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1092
1093    return inst;
1094 }
1095
1096 constexpr bool Instruction::usesModifiers() const noexcept
1097 {
1098    if (isDPP() || isSDWA())
1099       return true;
1100    if (!isVOP3())
1101       return false;
1102    const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1103    for (unsigned i = 0; i < operands.size(); i++) {
1104       if (vop3->abs[i] || vop3->neg[i])
1105          return true;
1106    }
1107    return vop3->opsel || vop3->clamp || vop3->omod;
1108 }
1109
1110 constexpr bool is_phi(Instruction* instr)
1111 {
1112    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1113 }
1114
1115 static inline bool is_phi(aco_ptr<Instruction>& instr)
1116 {
1117    return is_phi(instr.get());
1118 }
1119
1120 barrier_interaction get_barrier_interaction(Instruction* instr);
1121
1122 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1123
1124 enum block_kind {
1125    /* uniform indicates that leaving this block,
1126     * all actives lanes stay active */
1127    block_kind_uniform = 1 << 0,
1128    block_kind_top_level = 1 << 1,
1129    block_kind_loop_preheader = 1 << 2,
1130    block_kind_loop_header = 1 << 3,
1131    block_kind_loop_exit = 1 << 4,
1132    block_kind_continue = 1 << 5,
1133    block_kind_break = 1 << 6,
1134    block_kind_continue_or_break = 1 << 7,
1135    block_kind_discard = 1 << 8,
1136    block_kind_branch = 1 << 9,
1137    block_kind_merge = 1 << 10,
1138    block_kind_invert = 1 << 11,
1139    block_kind_uses_discard_if = 1 << 12,
1140    block_kind_needs_lowering = 1 << 13,
1141    block_kind_uses_demote = 1 << 14,
1142    block_kind_export_end = 1 << 15,
1143 };
1144
1145
1146 struct RegisterDemand {
1147    constexpr RegisterDemand() = default;
1148    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1149       : vgpr{v}, sgpr{s} {}
1150    int16_t vgpr = 0;
1151    int16_t sgpr = 0;
1152
1153    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1154       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1155    }
1156
1157    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1158       return vgpr > other.vgpr || sgpr > other.sgpr;
1159    }
1160
1161    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1162       if (t.type() == RegType::sgpr)
1163          return RegisterDemand( vgpr, sgpr + t.size() );
1164       else
1165          return RegisterDemand( vgpr + t.size(), sgpr );
1166    }
1167
1168    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1169       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1170    }
1171
1172    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1173       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1174    }
1175
1176    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1177       vgpr += other.vgpr;
1178       sgpr += other.sgpr;
1179       return *this;
1180    }
1181
1182    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1183       vgpr -= other.vgpr;
1184       sgpr -= other.sgpr;
1185       return *this;
1186    }
1187
1188    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1189       if (t.type() == RegType::sgpr)
1190          sgpr += t.size();
1191       else
1192          vgpr += t.size();
1193       return *this;
1194    }
1195
1196    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1197       if (t.type() == RegType::sgpr)
1198          sgpr -= t.size();
1199       else
1200          vgpr -= t.size();
1201       return *this;
1202    }
1203
1204    constexpr void update(const RegisterDemand other) noexcept {
1205       vgpr = std::max(vgpr, other.vgpr);
1206       sgpr = std::max(sgpr, other.sgpr);
1207    }
1208
1209 };
1210
1211 /* CFG */
1212 struct Block {
1213    float_mode fp_mode;
1214    unsigned index;
1215    unsigned offset = 0;
1216    std::vector<aco_ptr<Instruction>> instructions;
1217    std::vector<unsigned> logical_preds;
1218    std::vector<unsigned> linear_preds;
1219    std::vector<unsigned> logical_succs;
1220    std::vector<unsigned> linear_succs;
1221    RegisterDemand register_demand = RegisterDemand();
1222    uint16_t loop_nest_depth = 0;
1223    uint16_t kind = 0;
1224    int logical_idom = -1;
1225    int linear_idom = -1;
1226    Temp live_out_exec = Temp();
1227
1228    /* this information is needed for predecessors to blocks with phis when
1229     * moving out of ssa */
1230    bool scc_live_out = false;
1231    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1232
1233    Block(unsigned idx) : index(idx) {}
1234    Block() : index(0) {}
1235 };
1236
1237 using Stage = uint16_t;
1238
1239 /* software stages */
1240 static constexpr Stage sw_vs = 1 << 0;
1241 static constexpr Stage sw_gs = 1 << 1;
1242 static constexpr Stage sw_tcs = 1 << 2;
1243 static constexpr Stage sw_tes = 1 << 3;
1244 static constexpr Stage sw_fs = 1 << 4;
1245 static constexpr Stage sw_cs = 1 << 5;
1246 static constexpr Stage sw_gs_copy = 1 << 6;
1247 static constexpr Stage sw_mask = 0x7f;
1248
1249 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1250 static constexpr Stage hw_vs = 1 << 7;
1251 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1252 static constexpr Stage hw_gs = 1 << 9;
1253 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1254 static constexpr Stage hw_hs = 1 << 11;
1255 static constexpr Stage hw_fs = 1 << 12;
1256 static constexpr Stage hw_cs = 1 << 13;
1257 static constexpr Stage hw_mask = 0x7f << 7;
1258
1259 /* possible settings of Program::stage */
1260 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1261 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1262 static constexpr Stage compute_cs = sw_cs | hw_cs;
1263 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1264 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1265 /* GFX10/NGG */
1266 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1267 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1268 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1269 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1270 /* GFX9 (and GFX10 if NGG isn't used) */
1271 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1272 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1273 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1274 /* pre-GFX9 */
1275 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1276 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1277 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1278 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1279 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1280
1281 enum statistic {
1282    statistic_hash,
1283    statistic_instructions,
1284    statistic_copies,
1285    statistic_branches,
1286    statistic_cycles,
1287    statistic_vmem_clauses,
1288    statistic_smem_clauses,
1289    statistic_vmem_score,
1290    statistic_smem_score,
1291    statistic_sgpr_presched,
1292    statistic_vgpr_presched,
1293    num_statistics
1294 };
1295
1296 class Program final {
1297 public:
1298    float_mode next_fp_mode;
1299    std::vector<Block> blocks;
1300    RegisterDemand max_reg_demand = RegisterDemand();
1301    uint16_t num_waves = 0;
1302    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1303    ac_shader_config* config;
1304    struct radv_shader_info *info;
1305    enum chip_class chip_class;
1306    enum radeon_family family;
1307    unsigned wave_size;
1308    RegClass lane_mask;
1309    Stage stage; /* Stage */
1310    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1311    bool needs_wqm = false; /* there exists a p_wqm instruction */
1312    bool wb_smem_l1_on_end = false;
1313
1314    std::vector<uint8_t> constant_data;
1315    Temp private_segment_buffer;
1316    Temp scratch_offset;
1317
1318    uint16_t min_waves = 0;
1319    uint16_t lds_alloc_granule;
1320    uint32_t lds_limit; /* in bytes */
1321    bool has_16bank_lds;
1322    uint16_t vgpr_limit;
1323    uint16_t sgpr_limit;
1324    uint16_t physical_sgprs;
1325    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1326    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1327    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1328
1329    bool xnack_enabled = false;
1330
1331    bool needs_vcc = false;
1332    bool needs_flat_scr = false;
1333
1334    bool collect_statistics = false;
1335    uint32_t statistics[num_statistics];
1336
1337    uint32_t allocateId()
1338    {
1339       assert(allocationID <= 16777215);
1340       return allocationID++;
1341    }
1342
1343    uint32_t peekAllocationId()
1344    {
1345       return allocationID;
1346    }
1347
1348    void setAllocationId(uint32_t id)
1349    {
1350       allocationID = id;
1351    }
1352
1353    Block* create_and_insert_block() {
1354       blocks.emplace_back(blocks.size());
1355       blocks.back().fp_mode = next_fp_mode;
1356       return &blocks.back();
1357    }
1358
1359    Block* insert_block(Block&& block) {
1360       block.index = blocks.size();
1361       block.fp_mode = next_fp_mode;
1362       blocks.emplace_back(std::move(block));
1363       return &blocks.back();
1364    }
1365
1366 private:
1367    uint32_t allocationID = 1;
1368 };
1369
1370 struct live {
1371    /* live temps out per block */
1372    std::vector<std::set<Temp>> live_out;
1373    /* register demand (sgpr/vgpr) per instruction per block */
1374    std::vector<std::vector<RegisterDemand>> register_demand;
1375 };
1376
1377 void select_program(Program *program,
1378                     unsigned shader_count,
1379                     struct nir_shader *const *shaders,
1380                     ac_shader_config* config,
1381                     struct radv_shader_args *args);
1382 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1383                            ac_shader_config* config,
1384                            struct radv_shader_args *args);
1385
1386 void lower_wqm(Program* program, live& live_vars,
1387                const struct radv_nir_compiler_options *options);
1388 void lower_bool_phis(Program* program);
1389 void calc_min_waves(Program* program);
1390 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1391 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1392 std::vector<uint16_t> dead_code_analysis(Program *program);
1393 void dominator_tree(Program* program);
1394 void insert_exec_mask(Program *program);
1395 void value_numbering(Program* program);
1396 void optimize(Program* program);
1397 void setup_reduce_temp(Program* program);
1398 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1399 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1400 void ssa_elimination(Program* program);
1401 void lower_to_hw_instr(Program* program);
1402 void schedule_program(Program* program, live& live_vars);
1403 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1404 void insert_wait_states(Program* program);
1405 void insert_NOPs(Program* program);
1406 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1407 void print_asm(Program *program, std::vector<uint32_t>& binary,
1408                unsigned exec_size, std::ostream& out);
1409 void validate(Program* program, FILE *output);
1410 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1411 #ifndef NDEBUG
1412 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1413 #else
1414 #define perfwarn(program, cond, msg, ...) do {} while(0)
1415 #endif
1416
1417 void collect_presched_stats(Program *program);
1418 void collect_preasm_stats(Program *program);
1419 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1420
1421 void aco_print_instr(Instruction *instr, FILE *output);
1422 void aco_print_program(Program *program, FILE *output);
1423
1424 /* utilities for dealing with register demand */
1425 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1426 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1427 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1428
1429 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1430 uint16_t get_extra_sgprs(Program *program);
1431
1432 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1433 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1434 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1435
1436 /* return number of addressable sgprs/vgprs for max_waves */
1437 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1438 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1439
1440 typedef struct {
1441    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1442    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1443    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1444    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1445    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1446    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1447    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1448    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1449 } Info;
1450
1451 extern const Info instr_info;
1452
1453 }
1454
1455 #endif /* ACO_IR_H */
1456