aco: add comparison operators for PhysReg
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
/* bitmask of ACO_DEBUG options, parsed at startup (see aco_interface.cpp) —
 * TODO(review): confirm where it is initialized; only the declaration is here */
extern uint64_t debug_flags;

/* bit values for debug_flags */
enum {
   DEBUG_VALIDATE = 0x1,    /* run IR validation between passes */
   DEBUG_VALIDATE_RA = 0x2, /* validate register assignment */
   DEBUG_PERFWARN = 0x4,    /* print performance warnings */
};
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats.
    * These use distinct high bits so they can be OR'd together (see the
    * comment above): the low byte holds the base format, the high bits the
    * VALU/DPP/SDWA flavor. */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3: A/B variants share one encoding bit */
   VOP3B = 1 << 11, /* alias of VOP3 */
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Classifies how a memory instruction interacts with barriers; used by the
 * scheduler and insert_waitcnt to order instructions. Values are single bits
 * so they can be combined into a mask. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of distinct barrier bits above: 0x1..0x40 occupy bit positions
    * 0..6. The previous value of 6 missed barrier_barrier, so any array
    * sized by barrier_count and indexed by the bit position (ffs) of a
    * barrier flag was one element too small. */
   barrier_count = 7,
};
120
/* rounding modes, encoded as in the hardware MODE register */
enum fp_round {
   fp_round_ne = 0, /* nearest even */
   fp_round_pi = 1, /* towards +infinity */
   fp_round_ni = 2, /* towards -infinity */
   fp_round_tz = 3, /* towards zero */
};

/* denormal handling, encoded as in the hardware MODE register */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the raw MODE bits must match exactly and every
    * preservation/flush/rounding requirement of "other" must also be a
    * requirement of this mode. */
   bool canReplace(float_mode other) const noexcept {
      if (val != other.val)
         return false;
      if (other.preserve_signed_zero_inf_nan32 && !preserve_signed_zero_inf_nan32)
         return false;
      if (other.preserve_signed_zero_inf_nan16_64 && !preserve_signed_zero_inf_nan16_64)
         return false;
      if (other.must_flush_denorms32 && !must_flush_denorms32)
         return false;
      if (other.must_flush_denorms16_64 && !must_flush_denorms16_64)
         return false;
      if (other.care_about_round32 && !care_about_round32)
         return false;
      if (other.care_about_round16_64 && !care_about_round16_64)
         return false;
      return true;
   }
};
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

/* Register class: the type and size (in dwords) of a value.
 * Encoding: bits 0..4 hold the size, bit 5 marks VGPRs and bit 6 marks
 * linear (whole-wave) VGPRs. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = 1 | (1 << 5),
      v2 = 2 | (1 << 5),
      v3 = 3 | (1 << 5),
      v4 = 4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc(static_cast<RC>((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc > RC::s16 ? RegType::vgpr : RegType::sgpr; }
   constexpr unsigned size() const { return static_cast<unsigned>(rc) & 0x1Fu; }
   /* SGPRs are always linear; VGPRs only when the linear bit is set */
   constexpr bool is_linear() const { return type() == RegType::sgpr || (rc & (1 << 6)); }
   constexpr RegClass as_linear() const { return RegClass(static_cast<RC>(rc | (1 << 6))); }

private:
   RC rc;
};
219
220 /* transitional helper expressions */
/* transitional helper expressions: shorthands so code can write `s2` instead
 * of RegClass::s2 (note: RegClass::s6 has no shorthand here) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
235
236 /**
237 * Temp Class
238 * Each temporary virtual register has a
239 * register class (i.e. size and type)
240 * and SSA id.
241 */
242 struct Temp {
243 Temp() = default;
244 constexpr Temp(uint32_t id, RegClass cls) noexcept
245 : id_(id), reg_class(cls) {}
246
247 constexpr uint32_t id() const noexcept { return id_; }
248 constexpr RegClass regClass() const noexcept { return reg_class; }
249
250 constexpr unsigned size() const noexcept { return reg_class.size(); }
251 constexpr RegType type() const noexcept { return reg_class.type(); }
252 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
253
254 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
255 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
256 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
257
258 private:
259 uint32_t id_:24;
260 RegClass reg_class;
261 };
262
263 /**
264 * PhysReg
265 * Represents the physical register for each
266 * Operand and Definition.
267 */
/* A physical register location, stored in units of bytes so that sub-dword
 * positions can be represented: reg_b = register index * 4 + byte offset. */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4u) {}
   /* index of the 32-bit register */
   constexpr unsigned reg() const { return reg_b / 4u; }
   /* byte offset within that register */
   constexpr unsigned byte() const { return reg_b % 4u; }
   constexpr operator unsigned() const { return reg(); }
   /* compare the full byte address, so registers differing only in byte()
    * compare unequal */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return !(*this == other); }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
280
281 /* helper expressions for special registers */
/* helper expressions for special registers — values follow the AMD scalar
 * operand numbering (NOTE(review): confirm per-generation validity against
 * the relevant ISA manual; sgpr_null is already marked GFX10+) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};   /* alias of exec (low half) */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
292
293 /**
294 * Operand Class
295 * Initially, each Operand refers to either
296 * a temporary virtual register
297 * or to a constant value
298 * Temporary registers get mapped to physical register during RA
299 * Constant values are inlined into the instruction sequence.
300 */
class Operand final
{
public:
   /* Default: an undefined operand, parked at operand encoding 128
    * (the inline constant 0). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* Operand referring to a temporary; id 0 creates an undefined operand. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant operand. Values with a hardware inline-constant
    * encoding are fixed to it: 128+v for 0..64, 193..208 for -1..-16,
    * 240..247 for +-0.5/1.0/2.0/4.0. Anything else becomes a literal
    * (encoding 255) emitted as an extra dword, which is not possible
    * for 64-bit constants. */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v}); /* unsigned wrap-around yields 193..208 */
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant operand. Only values with an inline encoding are
    * representable; data_.i keeps the equivalent 32-bit pattern so that
    * constantValue() stays meaningful. The double bit patterns below map
    * to the same inline-constant slots 240..247 as their float versions. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a physical register (e.g. exec, vcc, m0). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   /* only meaningful when isTemp() or isUndefined() */
   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   /* only meaningful when hasRegClass() */
   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* NOTE(review): PhysReg converts to unsigned via reg() (16-bit range),
    * so `reg != unsigned(-1)` can never be false and isFixed_ is always
    * set to true here — confirm whether the sentinel is still intended */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant that did not fit an inline encoding (encoding 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   /* low 32 bits of the constant (see the constructors) */
   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Reconstructs the full 64-bit value of a 64-bit inline constant from its
    * operand encoding; for 32-bit constants, optionally sign-extends. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128; /* inline 0..64 */
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); /* inline -1..-16 */

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* implies isKill() */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* Structural equality: same size/fixedness/kill-before-def, and the same
    * literal value, inline-constant register, regclass or temporary. */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1); /* also stores the regclass of undefined operands */
   } data_;
   PhysReg reg_; /* doubles as the constant encoding for constants */
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
598
599 /**
600 * Definition Class
601 * Definitions are the results of Instructions
602 * and refer to temporary virtual registers
603 * which are later mapped to physical registers
604 */
605 class Definition final
606 {
607 public:
608 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
609 Definition(uint32_t index, RegClass type) noexcept
610 : temp(index, type) {}
611 explicit Definition(Temp tmp) noexcept
612 : temp(tmp) {}
613 Definition(PhysReg reg, RegClass type) noexcept
614 : temp(Temp(0, type))
615 {
616 setFixed(reg);
617 }
618 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
619 : temp(Temp(tmpId, type))
620 {
621 setFixed(reg);
622 }
623
624 constexpr bool isTemp() const noexcept
625 {
626 return tempId() > 0;
627 }
628
629 constexpr Temp getTemp() const noexcept
630 {
631 return temp;
632 }
633
634 constexpr uint32_t tempId() const noexcept
635 {
636 return temp.id();
637 }
638
639 constexpr void setTemp(Temp t) noexcept {
640 temp = t;
641 }
642
643 constexpr RegClass regClass() const noexcept
644 {
645 return temp.regClass();
646 }
647
648 constexpr unsigned size() const noexcept
649 {
650 return temp.size();
651 }
652
653 constexpr bool isFixed() const noexcept
654 {
655 return isFixed_;
656 }
657
658 constexpr PhysReg physReg() const noexcept
659 {
660 return reg_;
661 }
662
663 constexpr void setFixed(PhysReg reg) noexcept
664 {
665 isFixed_ = 1;
666 reg_ = reg;
667 }
668
669 constexpr void setHint(PhysReg reg) noexcept
670 {
671 hasHint_ = 1;
672 reg_ = reg;
673 }
674
675 constexpr bool hasHint() const noexcept
676 {
677 return hasHint_;
678 }
679
680 constexpr void setKill(bool flag) noexcept
681 {
682 isKill_ = flag;
683 }
684
685 constexpr bool isKill() const noexcept
686 {
687 return isKill_;
688 }
689
690 private:
691 Temp temp = Temp(0, s1);
692 PhysReg reg_;
693 union {
694 struct {
695 uint8_t isFixed_:1;
696 uint8_t hasHint_:1;
697 uint8_t isKill_:1;
698 };
699 /* can't initialize bit-fields in c++11, so work around using a union */
700 uint8_t control_ = 0;
701 };
702 };
703
704 class Block;
705
706 struct Instruction {
707 aco_opcode opcode;
708 Format format;
709 uint32_t pass_flags;
710
711 aco::span<Operand> operands;
712 aco::span<Definition> definitions;
713
714 constexpr bool isVALU() const noexcept
715 {
716 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
717 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
718 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
719 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
720 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
721 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
722 }
723
724 constexpr bool isSALU() const noexcept
725 {
726 return format == Format::SOP1 ||
727 format == Format::SOP2 ||
728 format == Format::SOPC ||
729 format == Format::SOPK ||
730 format == Format::SOPP;
731 }
732
733 constexpr bool isVMEM() const noexcept
734 {
735 return format == Format::MTBUF ||
736 format == Format::MUBUF ||
737 format == Format::MIMG;
738 }
739
740 constexpr bool isDPP() const noexcept
741 {
742 return (uint16_t) format & (uint16_t) Format::DPP;
743 }
744
745 constexpr bool isVOP3() const noexcept
746 {
747 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
748 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
749 format == Format::VOP3P;
750 }
751
752 constexpr bool isSDWA() const noexcept
753 {
754 return (uint16_t) format & (uint16_t) Format::SDWA;
755 }
756
757 constexpr bool isFlatOrGlobal() const noexcept
758 {
759 return format == Format::FLAT || format == Format::GLOBAL;
760 }
761
762 constexpr bool usesModifiers() const noexcept;
763
764 constexpr bool reads_exec() const noexcept
765 {
766 for (const Operand& op : operands) {
767 if (op.isFixed() && op.physReg() == exec)
768 return true;
769 }
770 return false;
771 }
772 };
773
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* the 16-bit inline constant of the SOPK encoding */
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* target block index for branches — TODO(review): confirm the
               * sentinel used when no block is attached */
};

/* the following SALU formats carry no fields beyond the base Instruction */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
791
792 /**
793 * Scalar Memory Format:
794 * For s_(buffer_)load_dword*:
795 * Operand(0): SBASE - SGPR-pair which provides base address
796 * Operand(1): Offset - immediate (un)signed offset or SGPR
797 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
798 * Operand(n-1): SOffset - SGPR offset (Vega only)
799 *
800 * Having no operands is also valid for instructions such as s_dcache_inv.
801 *
802 */
struct SMEM_instruction : public Instruction {
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1; /* presumably: safe to reorder against other memory ops — confirm with scheduler */
   bool disable_wqm : 1;
   barrier_interaction barrier; /* which barrier kinds this access interacts with */
};
811
/* plain VOP1/VOP2/VOPC carry no fields beyond the base Instruction */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};

/* VOP3 encoding: per-source input modifiers plus output modifiers */
struct VOP3A_instruction : public Instruction {
   bool abs[3];          /* per-source absolute-value modifier */
   bool neg[3];          /* per-source negate modifier */
   uint8_t opsel : 4;    /* operand select bits — NOTE(review): confirm half-selection semantics against the ISA docs */
   uint8_t omod : 2;     /* output modifier field */
   bool clamp : 1;
};
828
829 /**
830 * Data Parallel Primitives Format:
831 * This format can be used for VOP1, VOP2 or VOPC instructions.
832 * The swizzle applies to the src0 operand.
833 *
834 */
struct DPP_instruction : public Instruction {
   bool abs[2]; /* per-source absolute-value modifier */
   bool neg[2]; /* per-source negate modifier */
   uint16_t dpp_ctrl;      /* hardware DPP control word selecting the swizzle */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
};
843
/* VINTRP: parameter interpolation */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index to interpolate */
   uint8_t component; /* component (x/y/z/w) of the attribute */
};
848
849 /**
850 * Local and Global Data Sharing instructions
851 * Operand(0): ADDR - VGPR which supplies the address.
852 * Operand(1): DATA0 - First data VGPR.
853 * Operand(2): DATA1 - Second data VGPR.
854 * Operand(n-1): M0 - LDS size.
855 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
856 *
857 */
struct DS_instruction : public Instruction {
   int16_t offset0; /* NOTE(review): hardware DS offsets are unsigned; confirm
                     * why these are stored signed */
   int8_t offset1;
   bool gds; /* operate on GDS instead of LDS */
};
863
864 /**
865 * Vector Memory Untyped-buffer Instructions
866 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
867 * Operand(1): VADDR - Address source. Can carry an index and/or offset
868 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
869 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
870 *
871 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: safe to reorder against other memory ops — confirm with scheduler */
   barrier_interaction barrier; /* which barrier kinds this access interacts with */
};
886
887 /**
888 * Vector Memory Typed-buffer Instructions
889 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
890 * Operand(1): VADDR - Address source. Can carry an index and/or offset
891 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
892 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
893 *
894 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit. NOTE(review): unlike
                     * MUBUF this is not declared as a 12-bit bit-field —
                     * confirm whether that is intentional */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: safe to reorder against other memory ops */
   barrier_interaction barrier; /* which barrier kinds this access interacts with */
};
909
910 /**
911 * Vector Memory Image Instructions
912 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
913 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
914 * or VDATA - Vector GPR for write data.
915 * Operand(2): VADDR - Address source. Can carry an offset or an index.
916 * Definition(0): VDATA - Vector GPR for read result.
917 *
918 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable — NOTE(review): the original comment
                  * duplicated unrm's text; confirm against the ISA docs */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: safe to reorder against other memory ops */
   barrier_interaction barrier; /* which barrier kinds this access interacts with */
};
936
937 /**
938 * Flat/Scratch/Global Instructions
939 * Operand(0): ADDR
940 * Operand(1): SADDR
941 * Operand(2) / Definition(0): DATA/VDST
942 *
943 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* presumably: return read-data to LDS — confirm */
   bool nv : 1;  /* presumably: non-volatile — confirm */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: safe to reorder against other memory ops */
   barrier_interaction barrier; /* which barrier kinds this access interacts with */
};
955
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* mask of enabled components */
   uint8_t dest;         /* export target */
   bool compressed : 1;  /* data is 16-bit compressed */
   bool done : 1;        /* this is the final export of the shader */
   bool valid_mask : 1;
};
963
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc; /* whether a temporary lives in scc while lowering this */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
968
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
976
/* no fields beyond the base Instruction */
struct Pseudo_barrier_instruction : public Instruction {
};
979
/* operations performed by Pseudo_reduction_instruction; each comes in a
 * 32-bit and 64-bit variant */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: GFX10 wave64 bpermute workaround */
};
996
997 /**
998 * Subgroup Reduction Instructions, everything except for the data to be
999 * reduced and the result as inserted by setup_reduce_temp().
1000 * Operand(0): data to be reduced
1001 * Operand(1): reduce temporary
1002 * Operand(2): vector temporary
1003 * Definition(0): result
1004 * Definition(1): scalar temporary
1005 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1006 * Definition(3): scc clobber
1007 * Definition(4): vcc clobber
1008 *
1009 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
1014
/* Instructions are allocated with calloc() (see create_instruction), so the
 * matching deallocation is free() rather than operator delete. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};
1020
/* owning pointer whose deleter calls free(), matching create_instruction's calloc() */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1023
/* Allocates an instruction of derived type T with its Operand and Definition
 * arrays placed directly after it in a single zeroed allocation; the whole
 * chunk is released at once by instr_deleter_functor. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* the spans apparently store a 16-bit offset relative to their own
    * address rather than a raw pointer — confirm against aco_util.h */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1041
/* Whether this instruction uses any VALU modifiers (DPP, SDWA, abs/neg,
 * opsel, clamp or omod). */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   /* NOTE(review): isVOP3() also matches VOP3P, which is cast to
    * VOP3A_instruction here; additionally abs[]/neg[] have 3 entries while
    * the loop runs over operands.size() — confirm both bounds hold for all
    * instructions reaching this point */
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel || vop3->clamp || vop3->omod;
}
1055
1056 constexpr bool is_phi(Instruction* instr)
1057 {
1058 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1059 }
1060
1061 static inline bool is_phi(aco_ptr<Instruction>& instr)
1062 {
1063 return is_phi(instr.get());
1064 }
1065
1066 barrier_interaction get_barrier_interaction(Instruction* instr);
1067
1068 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1069
/* bit flags describing a basic block's role in the CFG */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,   /* divergent branch block */
   block_kind_merge = 1 << 10,   /* re-convergence point */
   block_kind_invert = 1 << 11,  /* else-side of a divergent if */
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1090
1091
1092 struct RegisterDemand {
1093 constexpr RegisterDemand() = default;
1094 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1095 : vgpr{v}, sgpr{s} {}
1096 int16_t vgpr = 0;
1097 int16_t sgpr = 0;
1098
1099 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1100 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1101 }
1102
1103 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1104 return vgpr > other.vgpr || sgpr > other.sgpr;
1105 }
1106
1107 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1108 if (t.type() == RegType::sgpr)
1109 return RegisterDemand( vgpr, sgpr + t.size() );
1110 else
1111 return RegisterDemand( vgpr + t.size(), sgpr );
1112 }
1113
1114 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1115 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1116 }
1117
1118 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1119 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1120 }
1121
1122 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1123 vgpr += other.vgpr;
1124 sgpr += other.sgpr;
1125 return *this;
1126 }
1127
1128 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1129 vgpr -= other.vgpr;
1130 sgpr -= other.sgpr;
1131 return *this;
1132 }
1133
1134 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1135 if (t.type() == RegType::sgpr)
1136 sgpr += t.size();
1137 else
1138 vgpr += t.size();
1139 return *this;
1140 }
1141
1142 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1143 if (t.type() == RegType::sgpr)
1144 sgpr -= t.size();
1145 else
1146 vgpr -= t.size();
1147 return *this;
1148 }
1149
1150 constexpr void update(const RegisterDemand other) noexcept {
1151 vgpr = std::max(vgpr, other.vgpr);
1152 sgpr = std::max(sgpr, other.sgpr);
1153 }
1154
1155 };
1156
/* CFG: a basic block of the program. The IR keeps two parallel CFGs, a
 * "logical" one and a "linear" one — presumably logical edges follow the
 * source-level control flow while linear edges follow the machine-level
 * (exec-masked) flow; confirm against the divergence handling passes. */
struct Block {
   float_mode fp_mode;
   unsigned index;                 /* position of this block in Program::blocks */
   unsigned offset = 0;            /* presumably the block's offset in the emitted binary — TODO confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;   /* indices of predecessor blocks */
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;   /* indices of successor blocks */
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* peak register demand within this block */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;              /* bitmask of block_kind flags */
   int logical_idom = -1;          /* immediate dominator block index; -1 if not computed */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1182
/* A shader stage is the combination of the software stage(s) it implements
 * and the hardware stage it runs as; both halves are packed into one
 * bitmask: sw bits occupy [0:6] (sw_mask == 0x7f), hw bits [7:13]. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9;
static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 11;
static constexpr Stage hw_fs = 1 << 12;
static constexpr Stage hw_cs = 1 << 13;
static constexpr Stage hw_mask = 0x7f << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1226
/* Indices into Program::statistics; see Program::collect_statistics and
 * the collect_*_stats() helpers below. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics /* keep last: sizes the Program::statistics array */
};
1241
/* The compilation unit: the CFG together with the target information and
 * per-program state shared by all compilation passes. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to blocks created from here on */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   /* register/LDS limits of the target; filled in externally */
   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* when set, passes record into statistics[] */
   uint32_t statistics[num_statistics];

   /* Hands out a fresh temporary id and advances the counter.
    * 16777215 == 2^24 - 1: ids must stay within 24 bits, presumably
    * because they are packed into 24-bit fields elsewhere — TODO confirm. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would hand out,
    * without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Overrides the id counter; the caller is responsible for not
    * creating duplicate ids. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new block whose index is its position in 'blocks' and which
    * inherits next_fp_mode. NOTE: the returned pointer is invalidated when
    * 'blocks' reallocates. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves an existing block to the end of the CFG, re-indexing it and
    * assigning next_fp_mode. Same pointer-invalidation caveat as above. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next free temporary id; numbering starts at 1 */
};
1315
/* Result of live_var_analysis(); presumably the outer vectors are indexed
 * by block index — TODO confirm against the analysis pass. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1322
/* instruction selection: build the ACO IR from NIR */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* compilation passes, roughly in pipeline order */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
/* NOTE(review): the NDEBUG stub takes (program, cond, msg, ...) while the
 * debug declaration takes (cond, msg, instr) — the stub discards all
 * arguments so calls compile either way, but verify call sites match the
 * debug signature. */
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* statistics collection (see enum statistic / Program::statistics) */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

/* IR printing for debugging */
void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1385
/* Per-opcode information tables, each indexed by aco_opcode.
 * The opcode_gfx* arrays presumably hold the hardware encoding of each
 * opcode per GPU generation — TODO confirm against the opcode generator. */
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
1396
1397 extern const Info instr_info;
1398
1399 }
1400
1401 #endif /* ACO_IR_H */
1402