src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <bitset>
  31 #include <memory>
  32
  33 #include "nir.h"
  34 #include "ac_binary.h"
  35 #include "amd_family.h"
  36 #include "aco_opcodes.h"
  37 #include "aco_util.h"
  38
  39 struct radv_nir_compiler_options;
  40 struct radv_shader_args;
  41 struct radv_shader_info;
  42
  43 namespace aco {
  44
  45 extern uint64_t debug_flags;
  46
  47 enum {
  48    DEBUG_VALIDATE = 0x1,
  49    DEBUG_VALIDATE_RA = 0x2,
  50    DEBUG_PERFWARN = 0x4,
  51 };
  52
  53 /**
  54  * Representation of the instruction's microcode encoding format
  55  * Note: Some Vector ALU Formats can be combined, such that:
  56  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  57  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  58  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  59  *
  60  * (*) The same is applicable for VOP1 and VOPC instructions.
  61  */
  62 enum class Format : std::uint16_t {
  63    /* Pseudo Instruction Format */
  64    PSEUDO = 0,
  65    /* Scalar ALU & Control Formats */
  66    SOP1 = 1,
  67    SOP2 = 2,
  68    SOPK = 3,
  69    SOPP = 4,
  70    SOPC = 5,
  71    /* Scalar Memory Format */
  72    SMEM = 6,
  73    /* LDS/GDS Format */
  74    DS = 8,
  75    /* Vector Memory Buffer Formats */
  76    MTBUF = 9,
  77    MUBUF = 10,
  78    /* Vector Memory Image Format */
  79    MIMG = 11,
  80    /* Export Format */
  81    EXP = 12,
  82    /* Flat Formats */
  83    FLAT = 13,
  84    GLOBAL = 14,
  85    SCRATCH = 15,
  86
  87    PSEUDO_BRANCH = 16,
  88    PSEUDO_BARRIER = 17,
  89    PSEUDO_REDUCTION = 18,
  90
  91    /* Vector ALU Formats */
  92    VOP1 = 1 << 8,
  93    VOP2 = 1 << 9,
  94    VOPC = 1 << 10,
  95    VOP3 = 1 << 11,
  96    VOP3A = 1 << 11,
  97    VOP3B = 1 << 11,
  98    VOP3P = 1 << 12,
  99    /* Vector Parameter Interpolation Format */
 100    VINTRP = 1 << 13,
 101    DPP = 1 << 14,
 102    SDWA = 1 << 15,
 103 };
 104
 105 enum barrier_interaction : uint8_t {
 106    barrier_none = 0,
 107    barrier_buffer = 0x1,
 108    barrier_image = 0x2,
 109    barrier_atomic = 0x4,
 110    barrier_shared = 0x8,
 111    /* used for geometry shaders to ensure vertex data writes are before the
 112     * GS_DONE s_sendmsg. */
 113    barrier_gs_data = 0x10,
 114    /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
 115    barrier_gs_sendmsg = 0x20,
 116    /* used by barriers. created by s_barrier */
 117    barrier_barrier = 0x40,
 118    barrier_count = 6,
 119 };
 120
 121 enum fp_round {
 122    fp_round_ne = 0,
 123    fp_round_pi = 1,
 124    fp_round_ni = 2,
 125    fp_round_tz = 3,
 126 };
 127
 128 enum fp_denorm {
 129    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
 130     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
 131    fp_denorm_flush = 0x0,
 132    fp_denorm_keep = 0x3,
 133 };
 134
 135 struct float_mode {
 136    /* matches encoding of the MODE register */
 137    union {
 138       struct {
 139           fp_round round32:2;
 140           fp_round round16_64:2;
 141           unsigned denorm32:2;
 142           unsigned denorm16_64:2;
 143       };
 144       uint8_t val = 0;
 145    };
 146    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
 147    bool preserve_signed_zero_inf_nan32:1;
 148    bool preserve_signed_zero_inf_nan16_64:1;
 149    /* if false, optimizations which may remove denormal flushing can be done */
 150    bool must_flush_denorms32:1;
 151    bool must_flush_denorms16_64:1;
 152    bool care_about_round32:1;
 153    bool care_about_round16_64:1;
 154
 155    /* Returns true if instructions using the mode "other" can safely use the
 156     * current one instead. */
 157    bool canReplace(float_mode other) const noexcept {
 158       return val == other.val &&
 159              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
 160              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
 161              (must_flush_denorms32  || !other.must_flush_denorms32) &&
 162              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
 163              (care_about_round32 || !other.care_about_round32) &&
 164              (care_about_round16_64 || !other.care_about_round16_64);
 165    }
 166 };
 167
 168 constexpr Format asVOP3(Format format) {
 169    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 170 };
 171
 172 enum class RegType {
 173    none = 0,
 174    sgpr,
 175    vgpr,
 176    linear_vgpr,
 177 };
 178
 179 struct RegClass {
 180
 181    enum RC : uint8_t {
 182       s1 = 1,
 183       s2 = 2,
 184       s3 = 3,
 185       s4 = 4,
 186       s6 = 6,
 187       s8 = 8,
 188       s16 = 16,
 189       v1 = s1 | (1 << 5),
 190       v2 = s2 | (1 << 5),
 191       v3 = s3 | (1 << 5),
 192       v4 = s4 | (1 << 5),
 193       v5 = 5  | (1 << 5),
 194       v6 = 6  | (1 << 5),
 195       v7 = 7  | (1 << 5),
 196       v8 = 8  | (1 << 5),
 197       /* these are used for WWM and spills to vgpr */
 198       v1_linear = v1 | (1 << 6),
 199       v2_linear = v2 | (1 << 6),
 200    };
 201
 202    RegClass() = default;
 203    constexpr RegClass(RC rc)
 204       : rc(rc) {}
 205    constexpr RegClass(RegType type, unsigned size)
 206       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 207
 208    constexpr operator RC() const { return rc; }
 209    explicit operator bool() = delete;
 210
 211    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 212    constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
 213    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 214    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 215
 216 private:
 217    RC rc;
 218 };
 219
 220 /* transitional helper expressions */
 221 static constexpr RegClass s1{RegClass::s1};
 222 static constexpr RegClass s2{RegClass::s2};
 223 static constexpr RegClass s3{RegClass::s3};
 224 static constexpr RegClass s4{RegClass::s4};
 225 static constexpr RegClass s8{RegClass::s8};
 226 static constexpr RegClass s16{RegClass::s16};
 227 static constexpr RegClass v1{RegClass::v1};
 228 static constexpr RegClass v2{RegClass::v2};
 229 static constexpr RegClass v3{RegClass::v3};
 230 static constexpr RegClass v4{RegClass::v4};
 231 static constexpr RegClass v5{RegClass::v5};
 232 static constexpr RegClass v6{RegClass::v6};
 233 static constexpr RegClass v7{RegClass::v7};
 234 static constexpr RegClass v8{RegClass::v8};
 235
 236 /**
 237  * Temp Class
 238  * Each temporary virtual register has a
 239  * register class (i.e. size and type)
 240  * and SSA id.
 241  */
 242 struct Temp {
 243    Temp() = default;
 244    constexpr Temp(uint32_t id, RegClass cls) noexcept
 245       : id_(id), reg_class(cls) {}
 246
 247    constexpr uint32_t id() const noexcept { return id_; }
 248    constexpr RegClass regClass() const noexcept { return reg_class; }
 249
 250    constexpr unsigned size() const noexcept { return reg_class.size(); }
 251    constexpr RegType type() const noexcept { return reg_class.type(); }
 252    constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
 253
 254    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 255    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 256    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 257
 258 private:
 259    uint32_t id_:24;
 260    RegClass reg_class;
 261 };
 262
 263 /**
 264  * PhysReg
 265  * Represents the physical register for each
 266  * Operand and Definition.
 267  */
 268 struct PhysReg {
 269    constexpr PhysReg() = default;
 270    explicit constexpr PhysReg(unsigned r) : reg(r) {}
 271    constexpr operator unsigned() const { return reg; }
 272
 273    uint16_t reg = 0;
 274 };
 275
 276 /* helper expressions for special registers */
 277 static constexpr PhysReg m0{124};
 278 static constexpr PhysReg vcc{106};
 279 static constexpr PhysReg vcc_hi{107};
 280 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 281 static constexpr PhysReg exec{126};
 282 static constexpr PhysReg exec_lo{126};
 283 static constexpr PhysReg exec_hi{127};
 284 static constexpr PhysReg vccz{251};
 285 static constexpr PhysReg execz{252};
 286 static constexpr PhysReg scc{253};
 287
 288 /**
 289  * Operand Class
 290  * Initially, each Operand refers to either
 291  * a temporary virtual register
 292  * or to a constant value
 293  * Temporary registers get mapped to physical register during RA
 294  * Constant values are inlined into the instruction sequence.
 295  */
 296 class Operand final
 297 {
 298 public:
 299    constexpr Operand()
 300       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 301         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
 302         isLateKill_(false) {}
 303
 304    explicit Operand(Temp r) noexcept
 305    {
 306       data_.temp = r;
 307       if (r.id()) {
 308          isTemp_ = true;
 309       } else {
 310          isUndef_ = true;
 311          setFixed(PhysReg{128});
 312       }
 313    };
 314    explicit Operand(uint32_t v, bool is64bit = false) noexcept
 315    {
 316       data_.i = v;
 317       isConstant_ = true;
 318       is64BitConst_ = is64bit;
 319       if (v <= 64)
 320          setFixed(PhysReg{128 + v});
 321       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 322          setFixed(PhysReg{192 - v});
 323       else if (v == 0x3f000000) /* 0.5 */
 324          setFixed(PhysReg{240});
 325       else if (v == 0xbf000000) /* -0.5 */
 326          setFixed(PhysReg{241});
 327       else if (v == 0x3f800000) /* 1.0 */
 328          setFixed(PhysReg{242});
 329       else if (v == 0xbf800000) /* -1.0 */
 330          setFixed(PhysReg{243});
 331       else if (v == 0x40000000) /* 2.0 */
 332          setFixed(PhysReg{244});
 333       else if (v == 0xc0000000) /* -2.0 */
 334          setFixed(PhysReg{245});
 335       else if (v == 0x40800000) /* 4.0 */
 336          setFixed(PhysReg{246});
 337       else if (v == 0xc0800000) /* -4.0 */
 338          setFixed(PhysReg{247});
 339       else { /* Literal Constant */
 340          assert(!is64bit && "attempt to create a 64-bit literal constant");
 341          setFixed(PhysReg{255});
 342       }
 343    };
 344    explicit Operand(uint64_t v) noexcept
 345    {
 346       isConstant_ = true;
 347       is64BitConst_ = true;
 348       if (v <= 64) {
 349          data_.i = (uint32_t) v;
 350          setFixed(PhysReg{128 + (uint32_t) v});
 351       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
 352          data_.i = (uint32_t) v;
 353          setFixed(PhysReg{192 - (uint32_t) v});
 354       } else if (v == 0x3FE0000000000000) { /* 0.5 */
 355          data_.i = 0x3f000000;
 356          setFixed(PhysReg{240});
 357       } else if (v == 0xBFE0000000000000) { /* -0.5 */
 358          data_.i = 0xbf000000;
 359          setFixed(PhysReg{241});
 360       } else if (v == 0x3FF0000000000000) { /* 1.0 */
 361          data_.i = 0x3f800000;
 362          setFixed(PhysReg{242});
 363       } else if (v == 0xBFF0000000000000) { /* -1.0 */
 364          data_.i = 0xbf800000;
 365          setFixed(PhysReg{243});
 366       } else if (v == 0x4000000000000000) { /* 2.0 */
 367          data_.i = 0x40000000;
 368          setFixed(PhysReg{244});
 369       } else if (v == 0xC000000000000000) { /* -2.0 */
 370          data_.i = 0xc0000000;
 371          setFixed(PhysReg{245});
 372       } else if (v == 0x4010000000000000) { /* 4.0 */
 373          data_.i = 0x40800000;
 374          setFixed(PhysReg{246});
 375       } else if (v == 0xC010000000000000) { /* -4.0 */
 376          data_.i = 0xc0800000;
 377          setFixed(PhysReg{247});
 378       } else { /* Literal Constant: we don't know if it is a long or double.*/
 379          isConstant_ = 0;
 380          assert(false && "attempt to create a 64-bit literal constant");
 381       }
 382    };
 383    explicit Operand(RegClass type) noexcept
 384    {
 385       isUndef_ = true;
 386       data_.temp = Temp(0, type);
 387       setFixed(PhysReg{128});
 388    };
 389    explicit Operand(PhysReg reg, RegClass type) noexcept
 390    {
 391       data_.temp = Temp(0, type);
 392       setFixed(reg);
 393    }
 394
 395    constexpr bool isTemp() const noexcept
 396    {
 397       return isTemp_;
 398    }
 399
 400    constexpr void setTemp(Temp t) noexcept {
 401       assert(!isConstant_);
 402       isTemp_ = true;
 403       data_.temp = t;
 404    }
 405
 406    constexpr Temp getTemp() const noexcept
 407    {
 408       return data_.temp;
 409    }
 410
 411    constexpr uint32_t tempId() const noexcept
 412    {
 413       return data_.temp.id();
 414    }
 415
 416    constexpr bool hasRegClass() const noexcept
 417    {
 418       return isTemp() || isUndefined();
 419    }
 420
 421    constexpr RegClass regClass() const noexcept
 422    {
 423       return data_.temp.regClass();
 424    }
 425
 426    constexpr unsigned size() const noexcept
 427    {
 428       if (isConstant())
 429          return is64BitConst_ ? 2 : 1;
 430       else
 431          return data_.temp.size();
 432    }
 433
 434    constexpr bool isFixed() const noexcept
 435    {
 436       return isFixed_;
 437    }
 438
 439    constexpr PhysReg physReg() const noexcept
 440    {
 441       return reg_;
 442    }
 443
 444    constexpr void setFixed(PhysReg reg) noexcept
 445    {
 446       isFixed_ = reg != unsigned(-1);
 447       reg_ = reg;
 448    }
 449
 450    constexpr bool isConstant() const noexcept
 451    {
 452       return isConstant_;
 453    }
 454
 455    constexpr bool isLiteral() const noexcept
 456    {
 457       return isConstant() && reg_ == 255;
 458    }
 459
 460    constexpr bool isUndefined() const noexcept
 461    {
 462       return isUndef_;
 463    }
 464
 465    constexpr uint32_t constantValue() const noexcept
 466    {
 467       return data_.i;
 468    }
 469
 470    constexpr bool constantEquals(uint32_t cmp) const noexcept
 471    {
 472       return isConstant() && constantValue() == cmp;
 473    }
 474
 475    constexpr uint64_t constantValue64(bool signext=false) const noexcept
 476    {
 477       if (is64BitConst_) {
 478          if (reg_.reg <= 192)
 479             return reg_.reg - 128;
 480          else if (reg_.reg <= 208)
 481             return 0xFFFFFFFFFFFFFFFF - (reg_.reg - 193);
 482
 483          switch (reg_.reg) {
 484          case 240:
 485             return 0x3FE0000000000000;
 486          case 241:
 487             return 0xBFE0000000000000;
 488          case 242:
 489             return 0x3FF0000000000000;
 490          case 243:
 491             return 0xBFF0000000000000;
 492          case 244:
 493             return 0x4000000000000000;
 494          case 245:
 495             return 0xC000000000000000;
 496          case 246:
 497             return 0x4010000000000000;
 498          case 247:
 499             return 0xC010000000000000;
 500          }
 501       }
 502       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
 503    }
 504
 505    /* Indicates that the killed operand's live range intersects with the
 506     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
 507     * not set by liveness analysis. */
 508    constexpr void setLateKill(bool flag) noexcept
 509    {
 510       isLateKill_ = flag;
 511    }
 512
 513    constexpr bool isLateKill() const noexcept
 514    {
 515       return isLateKill_;
 516    }
 517
 518    constexpr void setKill(bool flag) noexcept
 519    {
 520       isKill_ = flag;
 521       if (!flag)
 522          setFirstKill(false);
 523    }
 524
 525    constexpr bool isKill() const noexcept
 526    {
 527       return isKill_ || isFirstKill();
 528    }
 529
 530    constexpr void setFirstKill(bool flag) noexcept
 531    {
 532       isFirstKill_ = flag;
 533       if (flag)
 534          setKill(flag);
 535    }
 536
 537    /* When there are multiple operands killing the same temporary,
 538     * isFirstKill() is only returns true for the first one. */
 539    constexpr bool isFirstKill() const noexcept
 540    {
 541       return isFirstKill_;
 542    }
 543
 544    constexpr bool isKillBeforeDef() const noexcept
 545    {
 546       return isKill() && !isLateKill();
 547    }
 548
 549    constexpr bool isFirstKillBeforeDef() const noexcept
 550    {
 551       return isFirstKill() && !isLateKill();
 552    }
 553
 554    constexpr bool operator == (Operand other) const noexcept
 555    {
 556       if (other.size() != size())
 557          return false;
 558       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
 559          return false;
 560       if (isFixed() && other.isFixed() && physReg() != other.physReg())
 561          return false;
 562       if (isLiteral())
 563          return other.isLiteral() && other.constantValue() == constantValue();
 564       else if (isConstant())
 565          return other.isConstant() && other.physReg() == physReg();
 566       else if (isUndefined())
 567          return other.isUndefined() && other.regClass() == regClass();
 568       else
 569          return other.isTemp() && other.getTemp() == getTemp();
 570    }
 571 private:
 572    union {
 573       uint32_t i;
 574       float f;
 575       Temp temp = Temp(0, s1);
 576    } data_;
 577    PhysReg reg_;
 578    union {
 579       struct {
 580          uint8_t isTemp_:1;
 581          uint8_t isFixed_:1;
 582          uint8_t isConstant_:1;
 583          uint8_t isKill_:1;
 584          uint8_t isUndef_:1;
 585          uint8_t isFirstKill_:1;
 586          uint8_t is64BitConst_:1;
 587          uint8_t isLateKill_:1;
 588       };
 589       /* can't initialize bit-fields in c++11, so work around using a union */
 590       uint8_t control_ = 0;
 591    };
 592 };
 593
 594 /**
 595  * Definition Class
 596  * Definitions are the results of Instructions
 597  * and refer to temporary virtual registers
 598  * which are later mapped to physical registers
 599  */
 600 class Definition final
 601 {
 602 public:
 603    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
 604    Definition(uint32_t index, RegClass type) noexcept
 605       : temp(index, type) {}
 606    explicit Definition(Temp tmp) noexcept
 607       : temp(tmp) {}
 608    Definition(PhysReg reg, RegClass type) noexcept
 609       : temp(Temp(0, type))
 610    {
 611       setFixed(reg);
 612    }
 613    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 614       : temp(Temp(tmpId, type))
 615    {
 616       setFixed(reg);
 617    }
 618
 619    constexpr bool isTemp() const noexcept
 620    {
 621       return tempId() > 0;
 622    }
 623
 624    constexpr Temp getTemp() const noexcept
 625    {
 626       return temp;
 627    }
 628
 629    constexpr uint32_t tempId() const noexcept
 630    {
 631       return temp.id();
 632    }
 633
 634    constexpr void setTemp(Temp t) noexcept {
 635       temp = t;
 636    }
 637
 638    constexpr RegClass regClass() const noexcept
 639    {
 640       return temp.regClass();
 641    }
 642
 643    constexpr unsigned size() const noexcept
 644    {
 645       return temp.size();
 646    }
 647
 648    constexpr bool isFixed() const noexcept
 649    {
 650       return isFixed_;
 651    }
 652
 653    constexpr PhysReg physReg() const noexcept
 654    {
 655       return reg_;
 656    }
 657
 658    constexpr void setFixed(PhysReg reg) noexcept
 659    {
 660       isFixed_ = 1;
 661       reg_ = reg;
 662    }
 663
 664    constexpr void setHint(PhysReg reg) noexcept
 665    {
 666       hasHint_ = 1;
 667       reg_ = reg;
 668    }
 669
 670    constexpr bool hasHint() const noexcept
 671    {
 672       return hasHint_;
 673    }
 674
 675    constexpr void setKill(bool flag) noexcept
 676    {
 677       isKill_ = flag;
 678    }
 679
 680    constexpr bool isKill() const noexcept
 681    {
 682       return isKill_;
 683    }
 684
 685 private:
 686    Temp temp = Temp(0, s1);
 687    PhysReg reg_;
 688    union {
 689       struct {
 690          uint8_t isFixed_:1;
 691          uint8_t hasHint_:1;
 692          uint8_t isKill_:1;
 693       };
 694       /* can't initialize bit-fields in c++11, so work around using a union */
 695       uint8_t control_ = 0;
 696    };
 697 };
 698
 699 class Block;
 700
 701 struct Instruction {
 702    aco_opcode opcode;
 703    Format format;
 704    uint32_t pass_flags;
 705
 706    aco::span<Operand> operands;
 707    aco::span<Definition> definitions;
 708
 709    constexpr bool isVALU() const noexcept
 710    {
 711       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 712           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 713           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 714           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 715           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 716           || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
 717    }
 718
 719    constexpr bool isSALU() const noexcept
 720    {
 721       return format == Format::SOP1 ||
 722              format == Format::SOP2 ||
 723              format == Format::SOPC ||
 724              format == Format::SOPK ||
 725              format == Format::SOPP;
 726    }
 727
 728    constexpr bool isVMEM() const noexcept
 729    {
 730       return format == Format::MTBUF ||
 731              format == Format::MUBUF ||
 732              format == Format::MIMG;
 733    }
 734
 735    constexpr bool isDPP() const noexcept
 736    {
 737       return (uint16_t) format & (uint16_t) Format::DPP;
 738    }
 739
 740    constexpr bool isVOP3() const noexcept
 741    {
 742       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 743              ((uint16_t) format & (uint16_t) Format::VOP3B) ||
 744              format == Format::VOP3P;
 745    }
 746
 747    constexpr bool isSDWA() const noexcept
 748    {
 749       return (uint16_t) format & (uint16_t) Format::SDWA;
 750    }
 751
 752    constexpr bool isFlatOrGlobal() const noexcept
 753    {
 754       return format == Format::FLAT || format == Format::GLOBAL;
 755    }
 756
 757    constexpr bool usesModifiers() const noexcept;
 758
 759    constexpr bool reads_exec() const noexcept
 760    {
 761       for (const Operand& op : operands) {
 762          if (op.isFixed() && op.physReg() == exec)
 763             return true;
 764       }
 765       return false;
 766    }
 767 };
 768
 769 struct SOPK_instruction : public Instruction {
 770    uint16_t imm;
 771 };
 772
 773 struct SOPP_instruction : public Instruction {
 774    uint32_t imm;
 775    int block;
 776 };
 777
 778 struct SOPC_instruction : public Instruction {
 779 };
 780
 781 struct SOP1_instruction : public Instruction {
 782 };
 783
 784 struct SOP2_instruction : public Instruction {
 785 };
 786
 787 /**
 788  * Scalar Memory Format:
 789  * For s_(buffer_)load_dword*:
 790  * Operand(0): SBASE - SGPR-pair which provides base address
 791  * Operand(1): Offset - immediate (un)signed offset or SGPR
 792  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 793  * Operand(n-1): SOffset - SGPR offset (Vega only)
 794  *
 795  * Having no operands is also valid for instructions such as s_dcache_inv.
 796  *
 797  */
 798 struct SMEM_instruction : public Instruction {
 799    bool glc : 1; /* VI+: globally coherent */
 800    bool dlc : 1; /* NAVI: device level coherent */
 801    bool nv : 1; /* VEGA only: Non-volatile */
 802    bool can_reorder : 1;
 803    bool disable_wqm : 1;
 804    barrier_interaction barrier;
 805 };
 806
 807 struct VOP1_instruction : public Instruction {
 808 };
 809
 810 struct VOP2_instruction : public Instruction {
 811 };
 812
 813 struct VOPC_instruction : public Instruction {
 814 };
 815
 816 struct VOP3A_instruction : public Instruction {
 817    bool abs[3];
 818    bool neg[3];
 819    uint8_t opsel : 4;
 820    uint8_t omod : 2;
 821    bool clamp : 1;
 822 };
 823
 824 /**
 825  * Data Parallel Primitives Format:
 826  * This format can be used for VOP1, VOP2 or VOPC instructions.
 827  * The swizzle applies to the src0 operand.
 828  *
 829  */
 830 struct DPP_instruction : public Instruction {
 831    bool abs[2];
 832    bool neg[2];
 833    uint16_t dpp_ctrl;
 834    uint8_t row_mask : 4;
 835    uint8_t bank_mask : 4;
 836    bool bound_ctrl : 1;
 837 };
 838
 839 struct Interp_instruction : public Instruction {
 840    uint8_t attribute;
 841    uint8_t component;
 842 };
 843
 844 /**
 845  * Local and Global Data Sharing instructions
 846  * Operand(0): ADDR - VGPR which supplies the address.
 847  * Operand(1): DATA0 - First data VGPR.
 848  * Operand(2): DATA1 - Second data VGPR.
 849  * Operand(n-1): M0 - LDS size.
 850  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 851  *
 852  */
 853 struct DS_instruction : public Instruction {
 854    int16_t offset0;
 855    int8_t offset1;
 856    bool gds;
 857 };
 858
 859 /**
 860  * Vector Memory Untyped-buffer Instructions
 861  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 862  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 863  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 864  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 865  *
 866  */
 867 struct MUBUF_instruction : public Instruction {
 868    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
 869    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 870    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 871    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
 872    bool glc : 1; /* globally coherent */
 873    bool dlc : 1; /* NAVI: device level coherent */
 874    bool slc : 1; /* system level coherent */
 875    bool tfe : 1; /* texture fail enable */
 876    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
 877    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 878    bool can_reorder : 1;
 879    barrier_interaction barrier;
 880 };
 881
 882 /**
 883  * Vector Memory Typed-buffer Instructions
 884  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 885  * Operand(1): VADDR - Address source. Can carry an index and/or offset
 886  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 887  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 888  *
 889  */
 890 struct MTBUF_instruction : public Instruction {
 891    uint16_t offset; /* Unsigned byte offset - 12 bit */
 892    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
 893    uint8_t nfmt : 3; /* Numeric format of data in memory */
 894    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
 895    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
 896    bool glc : 1; /* globally coherent */
 897    bool dlc : 1; /* NAVI: device level coherent */
 898    bool slc : 1; /* system level coherent */
 899    bool tfe : 1; /* texture fail enable */
 900    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 901    bool can_reorder : 1;
 902    barrier_interaction barrier;
 903 };
 904
 905 /**
 906  * Vector Memory Image Instructions
 907  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 908  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 909  *             or VDATA - Vector GPR for write data.
 910  * Operand(2): VADDR - Address source. Can carry an offset or an index.
 911  * Definition(0): VDATA - Vector GPR for read result.
 912  *
 913  */
 914 struct MIMG_instruction : public Instruction {
 915    uint8_t dmask; /* Data VGPR enable mask */
 916    uint8_t dim : 3; /* NAVI: dimensionality */
 917    bool unrm : 1; /* Force address to be un-normalized */
 918    bool dlc : 1; /* NAVI: device level coherent */
 919    bool glc : 1; /* globally coherent */
 920    bool slc : 1; /* system level coherent */
 921    bool tfe : 1; /* texture fail enable */
 922    bool da : 1; /* declare an array */
 923    bool lwe : 1; /* Force data to be un-normalized */
 924    bool r128 : 1; /* NAVI: Texture resource size */
 925    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
 926    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
 927    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 928    bool can_reorder : 1;
 929    barrier_interaction barrier;
 930 };
 931
 932 /**
 933  * Flat/Scratch/Global Instructions
 934  * Operand(0): ADDR
 935  * Operand(1): SADDR
 936  * Operand(2) / Definition(0): DATA/VDST
 937  *
 938  */
 939 struct FLAT_instruction : public Instruction {
 940    uint16_t offset; /* Vega/Navi only */
 941    bool slc : 1; /* system level coherent */
 942    bool glc : 1; /* globally coherent */
 943    bool dlc : 1; /* NAVI: device level coherent */
 944    bool lds : 1;
 945    bool nv : 1;
 946    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
 947    bool can_reorder : 1;
 948    barrier_interaction barrier;
 949 };
 950
 951 struct Export_instruction : public Instruction {
 952    uint8_t enabled_mask;
 953    uint8_t dest;
 954    bool compressed : 1;
 955    bool done : 1;
 956    bool valid_mask : 1;
 957 };
 958
 959 struct Pseudo_instruction : public Instruction {
 960    bool tmp_in_scc;
 961    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
 962 };
 963
 964 struct Pseudo_branch_instruction : public Instruction {
 965    /* target[0] is the block index of the branch target.
 966     * For conditional branches, target[1] contains the fall-through alternative.
 967     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
 968     */
 969    uint32_t target[2];
 970 };
 971
 972 struct Pseudo_barrier_instruction : public Instruction {
 973 };
 974
 975 enum ReduceOp {
 976    iadd32, iadd64,
 977    imul32, imul64,
 978    fadd32, fadd64,
 979    fmul32, fmul64,
 980    imin32, imin64,
 981    imax32, imax64,
 982    umin32, umin64,
 983    umax32, umax64,
 984    fmin32, fmin64,
 985    fmax32, fmax64,
 986    iand32, iand64,
 987    ior32, ior64,
 988    ixor32, ixor64,
 989    gfx10_wave64_bpermute
 990 };
 991
 992 /**
 993  * Subgroup Reduction Instructions, everything except for the data to be
 994  * reduced and the result as inserted by setup_reduce_temp().
 995  * Operand(0): data to be reduced
 996  * Operand(1): reduce temporary
 997  * Operand(2): vector temporary
 998  * Definition(0): result
 999  * Definition(1): scalar temporary
1000  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1001  * Definition(3): scc clobber
1002  * Definition(4): vcc clobber
1003  *
1004  */
1005 struct Pseudo_reduction_instruction : public Instruction {
1006    ReduceOp reduce_op;
1007    unsigned cluster_size; // must be 0 for scans
1008 };
1009
1010 struct instr_deleter_functor {
1011    void operator()(void* p) {
1012       free(p);
1013    }
1014 };
1015
1016 template<typename T>
1017 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1018
1019 template<typename T>
1020 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1021 {
1022    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1023    char *data = (char*) calloc(1, size);
1024    T* inst = (T*) data;
1025
1026    inst->opcode = opcode;
1027    inst->format = format;
1028
1029    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1030    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1031    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1032    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1033
1034    return inst;
1035 }
1036
1037 constexpr bool Instruction::usesModifiers() const noexcept
1038 {
1039    if (isDPP() || isSDWA())
1040       return true;
1041    if (!isVOP3())
1042       return false;
1043    const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1044    for (unsigned i = 0; i < operands.size(); i++) {
1045       if (vop3->abs[i] || vop3->neg[i])
1046          return true;
1047    }
1048    return vop3->opsel || vop3->clamp || vop3->omod;
1049 }
1050
1051 constexpr bool is_phi(Instruction* instr)
1052 {
1053    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1054 }
1055
1056 static inline bool is_phi(aco_ptr<Instruction>& instr)
1057 {
1058    return is_phi(instr.get());
1059 }
1060
1061 barrier_interaction get_barrier_interaction(Instruction* instr);
1062
1063 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1064
1065 enum block_kind {
1066    /* uniform indicates that leaving this block,
1067     * all actives lanes stay active */
1068    block_kind_uniform = 1 << 0,
1069    block_kind_top_level = 1 << 1,
1070    block_kind_loop_preheader = 1 << 2,
1071    block_kind_loop_header = 1 << 3,
1072    block_kind_loop_exit = 1 << 4,
1073    block_kind_continue = 1 << 5,
1074    block_kind_break = 1 << 6,
1075    block_kind_continue_or_break = 1 << 7,
1076    block_kind_discard = 1 << 8,
1077    block_kind_branch = 1 << 9,
1078    block_kind_merge = 1 << 10,
1079    block_kind_invert = 1 << 11,
1080    block_kind_uses_discard_if = 1 << 12,
1081    block_kind_needs_lowering = 1 << 13,
1082    block_kind_uses_demote = 1 << 14,
1083    block_kind_export_end = 1 << 15,
1084 };
1085
1086
1087 struct RegisterDemand {
1088    constexpr RegisterDemand() = default;
1089    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1090       : vgpr{v}, sgpr{s} {}
1091    int16_t vgpr = 0;
1092    int16_t sgpr = 0;
1093
1094    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1095       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1096    }
1097
1098    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1099       return vgpr > other.vgpr || sgpr > other.sgpr;
1100    }
1101
1102    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1103       if (t.type() == RegType::sgpr)
1104          return RegisterDemand( vgpr, sgpr + t.size() );
1105       else
1106          return RegisterDemand( vgpr + t.size(), sgpr );
1107    }
1108
1109    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1110       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1111    }
1112
1113    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1114       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1115    }
1116
1117    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1118       vgpr += other.vgpr;
1119       sgpr += other.sgpr;
1120       return *this;
1121    }
1122
1123    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1124       vgpr -= other.vgpr;
1125       sgpr -= other.sgpr;
1126       return *this;
1127    }
1128
1129    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1130       if (t.type() == RegType::sgpr)
1131          sgpr += t.size();
1132       else
1133          vgpr += t.size();
1134       return *this;
1135    }
1136
1137    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1138       if (t.type() == RegType::sgpr)
1139          sgpr -= t.size();
1140       else
1141          vgpr -= t.size();
1142       return *this;
1143    }
1144
1145    constexpr void update(const RegisterDemand other) noexcept {
1146       vgpr = std::max(vgpr, other.vgpr);
1147       sgpr = std::max(sgpr, other.sgpr);
1148    }
1149
1150 };
1151
1152 /* CFG */
1153 struct Block {
1154    float_mode fp_mode;
1155    unsigned index;
1156    unsigned offset = 0;
1157    std::vector<aco_ptr<Instruction>> instructions;
1158    std::vector<unsigned> logical_preds;
1159    std::vector<unsigned> linear_preds;
1160    std::vector<unsigned> logical_succs;
1161    std::vector<unsigned> linear_succs;
1162    RegisterDemand register_demand = RegisterDemand();
1163    uint16_t loop_nest_depth = 0;
1164    uint16_t kind = 0;
1165    int logical_idom = -1;
1166    int linear_idom = -1;
1167    Temp live_out_exec = Temp();
1168
1169    /* this information is needed for predecessors to blocks with phis when
1170     * moving out of ssa */
1171    bool scc_live_out = false;
1172    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1173
1174    Block(unsigned idx) : index(idx) {}
1175    Block() : index(0) {}
1176 };
1177
1178 using Stage = uint16_t;
1179
1180 /* software stages */
1181 static constexpr Stage sw_vs = 1 << 0;
1182 static constexpr Stage sw_gs = 1 << 1;
1183 static constexpr Stage sw_tcs = 1 << 2;
1184 static constexpr Stage sw_tes = 1 << 3;
1185 static constexpr Stage sw_fs = 1 << 4;
1186 static constexpr Stage sw_cs = 1 << 5;
1187 static constexpr Stage sw_gs_copy = 1 << 6;
1188 static constexpr Stage sw_mask = 0x7f;
1189
1190 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1191 static constexpr Stage hw_vs = 1 << 7;
1192 static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1193 static constexpr Stage hw_gs = 1 << 9;
1194 static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1195 static constexpr Stage hw_hs = 1 << 11;
1196 static constexpr Stage hw_fs = 1 << 12;
1197 static constexpr Stage hw_cs = 1 << 13;
1198 static constexpr Stage hw_mask = 0x7f << 7;
1199
1200 /* possible settings of Program::stage */
1201 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1202 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1203 static constexpr Stage compute_cs = sw_cs | hw_cs;
1204 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1205 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1206 /* GFX10/NGG */
1207 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1208 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1209 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1210 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1211 /* GFX9 (and GFX10 if NGG isn't used) */
1212 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1213 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1214 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1215 /* pre-GFX9 */
1216 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1217 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1218 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1219 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1220 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1221
1222 enum statistic {
1223    statistic_hash,
1224    statistic_instructions,
1225    statistic_copies,
1226    statistic_branches,
1227    statistic_cycles,
1228    statistic_vmem_clauses,
1229    statistic_smem_clauses,
1230    statistic_vmem_score,
1231    statistic_smem_score,
1232    statistic_sgpr_presched,
1233    statistic_vgpr_presched,
1234    num_statistics
1235 };
1236
1237 class Program final {
1238 public:
1239    float_mode next_fp_mode;
1240    std::vector<Block> blocks;
1241    RegisterDemand max_reg_demand = RegisterDemand();
1242    uint16_t num_waves = 0;
1243    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1244    ac_shader_config* config;
1245    struct radv_shader_info *info;
1246    enum chip_class chip_class;
1247    enum radeon_family family;
1248    unsigned wave_size;
1249    RegClass lane_mask;
1250    Stage stage; /* Stage */
1251    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1252    bool needs_wqm = false; /* there exists a p_wqm instruction */
1253    bool wb_smem_l1_on_end = false;
1254
1255    std::vector<uint8_t> constant_data;
1256    Temp private_segment_buffer;
1257    Temp scratch_offset;
1258
1259    uint16_t min_waves = 0;
1260    uint16_t lds_alloc_granule;
1261    uint32_t lds_limit; /* in bytes */
1262    bool has_16bank_lds;
1263    uint16_t vgpr_limit;
1264    uint16_t sgpr_limit;
1265    uint16_t physical_sgprs;
1266    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1267    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1268    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1269
1270    bool xnack_enabled = false;
1271
1272    bool needs_vcc = false;
1273    bool needs_flat_scr = false;
1274
1275    bool collect_statistics = false;
1276    uint32_t statistics[num_statistics];
1277
1278    uint32_t allocateId()
1279    {
1280       assert(allocationID <= 16777215);
1281       return allocationID++;
1282    }
1283
1284    uint32_t peekAllocationId()
1285    {
1286       return allocationID;
1287    }
1288
1289    void setAllocationId(uint32_t id)
1290    {
1291       allocationID = id;
1292    }
1293
1294    Block* create_and_insert_block() {
1295       blocks.emplace_back(blocks.size());
1296       blocks.back().fp_mode = next_fp_mode;
1297       return &blocks.back();
1298    }
1299
1300    Block* insert_block(Block&& block) {
1301       block.index = blocks.size();
1302       block.fp_mode = next_fp_mode;
1303       blocks.emplace_back(std::move(block));
1304       return &blocks.back();
1305    }
1306
1307 private:
1308    uint32_t allocationID = 1;
1309 };
1310
1311 struct live {
1312    /* live temps out per block */
1313    std::vector<std::set<Temp>> live_out;
1314    /* register demand (sgpr/vgpr) per instruction per block */
1315    std::vector<std::vector<RegisterDemand>> register_demand;
1316 };
1317
1318 void select_program(Program *program,
1319                     unsigned shader_count,
1320                     struct nir_shader *const *shaders,
1321                     ac_shader_config* config,
1322                     struct radv_shader_args *args);
1323 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1324                            ac_shader_config* config,
1325                            struct radv_shader_args *args);
1326
1327 void lower_wqm(Program* program, live& live_vars,
1328                const struct radv_nir_compiler_options *options);
1329 void lower_bool_phis(Program* program);
1330 void calc_min_waves(Program* program);
1331 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1332 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1333 std::vector<uint16_t> dead_code_analysis(Program *program);
1334 void dominator_tree(Program* program);
1335 void insert_exec_mask(Program *program);
1336 void value_numbering(Program* program);
1337 void optimize(Program* program);
1338 void setup_reduce_temp(Program* program);
1339 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1340 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1341 void ssa_elimination(Program* program);
1342 void lower_to_hw_instr(Program* program);
1343 void schedule_program(Program* program, live& live_vars);
1344 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1345 void insert_wait_states(Program* program);
1346 void insert_NOPs(Program* program);
1347 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1348 void print_asm(Program *program, std::vector<uint32_t>& binary,
1349                unsigned exec_size, std::ostream& out);
1350 void validate(Program* program, FILE *output);
1351 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1352 #ifndef NDEBUG
1353 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1354 #else
1355 #define perfwarn(program, cond, msg, ...) do {} while(0)
1356 #endif
1357
1358 void collect_presched_stats(Program *program);
1359 void collect_preasm_stats(Program *program);
1360 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1361
1362 void aco_print_instr(Instruction *instr, FILE *output);
1363 void aco_print_program(Program *program, FILE *output);
1364
1365 /* utilities for dealing with register demand */
1366 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1367 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1368 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1369
1370 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1371 uint16_t get_extra_sgprs(Program *program);
1372
1373 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1374 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1375 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1376
1377 /* return number of addressable sgprs/vgprs for max_waves */
1378 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1379 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1380
1381 typedef struct {
1382    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1383    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1384    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1385    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1386    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1387    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1388    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1389    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1390 } Info;
1391
1392 extern const Info instr_info;
1393
1394 }
1395
1396 #endif /* ACO_IR_H */
1397