aco: make PhysReg in units of bytes
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
/* Global ACO debug options; a bitmask of the DEBUG_* flags below. */
extern uint64_t debug_flags;

enum {
   /* run the IR validator (names suggest: after passes / after RA) */
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   /* emit performance warnings */
   DEBUG_PERFWARN = 0x4,
};
52
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats
    * (small sequential values: these formats cannot be combined) */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats
    * (one bit each so they can be OR'd with DPP/SDWA/VOP3, see above) */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11, /* alias of VOP3 */
   VOP3B = 1 << 11, /* alias of VOP3 */
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Bitmask describing the memory classes an instruction interacts with;
 * used to restrict reordering of memory instructions. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* Number of flag bits above. barrier_barrier is bit 6, so there are 7
    * bits; the previous value of 6 did not account for barrier_barrier and
    * would under-size any per-barrier-bit bookkeeping (e.g. arrays indexed
    * by barrier bit). */
   barrier_count = 7,
};
120
/* Rounding modes, encoded as in the hardware MODE register. */
enum fp_round {
   fp_round_ne = 0, /* nearest even */
   fp_round_pi = 1, /* towards +inf */
   fp_round_ni = 2, /* towards -inf */
   fp_round_tz = 3, /* towards zero */
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

/* Floating-point behavior required by a shader/block: the hardware MODE
 * register bits plus flags restricting which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly, and every
    * restriction flag set in "other" must also be set here. */
   bool canReplace(float_mode other) const noexcept {
      if (val != other.val)
         return false;
      if (other.preserve_signed_zero_inf_nan32 && !preserve_signed_zero_inf_nan32)
         return false;
      if (other.preserve_signed_zero_inf_nan16_64 && !preserve_signed_zero_inf_nan16_64)
         return false;
      if (other.must_flush_denorms32 && !must_flush_denorms32)
         return false;
      if (other.must_flush_denorms16_64 && !must_flush_denorms16_64)
         return false;
      if (other.care_about_round32 && !care_about_round32)
         return false;
      if (other.care_about_round16_64 && !care_about_round16_64)
         return false;
      return true;
   }
};
167
168 constexpr Format asVOP3(Format format) {
169 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
170 };
171
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

/* Register class of a value: its type (sgpr/vgpr) and size in dwords,
 * packed into one byte. Bits 0-4 hold the size, bit 5 marks vgpr and
 * bit 6 marks a linear (WWM/spill) vgpr class. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_)
      : rc(rc_) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) (size | (type == RegType::vgpr ? 1 << 5 : 0))) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgpr classes are exactly those with the vgpr bit clear, i.e. <= s16 */
   constexpr RegType type() const { return rc > RC::s16 ? RegType::vgpr : RegType::sgpr; }
   constexpr unsigned size() const { return unsigned(rc) & 0x1F; }
   /* sgprs are always "linear"; vgprs only when the linear bit is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || (rc & (1 << 6)); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
219
220 /* transitional helper expressions */
221 static constexpr RegClass s1{RegClass::s1};
222 static constexpr RegClass s2{RegClass::s2};
223 static constexpr RegClass s3{RegClass::s3};
224 static constexpr RegClass s4{RegClass::s4};
225 static constexpr RegClass s8{RegClass::s8};
226 static constexpr RegClass s16{RegClass::s16};
227 static constexpr RegClass v1{RegClass::v1};
228 static constexpr RegClass v2{RegClass::v2};
229 static constexpr RegClass v3{RegClass::v3};
230 static constexpr RegClass v4{RegClass::v4};
231 static constexpr RegClass v5{RegClass::v5};
232 static constexpr RegClass v6{RegClass::v6};
233 static constexpr RegClass v7{RegClass::v7};
234 static constexpr RegClass v8{RegClass::v8};
235
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   /* SSA id; id 0 is used for "no temporary" (see Operand/Definition) */
   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   /* size in dwords (forwarded from the register class) */
   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* ordering/equality compare the SSA id only, not the register class */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24; /* 24-bit id packs Temp into 4 bytes with reg_class */
   RegClass reg_class;
};
262
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored in units of bytes (reg_b): the dword register index is reg_b / 4
 * and the sub-dword byte offset within that register is reg_b % 4.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r * 4) {}
   constexpr unsigned reg() const { return reg_b / 4; }
   constexpr unsigned byte() const { return reg_b % 4; }
   /* implicit conversion yields the dword register index, not the byte address */
   constexpr operator unsigned() const { return reg(); }

   uint16_t reg_b = 0;
};
277
/* helper expressions for special registers
 * (dword indices in the scalar operand encoding space) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126}; /* alias of exec (low half) */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
289
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 *
 * Constants are "fixed" to the scalar-operand encoding slot that represents
 * them: 128..192 encode the integers 0..64, 193..208 encode -1..-16,
 * 240..247 encode +-0.5/1.0/2.0/4.0 and 255 marks a literal constant.
 */
class Operand final
{
public:
   /* default: an undefined operand, fixed to slot 128 (the inline-0 slot,
    * used here as a placeholder) */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   /* wrap a temporary; an id of 0 creates an undefined operand instead */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant; fixed to its inline-constant slot when one exists,
    * otherwise to the literal slot (255) */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v}); /* unsigned wrap-around yields 193..208 */
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant; only values with a hardware inline encoding are
    * representable (64-bit literals are not supported). data_.i keeps the
    * 32-bit pattern that re-expands to the value in constantValue64(). */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* undefined operand of the given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* operand fixed to a specific physical register (no SSA temporary) */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* constants have no register class; temps and undefs do */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* an all-ones register index means "not fixed" */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant without an inline encoding (slot 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Reconstructs the 64-bit value of a 64-bit inline constant from its
    * operand slot; for 32-bit constants, optionally sign-extends data_.i. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128; /* slots 128..192 encode 0..64 */
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); /* slots 193..208 encode -1..-16 */

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000; /* 0.5 */
         case 241:
            return 0xBFE0000000000000; /* -0.5 */
         case 242:
            return 0x3FF0000000000000; /* 1.0 */
         case 243:
            return 0xBFF0000000000000; /* -1.0 */
         case 244:
            return 0x4000000000000000; /* 2.0 */
         case 245:
            return 0xC000000000000000; /* -2.0 */
         case 246:
            return 0x4010000000000000; /* 4.0 */
         case 247:
            return 0xC010000000000000; /* -4.0 */
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   /* structural equality: same kind of operand with the same value/temp,
    * same fixed register (if any) and same kill-before-def behavior */
   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
595
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   /* definition of a new temporary with the given SSA id */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition fixed to a physical register, with no SSA id (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* definition of temporary tmpId, already fixed to a physical register */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 means "no temporary" (fixed-register-only definition) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* a non-binding register preference; stored in the same reg_ field,
    * so a later setFixed() overwrites it */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
700
701 class Block;
702
703 struct Instruction {
704 aco_opcode opcode;
705 Format format;
706 uint32_t pass_flags;
707
708 aco::span<Operand> operands;
709 aco::span<Definition> definitions;
710
711 constexpr bool isVALU() const noexcept
712 {
713 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
714 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
715 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
716 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
717 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
718 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
719 }
720
721 constexpr bool isSALU() const noexcept
722 {
723 return format == Format::SOP1 ||
724 format == Format::SOP2 ||
725 format == Format::SOPC ||
726 format == Format::SOPK ||
727 format == Format::SOPP;
728 }
729
730 constexpr bool isVMEM() const noexcept
731 {
732 return format == Format::MTBUF ||
733 format == Format::MUBUF ||
734 format == Format::MIMG;
735 }
736
737 constexpr bool isDPP() const noexcept
738 {
739 return (uint16_t) format & (uint16_t) Format::DPP;
740 }
741
742 constexpr bool isVOP3() const noexcept
743 {
744 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
745 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
746 format == Format::VOP3P;
747 }
748
749 constexpr bool isSDWA() const noexcept
750 {
751 return (uint16_t) format & (uint16_t) Format::SDWA;
752 }
753
754 constexpr bool isFlatOrGlobal() const noexcept
755 {
756 return format == Format::FLAT || format == Format::GLOBAL;
757 }
758
759 constexpr bool usesModifiers() const noexcept;
760
761 constexpr bool reads_exec() const noexcept
762 {
763 for (const Operand& op : operands) {
764 if (op.isFixed() && op.physReg() == exec)
765 return true;
766 }
767 return false;
768 }
769 };
770
/* SOPK: scalar op with a 16-bit inline immediate */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
};

/* SOPP: scalar program control (branches, waitcnt, sendmsg, ...) */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   /* branch target block index; NOTE(review): meaning for non-branch SOPP
    * not visible here — confirm at the use sites */
   int block;
};

struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
788
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1; /* presumably: may be reordered against other memory ops — confirm in scheduler */
   bool disable_wqm : 1; /* require an exec mask without helper invocations */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
808
/* plain VALU encodings carry no extra fields */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
817
/* VOP3A encoding: input modifiers per source, output modifiers on the result */
struct VOP3A_instruction : public Instruction {
   bool abs[3]; /* per-source absolute-value modifier */
   bool neg[3]; /* per-source negate modifier */
   uint8_t opsel : 4; /* operand half-selection bits */
   uint8_t omod : 2; /* output modifier */
   bool clamp : 1;
};
825
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2]; /* per-source absolute-value modifier */
   bool neg[2]; /* per-source negate modifier */
   uint16_t dpp_ctrl; /* swizzle/shift control */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
};
840
/* VINTRP: parameter interpolation; selects the attribute and its component */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
};
845
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds; /* true: GDS access, false: LDS */
};
860
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: may be reordered against other memory ops — confirm in scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
883
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: may be reordered against other memory ops — confirm in scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
906
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (was mis-commented as "un-normalized") */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: may be reordered against other memory ops — confirm in scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
933
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably: may be reordered against other memory ops — confirm in scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
952
/* EXP: export to a render target, position or parameter */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components are written */
   uint8_t dest; /* export target */
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
};
960
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
965
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
973
/* pseudo barrier: carries no extra fields beyond the opcode */
struct Pseudo_barrier_instruction : public Instruction {
};
976
/* Reduction operations; each comes in a 32-bit and a 64-bit variant. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute
};
993
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
1011
/* Deleter for instructions: they are allocated with calloc (see
 * create_instruction()), so they must be released with free(), not delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) {
      free(ptr);
   }
};

/* owning pointer to a (calloc'd) instruction */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1020
/* Allocates an instruction of derived type T together with its operand and
 * definition arrays in a single zeroed block: [T][Operand...][Definition...].
 * The whole block is released at once via instr_deleter_functor (free()). */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span apparently stores a 16-bit offset relative to its own address
    * rather than a pointer (see aco_util.h); these offsets point the spans
    * at the arrays placed directly behind T */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1038
/* Returns true if the instruction uses any input/output modifiers
 * (DPP/SDWA encodings, or VOP3 abs/neg/opsel/clamp/omod). */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   /* NOTE(review): indexes abs[3]/neg[3] with the operand count — assumes
    * VOP3 instructions never have more than 3 operands; confirm */
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel || vop3->clamp || vop3->omod;
}
1052
1053 constexpr bool is_phi(Instruction* instr)
1054 {
1055 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1056 }
1057
1058 static inline bool is_phi(aco_ptr<Instruction>& instr)
1059 {
1060 return is_phi(instr.get());
1061 }
1062
/* Returns the barrier bits (see barrier_interaction) the given instruction
 * interacts with; defined elsewhere. */
barrier_interaction get_barrier_interaction(Instruction* instr);

/* Presumably: true if the instruction can be removed given the use counts
 * in "uses" — NOTE(review): confirm at the definition site. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1066
/* Bitmask of properties attached to a CFG block (Block::kind). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1087
1088
1089 struct RegisterDemand {
1090 constexpr RegisterDemand() = default;
1091 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1092 : vgpr{v}, sgpr{s} {}
1093 int16_t vgpr = 0;
1094 int16_t sgpr = 0;
1095
1096 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1097 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1098 }
1099
1100 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1101 return vgpr > other.vgpr || sgpr > other.sgpr;
1102 }
1103
1104 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1105 if (t.type() == RegType::sgpr)
1106 return RegisterDemand( vgpr, sgpr + t.size() );
1107 else
1108 return RegisterDemand( vgpr + t.size(), sgpr );
1109 }
1110
1111 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1112 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1113 }
1114
1115 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1116 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1117 }
1118
1119 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1120 vgpr += other.vgpr;
1121 sgpr += other.sgpr;
1122 return *this;
1123 }
1124
1125 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1126 vgpr -= other.vgpr;
1127 sgpr -= other.sgpr;
1128 return *this;
1129 }
1130
1131 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1132 if (t.type() == RegType::sgpr)
1133 sgpr += t.size();
1134 else
1135 vgpr += t.size();
1136 return *this;
1137 }
1138
1139 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1140 if (t.type() == RegType::sgpr)
1141 sgpr -= t.size();
1142 else
1143 vgpr -= t.size();
1144 return *this;
1145 }
1146
1147 constexpr void update(const RegisterDemand other) noexcept {
1148 vgpr = std::max(vgpr, other.vgpr);
1149 sgpr = std::max(sgpr, other.sgpr);
1150 }
1151
1152 };
1153
1154 /* CFG */
/* CFG: one basic block of the program */
struct Block {
   float_mode fp_mode;                   /* float_mode for this block's instructions */
   unsigned index;                       /* position of this block in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;  /* predecessor block indices in the logical CFG */
   std::vector<unsigned> linear_preds;   /* predecessor block indices in the linear CFG */
   std::vector<unsigned> logical_succs;  /* successor block indices in the logical CFG */
   std::vector<unsigned> linear_succs;   /* successor block indices in the linear CFG */
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;         /* number of loops this block is nested inside */
   uint16_t kind = 0;                    /* bitmask of block_kind flags */
   int logical_idom = -1;                /* immediate dominator in the logical CFG, -1 if unset */
   int linear_idom = -1;                 /* immediate dominator in the linear CFG, -1 if unset */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1179
/* A Stage is a bitmask combining one or more software (API) stages with
 * exactly one hardware stage; see the Program::stage settings below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f; /* covers the 7 software-stage bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9;
static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 11;
static constexpr Stage hw_fs = 1 << 12;
static constexpr Stage hw_cs = 1 << 13;
static constexpr Stage hw_mask = 0x7f << 7; /* covers the 7 hardware-stage bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1223
/* Indices into Program::statistics; only gathered when
 * Program::collect_statistics is set. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,  /* SGPR demand before scheduling */
   statistic_vgpr_presched,  /* VGPR demand before scheduling */
   num_statistics
};
1238
1239 class Program final {
1240 public:
1241 float_mode next_fp_mode;
1242 std::vector<Block> blocks;
1243 RegisterDemand max_reg_demand = RegisterDemand();
1244 uint16_t num_waves = 0;
1245 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1246 ac_shader_config* config;
1247 struct radv_shader_info *info;
1248 enum chip_class chip_class;
1249 enum radeon_family family;
1250 unsigned wave_size;
1251 RegClass lane_mask;
1252 Stage stage; /* Stage */
1253 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1254 bool needs_wqm = false; /* there exists a p_wqm instruction */
1255 bool wb_smem_l1_on_end = false;
1256
1257 std::vector<uint8_t> constant_data;
1258 Temp private_segment_buffer;
1259 Temp scratch_offset;
1260
1261 uint16_t min_waves = 0;
1262 uint16_t lds_alloc_granule;
1263 uint32_t lds_limit; /* in bytes */
1264 bool has_16bank_lds;
1265 uint16_t vgpr_limit;
1266 uint16_t sgpr_limit;
1267 uint16_t physical_sgprs;
1268 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1269 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1270 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1271
1272 bool xnack_enabled = false;
1273
1274 bool needs_vcc = false;
1275 bool needs_flat_scr = false;
1276
1277 bool collect_statistics = false;
1278 uint32_t statistics[num_statistics];
1279
1280 uint32_t allocateId()
1281 {
1282 assert(allocationID <= 16777215);
1283 return allocationID++;
1284 }
1285
1286 uint32_t peekAllocationId()
1287 {
1288 return allocationID;
1289 }
1290
1291 void setAllocationId(uint32_t id)
1292 {
1293 allocationID = id;
1294 }
1295
1296 Block* create_and_insert_block() {
1297 blocks.emplace_back(blocks.size());
1298 blocks.back().fp_mode = next_fp_mode;
1299 return &blocks.back();
1300 }
1301
1302 Block* insert_block(Block&& block) {
1303 block.index = blocks.size();
1304 block.fp_mode = next_fp_mode;
1305 blocks.emplace_back(std::move(block));
1306 return &blocks.back();
1307 }
1308
1309 private:
1310 uint32_t allocationID = 1;
1311 };
1312
/* Result of liveness analysis (produced by live_var_analysis()). */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1319
/* instruction selection: build the ACO IR from NIR shaders */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);

/* compiler passes operating on the IR */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
/* assembly and output */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR consistency checks (see DEBUG_VALIDATE / DEBUG_VALIDATE_RA flags) */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif

/* shader statistics collection (see Program::collect_statistics) */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

/* IR printing helpers */
void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs/vgprs that must be allocated to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1382
/* Static per-opcode information, indexed by aco_opcode. */
typedef struct {
   /* hardware encoding of each opcode per gfx generation */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* the single global instance of the opcode info tables */
extern const Info instr_info;
1395
1396 }
1397
1398 #endif /* ACO_IR_H */
1399