aco: Increase barrier_count to 7 to include barrier_barrier.
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
46 extern uint64_t debug_flags;
47
/* Bits tested against the global `debug_flags` bitmask to enable extra
 * validation passes and performance warnings during compilation. */
enum {
   DEBUG_VALIDATE = 0x1,     /* validate the IR after each pass */
   DEBUG_VALIDATE_RA = 0x2,  /* validate the result of register allocation */
   DEBUG_PERFWARN = 0x4,     /* emit performance warnings */
};
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats.
    * These use distinct high bits so that VOP1/VOP2/VOPC can be combined
    * with VOP3A, DPP or SDWA via bitwise-or (see the comment above and
    * asVOP3()/asSDWA()). */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B share an encoding bit; they are distinguished by opcode. */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory operations an instruction
 * interacts with; used by the scheduler/insert_waitcnt to limit reordering. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of barrier bits defined above (bits 0..6) */
   barrier_count = 7,
};
121
/* Float rounding modes; values match the hardware MODE register encoding. */
enum fp_round {
   fp_round_ne = 0,  /* round to nearest even */
   fp_round_pi = 1,  /* round toward +infinity */
   fp_round_ni = 2,  /* round toward -infinity */
   fp_round_tz = 3,  /* round toward zero */
};
128
/* Denormal handling modes; values match the hardware MODE register encoding. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
/* Per-program (or per-instruction-range) floating point mode. The low byte
 * mirrors the hardware MODE register; the extra bools record constraints the
 * optimizer must respect even though they are not encoded in MODE. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must be identical, and the current
    * mode must be at least as restrictive in every optimization constraint. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
/* Returns the given format with the SDWA bit set. Only VOP1/VOP2/VOPC can
 * carry sub-dword addressing, hence the assert. */
constexpr Format asSDWA(Format format) {
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
}
177
/* Kind of register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,         /* scalar GPR */
   vgpr,         /* vector GPR */
   linear_vgpr,  /* VGPR with linear (non-logical) live range, e.g. for WWM */
};
184
/* Register class: encodes size and type of a temporary in a single byte.
 * Encoding of RC: bits 0-4 = size (in dwords, or in bytes if bit 7 is set),
 * bit 5 = vgpr, bit 6 = linear, bit 7 = sub-dword. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgprs are always <= RC::s16; everything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs always have linear live ranges; vgprs only if the linear bit is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

private:
   RC rc;
};
236
/* transitional helper expressions: shorthand constants so code can write
 * e.g. `v1` instead of `RegClass(RegClass::v1)` */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
258
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 * Packed into 32 bits: 24-bit id + 8-bit register class.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparison is by SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;
   uint32_t reg_class : 8;
};
286
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * Stored with byte granularity: reg_b = register index * 4 + byte offset,
 * so sub-dword locations can be addressed.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   /* implicit conversion yields the dword register index, not reg_b */
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }

   uint16_t reg_b = 0;
};
304
/* helper expressions for special registers (hardware SGPR encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
316
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 *
 * Constants use the hardware inline-constant encoding where possible
 * (PhysReg 128..254); PhysReg 255 marks a literal that must be emitted
 * as an extra dword. PhysReg 128 with isUndef_ marks an undefined operand.
 */
class Operand final
{
public:
   /* default: undefined operand */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
        isLateKill_(false) {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         /* id 0 is not a real temporary: treat as undefined */
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant; maps to the hardware inline-constant register when the
    * value has one (small ints, +/-0.5/1/2/4), otherwise to literal (255) */
   explicit Operand(uint32_t v, bool is64bit = false) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      is64BitConst_ = is64bit;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         setFixed(PhysReg{255});
      }
   };
   /* 64-bit constant; only values with a hardware inline encoding are valid,
    * since 64-bit literals cannot be emitted */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64) {
         data_.i = (uint32_t) v;
         setFixed(PhysReg{128 + (uint32_t) v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         data_.i = (uint32_t) v;
         setFixed(PhysReg{192 - (uint32_t) v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         data_.i = 0x3f000000;
         setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         data_.i = 0xbf000000;
         setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         data_.i = 0x3f800000;
         setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         data_.i = 0xbf800000;
         setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         data_.i = 0x40000000;
         setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         data_.i = 0xc0000000;
         setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         data_.i = 0x40800000;
         setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         data_.i = 0xc0800000;
         setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* undefined operand of a given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* fixed physical register (no temporary), e.g. exec or vcc */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
      else
         return data_.temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant that has no inline encoding and must be emitted as a dword */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Reconstructs the 64-bit value of a constant. For 64-bit constants the
    * value is decoded from the inline-constant register; for 32-bit constants
    * the stored dword is optionally sign-extended. */
   constexpr uint64_t constantValue64(bool signext=false) const noexcept
   {
      if (is64BitConst_) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240:
            return 0x3FE0000000000000;
         case 241:
            return 0xBFE0000000000000;
         case 242:
            return 0x3FF0000000000000;
         case 243:
            return 0xBFF0000000000000;
         case 244:
            return 0x4000000000000000;
         case 245:
            return 0xC000000000000000;
         case 246:
            return 0x4010000000000000;
         case 247:
            return 0xC010000000000000;
         }
      }
      return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept
   {
      isLateKill_ = flag;
   }

   constexpr bool isLateKill() const noexcept
   {
      return isLateKill_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

   constexpr bool isKillBeforeDef() const noexcept
   {
      return isKill() && !isLateKill();
   }

   constexpr bool isFirstKillBeforeDef() const noexcept
   {
      return isFirstKill() && !isLateKill();
   }

   constexpr bool operator == (Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }
private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
         uint8_t isLateKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
630
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* fixed-register definition without a temporary (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 marks "no temporary" (see the fixed-register constructors) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* a hint shares reg_ with the fixed register: setting one overwrites
    * the other's register value */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
740
741 class Block;
742
/* Base class of all IR instructions. Concrete encodings derive from this and
 * add format-specific fields; operands/definitions are spans into trailing
 * storage owned by the allocation. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch bits whose meaning is pass-specific */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* true for any VALU encoding, including VOP1/2/C promoted to VOP3 and
    * VOP3P (the VOP1/VOP2/VOPC/VOP3 format bits are combinable flags) */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory only; FLAT/GLOBAL/SCRATCH are tested separately */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16);
810
/* Scalar ALU with a 16-bit immediate (SOPK encoding). */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4);
816
/* Scalar program control (SOPP encoding); `block` holds a branch-target
 * block index where applicable. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8);
822
/* Scalar compare (SOPC encoding); no extra fields. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0);
826
/* Scalar one-operand ALU (SOP1 encoding); no extra fields. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0);
830
/* Scalar two-operand ALU (SOP2 encoding); no extra fields. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0);
834
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   barrier_interaction barrier;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;
   bool disable_wqm : 1;
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4);
856
/* Vector one-operand ALU (VOP1 encoding); no extra fields. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0);
860
/* Vector two-operand ALU (VOP2 encoding); no extra fields. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0);
864
/* Vector compare (VOPC encoding); no extra fields. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0);
868
/* Three-address VALU encoding with source/destination modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];      /* per-source absolute-value modifier */
   bool neg[3];      /* per-source negate modifier */
   uint8_t opsel : 4;
   uint8_t omod : 2; /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8);
878
/* Packed-math VALU encoding (VOP3P) with per-half modifiers. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8);
888
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;        /* hardware DPP control word (swizzle selection) */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8);
905
/* Sub-dword addressing selectors. The low bits select the byte/word, the
 * flag bits describe width, signedness and register-allocated selection;
 * combine masks and flags as documented below. */
enum sdwa_sel : uint8_t {
   /* masks */
   sdwa_wordnum = 0x1,
   sdwa_bytenum = 0x3,
   sdwa_asuint = 0x7 | 0x10,
   sdwa_rasize = 0x3,

   /* flags */
   sdwa_isword = 0x4,
   sdwa_sext = 0x8,
   sdwa_isra = 0x10,

   /* specific values */
   sdwa_ubyte0 = 0,
   sdwa_ubyte1 = 1,
   sdwa_ubyte2 = 2,
   sdwa_ubyte3 = 3,
   sdwa_uword0 = sdwa_isword | 0,
   sdwa_uword1 = sdwa_isword | 1,
   sdwa_udword = 6,

   sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
   sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
   sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
   sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
   sdwa_sword0 = sdwa_uword0 | sdwa_sext,
   sdwa_sword1 = sdwa_uword1 | sdwa_sext,
   sdwa_sdword = sdwa_udword | sdwa_sext,

   /* register-allocated */
   sdwa_ubyte = 1 | sdwa_isra,
   sdwa_uword = 2 | sdwa_isra,
   sdwa_sbyte = sdwa_ubyte | sdwa_sext,
   sdwa_sword = sdwa_uword | sdwa_sext,
};
941
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];   /* per-source sdwa_sel */
   uint8_t dst_sel;
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8);
963
/* Vector parameter interpolation (VINTRP encoding). */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4);
970
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds;     /* use GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4);
986
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 2;
   barrier_interaction barrier;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4);
1011
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit */
   barrier_interaction barrier;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint32_t padding : 25;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8);
1036
/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 *             or VDATA - Vector GPR for write data.
 * Operand(2): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* Force data to be un-normalized */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4);
1065
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4);
1086
/* Export instruction (EXP encoding): writes lane data to an export target. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* per-component write-enable mask */
   uint8_t dest; /* export target — presumably MRT/position/parameter index, TODO confirm */
   bool compressed : 1; /* data is 16-bit compressed (two components per VGPR) — TODO confirm */
   bool done : 1; /* last export of this type from the wave */
   bool valid_mask : 1;
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4);
1096
/* Generic pseudo instruction: no hardware encoding; presumably lowered by
 * lower_to_hw_instr() before assembly — TODO confirm. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* whether lowering needs/clobbers SCC as a temporary — TODO confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4);
1103
/* Pseudo branch: records CFG targets by block index until lowered to real jumps. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8);
1112
/* Barrier pseudo instruction. Carries no extra fields; the specific barrier
 * semantics are presumably implied by the opcode — TODO confirm. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0);
1116
/* Subgroup reduction operations. Each operation comes in a 32-bit and a
 * 64-bit variant (prefix: i = signed int, u = unsigned int, f = float). */
enum ReduceOp : uint16_t {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: presumably reuses this machinery for wave64 bpermute on GFX10 — TODO confirm */
};
1133
1134 /**
1135 * Subgroup Reduction Instructions, everything except for the data to be
1136 * reduced and the result as inserted by setup_reduce_temp().
1137 * Operand(0): data to be reduced
1138 * Operand(1): reduce temporary
1139 * Operand(2): vector temporary
1140 * Definition(0): result
1141 * Definition(1): scalar temporary
1142 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1143 * Definition(3): scc clobber
1144 * Definition(4): vcc clobber
1145 *
1146 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op; /* which reduction to perform (see ReduceOp) */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4);
1152
/* Deleter for instructions: create_instruction() below obtains storage with
 * calloc(), so the owning smart pointer must release it with free() instead
 * of delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) {
      free(ptr);
   }
};

/* Owning pointer for calloc-allocated instruction objects. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1161
/* Allocates and zero-initializes an instruction of type T together with its
 * operand and definition arrays in one contiguous calloc'd buffer:
 *   [ T | num_operands * Operand | num_definitions * Definition ]
 * The result must be freed with free() (see instr_deleter_functor / aco_ptr),
 * so T must not require a destructor call. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* zero-init: all flags/padding start cleared */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* The spans store 16-bit offsets relative to the span members themselves
    * (presumably the aco::span convention — see aco_util.h). Operands start
    * right after T; definitions start right after the operands. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1179
1180 constexpr bool Instruction::usesModifiers() const noexcept
1181 {
1182 if (isDPP() || isSDWA())
1183 return true;
1184
1185 if (format == Format::VOP3P) {
1186 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1187 for (unsigned i = 0; i < operands.size(); i++) {
1188 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1189 return true;
1190 }
1191 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1192 } else if (isVOP3()) {
1193 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1194 for (unsigned i = 0; i < operands.size(); i++) {
1195 if (vop3->abs[i] || vop3->neg[i])
1196 return true;
1197 }
1198 return vop3->opsel || vop3->clamp || vop3->omod;
1199 }
1200 return false;
1201 }
1202
1203 constexpr bool is_phi(Instruction* instr)
1204 {
1205 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1206 }
1207
1208 static inline bool is_phi(aco_ptr<Instruction>& instr)
1209 {
1210 return is_phi(instr.get());
1211 }
1212
1213 barrier_interaction get_barrier_interaction(Instruction* instr);
1214
1215 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1216
/* Flags describing a basic block's role in the CFG.
 * Stored as a bitmask in Block::kind (uint16_t — all 16 bits are used). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1237
1238
1239 struct RegisterDemand {
1240 constexpr RegisterDemand() = default;
1241 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1242 : vgpr{v}, sgpr{s} {}
1243 int16_t vgpr = 0;
1244 int16_t sgpr = 0;
1245
1246 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1247 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1248 }
1249
1250 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1251 return vgpr > other.vgpr || sgpr > other.sgpr;
1252 }
1253
1254 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1255 if (t.type() == RegType::sgpr)
1256 return RegisterDemand( vgpr, sgpr + t.size() );
1257 else
1258 return RegisterDemand( vgpr + t.size(), sgpr );
1259 }
1260
1261 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1262 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1263 }
1264
1265 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1266 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1267 }
1268
1269 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1270 vgpr += other.vgpr;
1271 sgpr += other.sgpr;
1272 return *this;
1273 }
1274
1275 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1276 vgpr -= other.vgpr;
1277 sgpr -= other.sgpr;
1278 return *this;
1279 }
1280
1281 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1282 if (t.type() == RegType::sgpr)
1283 sgpr += t.size();
1284 else
1285 vgpr += t.size();
1286 return *this;
1287 }
1288
1289 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1290 if (t.type() == RegType::sgpr)
1291 sgpr -= t.size();
1292 else
1293 vgpr -= t.size();
1294 return *this;
1295 }
1296
1297 constexpr void update(const RegisterDemand other) noexcept {
1298 vgpr = std::max(vgpr, other.vgpr);
1299 sgpr = std::max(sgpr, other.sgpr);
1300 }
1301
1302 };
1303
1304 /* CFG */
struct Block {
   float_mode fp_mode; /* copied from Program::next_fp_mode when the block is created */
   unsigned index; /* position of this block in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* Two CFGs are maintained: presumably "logical" follows source-level
    * control flow while "linear" includes lowering-introduced blocks
    * (e.g. invert/merge blocks) — TODO confirm. */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* presumably the peak demand within this block — TODO confirm */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator (logical CFG); -1 if not computed */
   int linear_idom = -1; /* immediate dominator (linear CFG); -1 if not computed */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1329
/* A Stage is a bitmask combining exactly one hardware stage with the software
 * stage(s) that run on it; merged shaders OR several sw bits together (see
 * the combinations below). */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f; /* covers all 7 software-stage bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7; /* covers all 8 hardware-stage bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1374
/* Indices into Program::statistics; only filled in when
 * Program::collect_statistics is set (see collect_*_stats below). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1389
/* The top-level compilation unit: CFG, target description and compile state. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to blocks created after this is set */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask; /* presumably the register class holding one bit per lane (depends on wave_size) — TODO confirm */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* enables the statistics[] counters below */
   uint32_t statistics[num_statistics]; /* indexed by enum statistic */

   /* Returns a fresh SSA id. Ids are limited to 24 bits. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215); /* 2^24 - 1 */
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would hand out, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block (index = current block count) and returns it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Appends an existing block, re-assigning its index and fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* 0 is never handed out (reserved for "no id"? — TODO confirm) */
};
1463
1464 struct TempHash {
1465 std::size_t operator()(Temp t) const {
1466 return t.id();
1467 }
1468 };
1469 using TempSet = std::unordered_set<Temp, TempHash>;
1470
/* Result of liveness analysis (see live_var_analysis below). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1477
1478 void select_program(Program *program,
1479 unsigned shader_count,
1480 struct nir_shader *const *shaders,
1481 ac_shader_config* config,
1482 struct radv_shader_args *args);
1483 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1484 ac_shader_config* config,
1485 struct radv_shader_args *args);
1486
1487 void lower_wqm(Program* program, live& live_vars,
1488 const struct radv_nir_compiler_options *options);
1489 void lower_phis(Program* program);
1490 void calc_min_waves(Program* program);
1491 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1492 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1493 std::vector<uint16_t> dead_code_analysis(Program *program);
1494 void dominator_tree(Program* program);
1495 void insert_exec_mask(Program *program);
1496 void value_numbering(Program* program);
1497 void optimize(Program* program);
1498 void setup_reduce_temp(Program* program);
1499 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1500 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1501 void ssa_elimination(Program* program);
1502 void lower_to_hw_instr(Program* program);
1503 void schedule_program(Program* program, live& live_vars);
1504 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1505 void insert_wait_states(Program* program);
1506 void insert_NOPs(Program* program);
1507 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1508 void print_asm(Program *program, std::vector<uint32_t>& binary,
1509 unsigned exec_size, std::ostream& out);
1510 void validate(Program* program, FILE *output);
1511 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1512 #ifndef NDEBUG
1513 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1514 #else
1515 #define perfwarn(program, cond, msg, ...) do {} while(0)
1516 #endif
1517
1518 void collect_presched_stats(Program *program);
1519 void collect_preasm_stats(Program *program);
1520 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1521
1522 void aco_print_instr(Instruction *instr, FILE *output);
1523 void aco_print_program(Program *program, FILE *output);
1524
1525 /* utilities for dealing with register demand */
1526 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1527 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1528 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1529
1530 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1531 uint16_t get_extra_sgprs(Program *program);
1532
1533 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1534 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1535 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1536
1537 /* return number of addressable sgprs/vgprs for max_waves */
1538 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1539 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1540
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware encoding per chip generation; presumably a negative value marks
    * an opcode unsupported on that generation — TODO confirm */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
1551
1552 extern const Info instr_info;
1553
1554 }
1555
1556 #endif /* ACO_IR_H */
1557