src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <unordered_set>
  31 #include <bitset>
  32 #include <memory>
  33
  34 #include "nir.h"
  35 #include "ac_binary.h"
  36 #include "amd_family.h"
  37 #include "aco_opcodes.h"
  38 #include "aco_util.h"
  39
  40 struct radv_nir_compiler_options;
  41 struct radv_shader_args;
  42 struct radv_shader_info;
  43
  44 namespace aco {
  45
  46 extern uint64_t debug_flags;
  47
  48 enum {
  49    DEBUG_VALIDATE = 0x1,
  50    DEBUG_VALIDATE_RA = 0x2,
  51    DEBUG_PERFWARN = 0x4,
  52 };
  53
  54 /**
  55  * Representation of the instruction's microcode encoding format
  56  * Note: Some Vector ALU Formats can be combined, such that:
  57  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  58  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  59  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  60  *
  61  * (*) The same is applicable for VOP1 and VOPC instructions.
  62  */
  63 enum class Format : std::uint16_t {
  64    /* Pseudo Instruction Format */
  65    PSEUDO = 0,
  66    /* Scalar ALU & Control Formats */
  67    SOP1 = 1,
  68    SOP2 = 2,
  69    SOPK = 3,
  70    SOPP = 4,
  71    SOPC = 5,
  72    /* Scalar Memory Format */
  73    SMEM = 6,
  74    /* LDS/GDS Format */
  75    DS = 8,
  76    /* Vector Memory Buffer Formats */
  77    MTBUF = 9,
  78    MUBUF = 10,
  79    /* Vector Memory Image Format */
  80    MIMG = 11,
  81    /* Export Format */
  82    EXP = 12,
  83    /* Flat Formats */
  84    FLAT = 13,
  85    GLOBAL = 14,
  86    SCRATCH = 15,
  87
  88    PSEUDO_BRANCH = 16,
  89    PSEUDO_BARRIER = 17,
  90    PSEUDO_REDUCTION = 18,
  91
  92    /* Vector ALU Formats */
  93    VOP3P = 19,
  94    VOP1 = 1 << 8,
  95    VOP2 = 1 << 9,
  96    VOPC = 1 << 10,
  97    VOP3 = 1 << 11,
  98    VOP3A = 1 << 11,
  99    VOP3B = 1 << 11,
 100    /* Vector Parameter Interpolation Format */
 101    VINTRP = 1 << 12,
 102    DPP = 1 << 13,
 103    SDWA = 1 << 14,
 104 };
 105
 106 enum barrier_interaction : uint8_t {
 107    barrier_none = 0,
 108    barrier_buffer = 0x1,
 109    barrier_image = 0x2,
 110    barrier_atomic = 0x4,
 111    barrier_shared = 0x8,
 112    /* used for geometry shaders to ensure vertex data writes are before the
 113     * GS_DONE s_sendmsg. */
 114    barrier_gs_data = 0x10,
 115    /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
 116    barrier_gs_sendmsg = 0x20,
 117    /* used by barriers. created by s_barrier */
 118    barrier_barrier = 0x40,
 119    barrier_count = 7,
 120 };
 121
 122 enum fp_round {
 123    fp_round_ne = 0,
 124    fp_round_pi = 1,
 125    fp_round_ni = 2,
 126    fp_round_tz = 3,
 127 };
 128
 129 enum fp_denorm {
 130    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
 131     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
 132    fp_denorm_flush = 0x0,
 133    fp_denorm_keep = 0x3,
 134 };
 135
 136 struct float_mode {
 137    /* matches encoding of the MODE register */
 138    union {
 139       struct {
 140           fp_round round32:2;
 141           fp_round round16_64:2;
 142           unsigned denorm32:2;
 143           unsigned denorm16_64:2;
 144       };
 145       uint8_t val = 0;
 146    };
 147    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
 148    bool preserve_signed_zero_inf_nan32:1;
 149    bool preserve_signed_zero_inf_nan16_64:1;
 150    /* if false, optimizations which may remove denormal flushing can be done */
 151    bool must_flush_denorms32:1;
 152    bool must_flush_denorms16_64:1;
 153    bool care_about_round32:1;
 154    bool care_about_round16_64:1;
 155
 156    /* Returns true if instructions using the mode "other" can safely use the
 157     * current one instead. */
 158    bool canReplace(float_mode other) const noexcept {
 159       return val == other.val &&
 160              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
 161              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
 162              (must_flush_denorms32  || !other.must_flush_denorms32) &&
 163              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
 164              (care_about_round32 || !other.care_about_round32) &&
 165              (care_about_round16_64 || !other.care_about_round16_64);
 166    }
 167 };
 168
 169 constexpr Format asVOP3(Format format) {
 170    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 171 };
 172
 173 constexpr Format asSDWA(Format format) {
 174    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
 175    return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
 176 }
 177
 178 enum class RegType {
 179    none = 0,
 180    sgpr,
 181    vgpr,
 182    linear_vgpr,
 183 };
 184
 185 struct RegClass {
 186
 187    enum RC : uint8_t {
 188       s1 = 1,
 189       s2 = 2,
 190       s3 = 3,
 191       s4 = 4,
 192       s6 = 6,
 193       s8 = 8,
 194       s16 = 16,
 195       v1 = s1 | (1 << 5),
 196       v2 = s2 | (1 << 5),
 197       v3 = s3 | (1 << 5),
 198       v4 = s4 | (1 << 5),
 199       v5 = 5  | (1 << 5),
 200       v6 = 6  | (1 << 5),
 201       v7 = 7  | (1 << 5),
 202       v8 = 8  | (1 << 5),
 203       /* byte-sized register class */
 204       v1b = v1 | (1 << 7),
 205       v2b = v2 | (1 << 7),
 206       v3b = v3 | (1 << 7),
 207       v4b = v4 | (1 << 7),
 208       v6b = v6 | (1 << 7),
 209       v8b = v8 | (1 << 7),
 210       /* these are used for WWM and spills to vgpr */
 211       v1_linear = v1 | (1 << 6),
 212       v2_linear = v2 | (1 << 6),
 213    };
 214
 215    RegClass() = default;
 216    constexpr RegClass(RC rc)
 217       : rc(rc) {}
 218    constexpr RegClass(RegType type, unsigned size)
 219       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 220
 221    constexpr operator RC() const { return rc; }
 222    explicit operator bool() = delete;
 223
 224    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 225    constexpr bool is_subdword() const { return rc & (1 << 7); }
 226    constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
 227    //TODO: use size() less in favor of bytes()
 228    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
 229    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 230    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 231    constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
 232
 233    static constexpr RegClass get(RegType type, unsigned bytes) {
 234       if (type == RegType::sgpr) {
 235          return RegClass(type, DIV_ROUND_UP(bytes, 4u));
 236       } else {
 237          return bytes % 4u ? RegClass(type, bytes).as_subdword() :
 238                              RegClass(type, bytes / 4u);
 239       }
 240    }
 241
 242 private:
 243    RC rc;
 244 };
 245
 246 /* transitional helper expressions */
 247 static constexpr RegClass s1{RegClass::s1};
 248 static constexpr RegClass s2{RegClass::s2};
 249 static constexpr RegClass s3{RegClass::s3};
 250 static constexpr RegClass s4{RegClass::s4};
 251 static constexpr RegClass s8{RegClass::s8};
 252 static constexpr RegClass s16{RegClass::s16};
 253 static constexpr RegClass v1{RegClass::v1};
 254 static constexpr RegClass v2{RegClass::v2};
 255 static constexpr RegClass v3{RegClass::v3};
 256 static constexpr RegClass v4{RegClass::v4};
 257 static constexpr RegClass v5{RegClass::v5};
 258 static constexpr RegClass v6{RegClass::v6};
 259 static constexpr RegClass v7{RegClass::v7};
 260 static constexpr RegClass v8{RegClass::v8};
 261 static constexpr RegClass v1b{RegClass::v1b};
 262 static constexpr RegClass v2b{RegClass::v2b};
 263 static constexpr RegClass v3b{RegClass::v3b};
 264 static constexpr RegClass v4b{RegClass::v4b};
 265 static constexpr RegClass v6b{RegClass::v6b};
 266 static constexpr RegClass v8b{RegClass::v8b};
 267
 268 /**
 269  * Temp Class
 270  * Each temporary virtual register has a
 271  * register class (i.e. size and type)
 272  * and SSA id.
 273  */
 274 struct Temp {
 275    Temp() noexcept : id_(0), reg_class(0) {}
 276    constexpr Temp(uint32_t id, RegClass cls) noexcept
 277       : id_(id), reg_class(uint8_t(cls)) {}
 278
 279    constexpr uint32_t id() const noexcept { return id_; }
 280    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
 281
 282    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
 283    constexpr unsigned size() const noexcept { return regClass().size(); }
 284    constexpr RegType type() const noexcept { return regClass().type(); }
 285    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
 286
 287    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 288    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 289    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 290
 291 private:
 292    uint32_t id_: 24;
 293    uint32_t reg_class : 8;
 294 };
 295
 296 /**
 297  * PhysReg
 298  * Represents the physical register for each
 299  * Operand and Definition.
 300  */
 301 struct PhysReg {
 302    constexpr PhysReg() = default;
 303    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
 304    constexpr unsigned reg() const { return reg_b >> 2; }
 305    constexpr unsigned byte() const { return reg_b & 0x3; }
 306    constexpr operator unsigned() const { return reg(); }
 307    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
 308    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
 309    constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
 310    constexpr PhysReg advance(unsigned bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }
 311
 312    uint16_t reg_b = 0;
 313 };
 314
 315 /* helper expressions for special registers */
 316 static constexpr PhysReg m0{124};
 317 static constexpr PhysReg vcc{106};
 318 static constexpr PhysReg vcc_hi{107};
 319 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 320 static constexpr PhysReg exec{126};
 321 static constexpr PhysReg exec_lo{126};
 322 static constexpr PhysReg exec_hi{127};
 323 static constexpr PhysReg vccz{251};
 324 static constexpr PhysReg execz{252};
 325 static constexpr PhysReg scc{253};
 326
 327 /**
 328  * Operand Class
 329  * Initially, each Operand refers to either
 330  * a temporary virtual register
 331  * or to a constant value
 332  * Temporary registers get mapped to physical register during RA
 333  * Constant values are inlined into the instruction sequence.
 334  */
 335 class Operand final
 336 {
 337 public:
 338    constexpr Operand()
 339       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 340         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false),
 341         isLateKill_(false) {}
 342
 343    explicit Operand(Temp r) noexcept
 344    {
 345       data_.temp = r;
 346       if (r.id()) {
 347          isTemp_ = true;
 348       } else {
 349          isUndef_ = true;
 350          setFixed(PhysReg{128});
 351       }
 352    };
 353    explicit Operand(uint32_t v, bool is64bit = false) noexcept
 354    {
 355       data_.i = v;
 356       isConstant_ = true;
 357       is64BitConst_ = is64bit;
 358       if (v <= 64)
 359          setFixed(PhysReg{128 + v});
 360       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 361          setFixed(PhysReg{192 - v});
 362       else if (v == 0x3f000000) /* 0.5 */
 363          setFixed(PhysReg{240});
 364       else if (v == 0xbf000000) /* -0.5 */
 365          setFixed(PhysReg{241});
 366       else if (v == 0x3f800000) /* 1.0 */
 367          setFixed(PhysReg{242});
 368       else if (v == 0xbf800000) /* -1.0 */
 369          setFixed(PhysReg{243});
 370       else if (v == 0x40000000) /* 2.0 */
 371          setFixed(PhysReg{244});
 372       else if (v == 0xc0000000) /* -2.0 */
 373          setFixed(PhysReg{245});
 374       else if (v == 0x40800000) /* 4.0 */
 375          setFixed(PhysReg{246});
 376       else if (v == 0xc0800000) /* -4.0 */
 377          setFixed(PhysReg{247});
 378       else { /* Literal Constant */
 379          assert(!is64bit && "attempt to create a 64-bit literal constant");
 380          setFixed(PhysReg{255});
 381       }
 382    };
 383    explicit Operand(uint64_t v) noexcept
 384    {
 385       isConstant_ = true;
 386       is64BitConst_ = true;
 387       if (v <= 64) {
 388          data_.i = (uint32_t) v;
 389          setFixed(PhysReg{128 + (uint32_t) v});
 390       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
 391          data_.i = (uint32_t) v;
 392          setFixed(PhysReg{192 - (uint32_t) v});
 393       } else if (v == 0x3FE0000000000000) { /* 0.5 */
 394          data_.i = 0x3f000000;
 395          setFixed(PhysReg{240});
 396       } else if (v == 0xBFE0000000000000) { /* -0.5 */
 397          data_.i = 0xbf000000;
 398          setFixed(PhysReg{241});
 399       } else if (v == 0x3FF0000000000000) { /* 1.0 */
 400          data_.i = 0x3f800000;
 401          setFixed(PhysReg{242});
 402       } else if (v == 0xBFF0000000000000) { /* -1.0 */
 403          data_.i = 0xbf800000;
 404          setFixed(PhysReg{243});
 405       } else if (v == 0x4000000000000000) { /* 2.0 */
 406          data_.i = 0x40000000;
 407          setFixed(PhysReg{244});
 408       } else if (v == 0xC000000000000000) { /* -2.0 */
 409          data_.i = 0xc0000000;
 410          setFixed(PhysReg{245});
 411       } else if (v == 0x4010000000000000) { /* 4.0 */
 412          data_.i = 0x40800000;
 413          setFixed(PhysReg{246});
 414       } else if (v == 0xC010000000000000) { /* -4.0 */
 415          data_.i = 0xc0800000;
 416          setFixed(PhysReg{247});
 417       } else { /* Literal Constant: we don't know if it is a long or double.*/
 418          isConstant_ = 0;
 419          assert(false && "attempt to create a 64-bit literal constant");
 420       }
 421    };
 422    explicit Operand(RegClass type) noexcept
 423    {
 424       isUndef_ = true;
 425       data_.temp = Temp(0, type);
 426       setFixed(PhysReg{128});
 427    };
 428    explicit Operand(PhysReg reg, RegClass type) noexcept
 429    {
 430       data_.temp = Temp(0, type);
 431       setFixed(reg);
 432    }
 433
 434    constexpr bool isTemp() const noexcept
 435    {
 436       return isTemp_;
 437    }
 438
 439    constexpr void setTemp(Temp t) noexcept {
 440       assert(!isConstant_);
 441       isTemp_ = true;
 442       data_.temp = t;
 443    }
 444
 445    constexpr Temp getTemp() const noexcept
 446    {
 447       return data_.temp;
 448    }
 449
 450    constexpr uint32_t tempId() const noexcept
 451    {
 452       return data_.temp.id();
 453    }
 454
 455    constexpr bool hasRegClass() const noexcept
 456    {
 457       return isTemp() || isUndefined();
 458    }
 459
 460    constexpr RegClass regClass() const noexcept
 461    {
 462       return data_.temp.regClass();
 463    }
 464
 465    constexpr unsigned bytes() const noexcept
 466    {
 467       if (isConstant())
 468          return is64BitConst_ ? 8 : 4; //TODO: sub-dword constants
 469       else
 470          return data_.temp.bytes();
 471    }
 472
 473    constexpr unsigned size() const noexcept
 474    {
 475       if (isConstant())
 476          return is64BitConst_ ? 2 : 1;
 477       else
 478          return data_.temp.size();
 479    }
 480
 481    constexpr bool isFixed() const noexcept
 482    {
 483       return isFixed_;
 484    }
 485
 486    constexpr PhysReg physReg() const noexcept
 487    {
 488       return reg_;
 489    }
 490
 491    constexpr void setFixed(PhysReg reg) noexcept
 492    {
 493       isFixed_ = reg != unsigned(-1);
 494       reg_ = reg;
 495    }
 496
 497    constexpr bool isConstant() const noexcept
 498    {
 499       return isConstant_;
 500    }
 501
 502    constexpr bool isLiteral() const noexcept
 503    {
 504       return isConstant() && reg_ == 255;
 505    }
 506
 507    constexpr bool isUndefined() const noexcept
 508    {
 509       return isUndef_;
 510    }
 511
 512    constexpr uint32_t constantValue() const noexcept
 513    {
 514       return data_.i;
 515    }
 516
 517    constexpr bool constantEquals(uint32_t cmp) const noexcept
 518    {
 519       return isConstant() && constantValue() == cmp;
 520    }
 521
 522    constexpr uint64_t constantValue64(bool signext=false) const noexcept
 523    {
 524       if (is64BitConst_) {
 525          if (reg_ <= 192)
 526             return reg_ - 128;
 527          else if (reg_ <= 208)
 528             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
 529
 530          switch (reg_) {
 531          case 240:
 532             return 0x3FE0000000000000;
 533          case 241:
 534             return 0xBFE0000000000000;
 535          case 242:
 536             return 0x3FF0000000000000;
 537          case 243:
 538             return 0xBFF0000000000000;
 539          case 244:
 540             return 0x4000000000000000;
 541          case 245:
 542             return 0xC000000000000000;
 543          case 246:
 544             return 0x4010000000000000;
 545          case 247:
 546             return 0xC010000000000000;
 547          }
 548       }
 549       return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
 550    }
 551
 552    constexpr bool isOfType(RegType type) const noexcept
 553    {
 554       return hasRegClass() && regClass().type() == type;
 555    }
 556
 557    /* Indicates that the killed operand's live range intersects with the
 558     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
 559     * not set by liveness analysis. */
 560    constexpr void setLateKill(bool flag) noexcept
 561    {
 562       isLateKill_ = flag;
 563    }
 564
 565    constexpr bool isLateKill() const noexcept
 566    {
 567       return isLateKill_;
 568    }
 569
 570    constexpr void setKill(bool flag) noexcept
 571    {
 572       isKill_ = flag;
 573       if (!flag)
 574          setFirstKill(false);
 575    }
 576
 577    constexpr bool isKill() const noexcept
 578    {
 579       return isKill_ || isFirstKill();
 580    }
 581
 582    constexpr void setFirstKill(bool flag) noexcept
 583    {
 584       isFirstKill_ = flag;
 585       if (flag)
 586          setKill(flag);
 587    }
 588
 589    /* When there are multiple operands killing the same temporary,
 590     * isFirstKill() is only returns true for the first one. */
 591    constexpr bool isFirstKill() const noexcept
 592    {
 593       return isFirstKill_;
 594    }
 595
 596    constexpr bool isKillBeforeDef() const noexcept
 597    {
 598       return isKill() && !isLateKill();
 599    }
 600
 601    constexpr bool isFirstKillBeforeDef() const noexcept
 602    {
 603       return isFirstKill() && !isLateKill();
 604    }
 605
 606    constexpr bool operator == (Operand other) const noexcept
 607    {
 608       if (other.size() != size())
 609          return false;
 610       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
 611          return false;
 612       if (isFixed() && other.isFixed() && physReg() != other.physReg())
 613          return false;
 614       if (isLiteral())
 615          return other.isLiteral() && other.constantValue() == constantValue();
 616       else if (isConstant())
 617          return other.isConstant() && other.physReg() == physReg();
 618       else if (isUndefined())
 619          return other.isUndefined() && other.regClass() == regClass();
 620       else
 621          return other.isTemp() && other.getTemp() == getTemp();
 622    }
 623 private:
 624    union {
 625       uint32_t i;
 626       float f;
 627       Temp temp = Temp(0, s1);
 628    } data_;
 629    PhysReg reg_;
 630    union {
 631       struct {
 632          uint8_t isTemp_:1;
 633          uint8_t isFixed_:1;
 634          uint8_t isConstant_:1;
 635          uint8_t isKill_:1;
 636          uint8_t isUndef_:1;
 637          uint8_t isFirstKill_:1;
 638          uint8_t is64BitConst_:1;
 639          uint8_t isLateKill_:1;
 640       };
 641       /* can't initialize bit-fields in c++11, so work around using a union */
 642       uint8_t control_ = 0;
 643    };
 644 };
 645
 646 /**
 647  * Definition Class
 648  * Definitions are the results of Instructions
 649  * and refer to temporary virtual registers
 650  * which are later mapped to physical registers
 651  */
 652 class Definition final
 653 {
 654 public:
 655    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0) {}
 656    Definition(uint32_t index, RegClass type) noexcept
 657       : temp(index, type) {}
 658    explicit Definition(Temp tmp) noexcept
 659       : temp(tmp) {}
 660    Definition(PhysReg reg, RegClass type) noexcept
 661       : temp(Temp(0, type))
 662    {
 663       setFixed(reg);
 664    }
 665    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 666       : temp(Temp(tmpId, type))
 667    {
 668       setFixed(reg);
 669    }
 670
 671    constexpr bool isTemp() const noexcept
 672    {
 673       return tempId() > 0;
 674    }
 675
 676    constexpr Temp getTemp() const noexcept
 677    {
 678       return temp;
 679    }
 680
 681    constexpr uint32_t tempId() const noexcept
 682    {
 683       return temp.id();
 684    }
 685
 686    constexpr void setTemp(Temp t) noexcept {
 687       temp = t;
 688    }
 689
 690    constexpr RegClass regClass() const noexcept
 691    {
 692       return temp.regClass();
 693    }
 694
 695    constexpr unsigned bytes() const noexcept
 696    {
 697       return temp.bytes();
 698    }
 699
 700    constexpr unsigned size() const noexcept
 701    {
 702       return temp.size();
 703    }
 704
 705    constexpr bool isFixed() const noexcept
 706    {
 707       return isFixed_;
 708    }
 709
 710    constexpr PhysReg physReg() const noexcept
 711    {
 712       return reg_;
 713    }
 714
 715    constexpr void setFixed(PhysReg reg) noexcept
 716    {
 717       isFixed_ = 1;
 718       reg_ = reg;
 719    }
 720
 721    constexpr void setHint(PhysReg reg) noexcept
 722    {
 723       hasHint_ = 1;
 724       reg_ = reg;
 725    }
 726
 727    constexpr bool hasHint() const noexcept
 728    {
 729       return hasHint_;
 730    }
 731
 732    constexpr void setKill(bool flag) noexcept
 733    {
 734       isKill_ = flag;
 735    }
 736
 737    constexpr bool isKill() const noexcept
 738    {
 739       return isKill_;
 740    }
 741
 742    constexpr void setPrecise(bool precise) noexcept
 743    {
 744       isPrecise_ = precise;
 745    }
 746
 747    constexpr bool isPrecise() const noexcept
 748    {
 749       return isPrecise_;
 750    }
 751
 752 private:
 753    Temp temp = Temp(0, s1);
 754    PhysReg reg_;
 755    union {
 756       struct {
 757          uint8_t isFixed_:1;
 758          uint8_t hasHint_:1;
 759          uint8_t isKill_:1;
 760          uint8_t isPrecise_:1;
 761       };
 762       /* can't initialize bit-fields in c++11, so work around using a union */
 763       uint8_t control_ = 0;
 764    };
 765 };
 766
 767 struct Block;
 768
 769 struct Instruction {
 770    aco_opcode opcode;
 771    Format format;
 772    uint32_t pass_flags;
 773
 774    aco::span<Operand> operands;
 775    aco::span<Definition> definitions;
 776
 777    constexpr bool isVALU() const noexcept
 778    {
 779       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 780           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 781           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 782           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 783           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 784           || format == Format::VOP3P;
 785    }
 786
 787    constexpr bool isSALU() const noexcept
 788    {
 789       return format == Format::SOP1 ||
 790              format == Format::SOP2 ||
 791              format == Format::SOPC ||
 792              format == Format::SOPK ||
 793              format == Format::SOPP;
 794    }
 795
 796    constexpr bool isVMEM() const noexcept
 797    {
 798       return format == Format::MTBUF ||
 799              format == Format::MUBUF ||
 800              format == Format::MIMG;
 801    }
 802
 803    constexpr bool isDPP() const noexcept
 804    {
 805       return (uint16_t) format & (uint16_t) Format::DPP;
 806    }
 807
 808    constexpr bool isVOP3() const noexcept
 809    {
 810       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 811              ((uint16_t) format & (uint16_t) Format::VOP3B);
 812    }
 813
 814    constexpr bool isSDWA() const noexcept
 815    {
 816       return (uint16_t) format & (uint16_t) Format::SDWA;
 817    }
 818
 819    constexpr bool isFlatOrGlobal() const noexcept
 820    {
 821       return format == Format::FLAT || format == Format::GLOBAL;
 822    }
 823
 824    constexpr bool usesModifiers() const noexcept;
 825
 826    constexpr bool reads_exec() const noexcept
 827    {
 828       for (const Operand& op : operands) {
 829          if (op.isFixed() && op.physReg() == exec)
 830             return true;
 831       }
 832       return false;
 833    }
 834 };
 835 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
 836
 837 struct SOPK_instruction : public Instruction {
 838    uint16_t imm;
 839    uint16_t padding;
 840 };
 841 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 842
 843 struct SOPP_instruction : public Instruction {
 844    uint32_t imm;
 845    int block;
 846 };
 847 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 848
 849 struct SOPC_instruction : public Instruction {
 850 };
 851 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 852
 853 struct SOP1_instruction : public Instruction {
 854 };
 855 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 856
 857 struct SOP2_instruction : public Instruction {
 858 };
 859 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 860
 861 /**
 862  * Scalar Memory Format:
 863  * For s_(buffer_)load_dword*:
 864  * Operand(0): SBASE - SGPR-pair which provides base address
 865  * Operand(1): Offset - immediate (un)signed offset or SGPR
 866  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 867  * Operand(n-1): SOffset - SGPR offset (Vega only)
 868  *
 869  * Having no operands is also valid for instructions such as s_dcache_inv.
 870  *
 871  */
 872 struct SMEM_instruction : public Instruction {
 873    barrier_interaction barrier;
 874    bool glc : 1; /* VI+: globally coherent */
 875    bool dlc : 1; /* NAVI: device level coherent */
 876    bool nv : 1; /* VEGA only: Non-volatile */
 877    bool can_reorder : 1;
 878    bool disable_wqm : 1;
 879    uint32_t padding: 19;
 880 };
 881 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 882
 883 struct VOP1_instruction : public Instruction {
 884 };
 885 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 886
 887 struct VOP2_instruction : public Instruction {
 888 };
 889 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 890
 891 struct VOPC_instruction : public Instruction {
 892 };
 893 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
 894
 895 struct VOP3A_instruction : public Instruction {
 896    bool abs[3];
 897    bool neg[3];
 898    uint8_t opsel : 4;
 899    uint8_t omod : 2;
 900    bool clamp : 1;
 901    uint32_t padding : 9;
 902 };
 903 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 904
 905 struct VOP3P_instruction : public Instruction {
 906    bool neg_lo[3];
 907    bool neg_hi[3];
 908    uint8_t opsel_lo : 3;
 909    uint8_t opsel_hi : 3;
 910    bool clamp : 1;
 911    uint32_t padding : 9;
 912 };
 913 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 914
 915 /**
 916  * Data Parallel Primitives Format:
 917  * This format can be used for VOP1, VOP2 or VOPC instructions.
 918  * The swizzle applies to the src0 operand.
 919  *
 920  */
 921 struct DPP_instruction : public Instruction {
 922    bool abs[2];
 923    bool neg[2];
 924    uint16_t dpp_ctrl;
 925    uint8_t row_mask : 4;
 926    uint8_t bank_mask : 4;
 927    bool bound_ctrl : 1;
 928    uint32_t padding : 7;
 929 };
 930 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 931
 932 enum sdwa_sel : uint8_t {
 933     /* masks */
 934     sdwa_wordnum = 0x1,
 935     sdwa_bytenum = 0x3,
 936     sdwa_asuint = 0x7 | 0x10,
 937     sdwa_rasize = 0x3,
 938
 939     /* flags */
 940     sdwa_isword = 0x4,
 941     sdwa_sext = 0x8,
 942     sdwa_isra = 0x10,
 943
 944     /* specific values */
 945     sdwa_ubyte0 = 0,
 946     sdwa_ubyte1 = 1,
 947     sdwa_ubyte2 = 2,
 948     sdwa_ubyte3 = 3,
 949     sdwa_uword0 = sdwa_isword | 0,
 950     sdwa_uword1 = sdwa_isword | 1,
 951     sdwa_udword = 6,
 952
 953     sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
 954     sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
 955     sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
 956     sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
 957     sdwa_sword0 = sdwa_uword0 | sdwa_sext,
 958     sdwa_sword1 = sdwa_uword1 | sdwa_sext,
 959     sdwa_sdword = sdwa_udword | sdwa_sext,
 960
 961     /* register-allocated */
 962     sdwa_ubyte = 1 | sdwa_isra,
 963     sdwa_uword = 2 | sdwa_isra,
 964     sdwa_sbyte = sdwa_ubyte | sdwa_sext,
 965     sdwa_sword = sdwa_uword | sdwa_sext,
 966 };
 967
 968 /**
 969  * Sub-Dword Addressing Format:
 970  * This format can be used for VOP1, VOP2 or VOPC instructions.
 971  *
 972  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 973  * the definition doesn't have to be VCC on GFX9+.
 974  *
 975  */
 976 struct SDWA_instruction : public Instruction {
 977    /* these destination modifiers aren't available with VOPC except for
 978     * clamp on GFX8 */
 979    uint8_t sel[2];
 980    uint8_t dst_sel;
 981    bool neg[2];
 982    bool abs[2];
 983    bool dst_preserve : 1;
 984    bool clamp : 1;
 985    uint8_t omod : 2; /* GFX9+ */
 986    uint32_t padding : 4;
 987 };
 988 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 989
 990 struct Interp_instruction : public Instruction {
 991    uint8_t attribute;
 992    uint8_t component;
 993    uint16_t padding;
 994 };
 995 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 996
 997 /**
 998  * Local and Global Data Sharing instructions
 999  * Operand(0): ADDR - VGPR which supplies the address.
1000  * Operand(1): DATA0 - First data VGPR.
1001  * Operand(2): DATA1 - Second data VGPR.
1002  * Operand(n-1): M0 - LDS size.
1003  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1004  *
1005  */
1006 struct DS_instruction : public Instruction {
1007    int16_t offset0;
1008    int8_t offset1;
1009    bool gds;
1010 };
1011 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1012
1013 /**
1014  * Vector Memory Untyped-buffer Instructions
1015  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1016  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1017  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1018  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1019  *
1020  */
1021 struct MUBUF_instruction : public Instruction {
1022    uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1023    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1024    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1025    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1026    bool glc : 1; /* globally coherent */
1027    bool dlc : 1; /* NAVI: device level coherent */
1028    bool slc : 1; /* system level coherent */
1029    bool tfe : 1; /* texture fail enable */
1030    bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1031    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1032    bool can_reorder : 1;
1033    uint8_t padding : 2;
1034    barrier_interaction barrier;
1035 };
1036 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1037
1038 /**
1039  * Vector Memory Typed-buffer Instructions
1040  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1041  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1042  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1043  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1044  *
1045  */
1046 struct MTBUF_instruction : public Instruction {
1047    uint16_t offset; /* Unsigned byte offset - 12 bit */
1048    barrier_interaction barrier;
1049    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1050    uint8_t nfmt : 3; /* Numeric format of data in memory */
1051    bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1052    bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1053    bool glc : 1; /* globally coherent */
1054    bool dlc : 1; /* NAVI: device level coherent */
1055    bool slc : 1; /* system level coherent */
1056    bool tfe : 1; /* texture fail enable */
1057    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1058    bool can_reorder : 1;
1059    uint32_t padding : 25;
1060 };
1061 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1062
1063 /**
1064  * Vector Memory Image Instructions
1065  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1066  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1067  *             or VDATA - Vector GPR for write data.
1068  * Operand(2): VADDR - Address source. Can carry an offset or an index.
1069  * Definition(0): VDATA - Vector GPR for read result.
1070  *
1071  */
1072 struct MIMG_instruction : public Instruction {
1073    uint8_t dmask; /* Data VGPR enable mask */
1074    uint8_t dim : 3; /* NAVI: dimensionality */
1075    bool unrm : 1; /* Force address to be un-normalized */
1076    bool dlc : 1; /* NAVI: device level coherent */
1077    bool glc : 1; /* globally coherent */
1078    bool slc : 1; /* system level coherent */
1079    bool tfe : 1; /* texture fail enable */
1080    bool da : 1; /* declare an array */
1081    bool lwe : 1; /* Force data to be un-normalized */
1082    bool r128 : 1; /* NAVI: Texture resource size */
1083    bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1084    bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1085    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1086    bool can_reorder : 1;
1087    uint8_t padding : 1;
1088    barrier_interaction barrier;
1089 };
1090 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1091
1092 /**
1093  * Flat/Scratch/Global Instructions
1094  * Operand(0): ADDR
1095  * Operand(1): SADDR
1096  * Operand(2) / Definition(0): DATA/VDST
1097  *
1098  */
1099 struct FLAT_instruction : public Instruction {
1100    uint16_t offset; /* Vega/Navi only */
1101    bool slc : 1; /* system level coherent */
1102    bool glc : 1; /* globally coherent */
1103    bool dlc : 1; /* NAVI: device level coherent */
1104    bool lds : 1;
1105    bool nv : 1;
1106    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1107    bool can_reorder : 1;
1108    uint8_t padding : 1;
1109    barrier_interaction barrier;
1110 };
1111 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1112
1113 struct Export_instruction : public Instruction {
1114    uint8_t enabled_mask;
1115    uint8_t dest;
1116    bool compressed : 1;
1117    bool done : 1;
1118    bool valid_mask : 1;
1119    uint32_t padding : 13;
1120 };
1121 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1122
1123 struct Pseudo_instruction : public Instruction {
1124    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1125    bool tmp_in_scc;
1126    uint8_t padding;
1127 };
1128 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1129
1130 struct Pseudo_branch_instruction : public Instruction {
1131    /* target[0] is the block index of the branch target.
1132     * For conditional branches, target[1] contains the fall-through alternative.
1133     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1134     */
1135    uint32_t target[2];
1136 };
1137 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1138
1139 struct Pseudo_barrier_instruction : public Instruction {
1140 };
1141 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1142
1143 enum ReduceOp : uint16_t {
1144    iadd8, iadd16, iadd32, iadd64,
1145    imul8, imul16, imul32, imul64,
1146           fadd16, fadd32, fadd64,
1147           fmul16, fmul32, fmul64,
1148    imin8, imin16, imin32, imin64,
1149    imax8, imax16, imax32, imax64,
1150    umin8, umin16, umin32, umin64,
1151    umax8, umax16, umax32, umax64,
1152           fmin16, fmin32, fmin64,
1153           fmax16, fmax32, fmax64,
1154    iand8, iand16, iand32, iand64,
1155    ior8, ior16, ior32, ior64,
1156    ixor8, ixor16, ixor32, ixor64,
1157 };
1158
1159 /**
1160  * Subgroup Reduction Instructions, everything except for the data to be
1161  * reduced and the result as inserted by setup_reduce_temp().
1162  * Operand(0): data to be reduced
1163  * Operand(1): reduce temporary
1164  * Operand(2): vector temporary
1165  * Definition(0): result
1166  * Definition(1): scalar temporary
1167  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1168  * Definition(3): scc clobber
1169  * Definition(4): vcc clobber
1170  *
1171  */
1172 struct Pseudo_reduction_instruction : public Instruction {
1173    ReduceOp reduce_op;
1174    uint16_t cluster_size; // must be 0 for scans
1175 };
1176 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1177
1178 struct instr_deleter_functor {
1179    void operator()(void* p) {
1180       free(p);
1181    }
1182 };
1183
1184 template<typename T>
1185 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1186
1187 template<typename T>
1188 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1189 {
1190    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1191    char *data = (char*) calloc(1, size);
1192    T* inst = (T*) data;
1193
1194    inst->opcode = opcode;
1195    inst->format = format;
1196
1197    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1198    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1199    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1200    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1201
1202    return inst;
1203 }
1204
1205 constexpr bool Instruction::usesModifiers() const noexcept
1206 {
1207    if (isDPP() || isSDWA())
1208       return true;
1209
1210    if (format == Format::VOP3P) {
1211       const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1212       for (unsigned i = 0; i < operands.size(); i++) {
1213          if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1214             return true;
1215       }
1216       return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1217    } else if (isVOP3()) {
1218       const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1219       for (unsigned i = 0; i < operands.size(); i++) {
1220          if (vop3->abs[i] || vop3->neg[i])
1221             return true;
1222       }
1223       return vop3->opsel || vop3->clamp || vop3->omod;
1224    }
1225    return false;
1226 }
1227
1228 constexpr bool is_phi(Instruction* instr)
1229 {
1230    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1231 }
1232
1233 static inline bool is_phi(aco_ptr<Instruction>& instr)
1234 {
1235    return is_phi(instr.get());
1236 }
1237
1238 barrier_interaction get_barrier_interaction(const Instruction* instr);
1239 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1240
1241 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1242 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1243 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1244 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1245
1246 enum block_kind {
1247    /* uniform indicates that leaving this block,
1248     * all actives lanes stay active */
1249    block_kind_uniform = 1 << 0,
1250    block_kind_top_level = 1 << 1,
1251    block_kind_loop_preheader = 1 << 2,
1252    block_kind_loop_header = 1 << 3,
1253    block_kind_loop_exit = 1 << 4,
1254    block_kind_continue = 1 << 5,
1255    block_kind_break = 1 << 6,
1256    block_kind_continue_or_break = 1 << 7,
1257    block_kind_discard = 1 << 8,
1258    block_kind_branch = 1 << 9,
1259    block_kind_merge = 1 << 10,
1260    block_kind_invert = 1 << 11,
1261    block_kind_uses_discard_if = 1 << 12,
1262    block_kind_needs_lowering = 1 << 13,
1263    block_kind_uses_demote = 1 << 14,
1264    block_kind_export_end = 1 << 15,
1265 };
1266
1267
1268 struct RegisterDemand {
1269    constexpr RegisterDemand() = default;
1270    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1271       : vgpr{v}, sgpr{s} {}
1272    int16_t vgpr = 0;
1273    int16_t sgpr = 0;
1274
1275    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1276       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1277    }
1278
1279    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1280       return vgpr > other.vgpr || sgpr > other.sgpr;
1281    }
1282
1283    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1284       if (t.type() == RegType::sgpr)
1285          return RegisterDemand( vgpr, sgpr + t.size() );
1286       else
1287          return RegisterDemand( vgpr + t.size(), sgpr );
1288    }
1289
1290    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1291       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1292    }
1293
1294    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1295       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1296    }
1297
1298    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1299       vgpr += other.vgpr;
1300       sgpr += other.sgpr;
1301       return *this;
1302    }
1303
1304    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1305       vgpr -= other.vgpr;
1306       sgpr -= other.sgpr;
1307       return *this;
1308    }
1309
1310    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1311       if (t.type() == RegType::sgpr)
1312          sgpr += t.size();
1313       else
1314          vgpr += t.size();
1315       return *this;
1316    }
1317
1318    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1319       if (t.type() == RegType::sgpr)
1320          sgpr -= t.size();
1321       else
1322          vgpr -= t.size();
1323       return *this;
1324    }
1325
1326    constexpr void update(const RegisterDemand other) noexcept {
1327       vgpr = std::max(vgpr, other.vgpr);
1328       sgpr = std::max(sgpr, other.sgpr);
1329    }
1330
1331 };
1332
1333 /* CFG */
1334 struct Block {
1335    float_mode fp_mode;
1336    unsigned index;
1337    unsigned offset = 0;
1338    std::vector<aco_ptr<Instruction>> instructions;
1339    std::vector<unsigned> logical_preds;
1340    std::vector<unsigned> linear_preds;
1341    std::vector<unsigned> logical_succs;
1342    std::vector<unsigned> linear_succs;
1343    RegisterDemand register_demand = RegisterDemand();
1344    uint16_t loop_nest_depth = 0;
1345    uint16_t kind = 0;
1346    int logical_idom = -1;
1347    int linear_idom = -1;
1348    Temp live_out_exec = Temp();
1349
1350    /* this information is needed for predecessors to blocks with phis when
1351     * moving out of ssa */
1352    bool scc_live_out = false;
1353    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1354
1355    Block(unsigned idx) : index(idx) {}
1356    Block() : index(0) {}
1357 };
1358
1359 using Stage = uint16_t;
1360
1361 /* software stages */
1362 static constexpr Stage sw_vs = 1 << 0;
1363 static constexpr Stage sw_gs = 1 << 1;
1364 static constexpr Stage sw_tcs = 1 << 2;
1365 static constexpr Stage sw_tes = 1 << 3;
1366 static constexpr Stage sw_fs = 1 << 4;
1367 static constexpr Stage sw_cs = 1 << 5;
1368 static constexpr Stage sw_gs_copy = 1 << 6;
1369 static constexpr Stage sw_mask = 0x7f;
1370
1371 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1372 static constexpr Stage hw_vs = 1 << 7;
1373 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1374 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
1375 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
1376 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1377 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1378 static constexpr Stage hw_fs = 1 << 13;
1379 static constexpr Stage hw_cs = 1 << 14;
1380 static constexpr Stage hw_mask = 0xff << 7;
1381
1382 /* possible settings of Program::stage */
1383 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1384 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1385 static constexpr Stage compute_cs = sw_cs | hw_cs;
1386 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1387 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1388 /* GFX10/NGG */
1389 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
1390 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
1391 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
1392 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
1393 /* GFX9 (and GFX10 if NGG isn't used) */
1394 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1395 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1396 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1397 /* pre-GFX9 */
1398 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1399 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1400 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1401 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
1402 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1403
1404 enum statistic {
1405    statistic_hash,
1406    statistic_instructions,
1407    statistic_copies,
1408    statistic_branches,
1409    statistic_cycles,
1410    statistic_vmem_clauses,
1411    statistic_smem_clauses,
1412    statistic_vmem_score,
1413    statistic_smem_score,
1414    statistic_sgpr_presched,
1415    statistic_vgpr_presched,
1416    num_statistics
1417 };
1418
1419 class Program final {
1420 public:
1421    float_mode next_fp_mode;
1422    std::vector<Block> blocks;
1423    RegisterDemand max_reg_demand = RegisterDemand();
1424    uint16_t num_waves = 0;
1425    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1426    ac_shader_config* config;
1427    struct radv_shader_info *info;
1428    enum chip_class chip_class;
1429    enum radeon_family family;
1430    unsigned wave_size;
1431    RegClass lane_mask;
1432    Stage stage; /* Stage */
1433    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1434    bool needs_wqm = false; /* there exists a p_wqm instruction */
1435    bool wb_smem_l1_on_end = false;
1436
1437    std::vector<uint8_t> constant_data;
1438    Temp private_segment_buffer;
1439    Temp scratch_offset;
1440
1441    uint16_t min_waves = 0;
1442    uint16_t lds_alloc_granule;
1443    uint32_t lds_limit; /* in bytes */
1444    bool has_16bank_lds;
1445    uint16_t vgpr_limit;
1446    uint16_t sgpr_limit;
1447    uint16_t physical_sgprs;
1448    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1449    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1450    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1451
1452    bool xnack_enabled = false;
1453    bool sram_ecc_enabled = false;
1454
1455    bool needs_vcc = false;
1456    bool needs_flat_scr = false;
1457
1458    bool collect_statistics = false;
1459    uint32_t statistics[num_statistics];
1460
1461    uint32_t allocateId()
1462    {
1463       assert(allocationID <= 16777215);
1464       return allocationID++;
1465    }
1466
1467    uint32_t peekAllocationId()
1468    {
1469       return allocationID;
1470    }
1471
1472    void setAllocationId(uint32_t id)
1473    {
1474       allocationID = id;
1475    }
1476
1477    Block* create_and_insert_block() {
1478       blocks.emplace_back(blocks.size());
1479       blocks.back().fp_mode = next_fp_mode;
1480       return &blocks.back();
1481    }
1482
1483    Block* insert_block(Block&& block) {
1484       block.index = blocks.size();
1485       block.fp_mode = next_fp_mode;
1486       blocks.emplace_back(std::move(block));
1487       return &blocks.back();
1488    }
1489
1490 private:
1491    uint32_t allocationID = 1;
1492 };
1493
1494 struct TempHash {
1495    std::size_t operator()(Temp t) const {
1496       return t.id();
1497    }
1498 };
1499 using TempSet = std::unordered_set<Temp, TempHash>;
1500
1501 struct live {
1502    /* live temps out per block */
1503    std::vector<TempSet> live_out;
1504    /* register demand (sgpr/vgpr) per instruction per block */
1505    std::vector<std::vector<RegisterDemand>> register_demand;
1506 };
1507
1508 void select_program(Program *program,
1509                     unsigned shader_count,
1510                     struct nir_shader *const *shaders,
1511                     ac_shader_config* config,
1512                     struct radv_shader_args *args);
1513 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1514                            ac_shader_config* config,
1515                            struct radv_shader_args *args);
1516
1517 void lower_wqm(Program* program, live& live_vars,
1518                const struct radv_nir_compiler_options *options);
1519 void lower_phis(Program* program);
1520 void calc_min_waves(Program* program);
1521 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1522 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1523 std::vector<uint16_t> dead_code_analysis(Program *program);
1524 void dominator_tree(Program* program);
1525 void insert_exec_mask(Program *program);
1526 void value_numbering(Program* program);
1527 void optimize(Program* program);
1528 void setup_reduce_temp(Program* program);
1529 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1530 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1531 void ssa_elimination(Program* program);
1532 void lower_to_hw_instr(Program* program);
1533 void schedule_program(Program* program, live& live_vars);
1534 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1535 void insert_wait_states(Program* program);
1536 void insert_NOPs(Program* program);
1537 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1538 void print_asm(Program *program, std::vector<uint32_t>& binary,
1539                unsigned exec_size, std::ostream& out);
1540 void validate(Program* program, FILE *output);
1541 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1542 #ifndef NDEBUG
1543 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1544 #else
1545 #define perfwarn(program, cond, msg, ...) do {} while(0)
1546 #endif
1547
1548 void collect_presched_stats(Program *program);
1549 void collect_preasm_stats(Program *program);
1550 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1551
1552 void aco_print_instr(const Instruction *instr, FILE *output);
1553 void aco_print_program(const Program *program, FILE *output);
1554
1555 /* utilities for dealing with register demand */
1556 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1557 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1558 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1559
1560 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1561 uint16_t get_extra_sgprs(Program *program);
1562
1563 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1564 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1565 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1566
1567 /* return number of addressable sgprs/vgprs for max_waves */
1568 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1569 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1570
1571 typedef struct {
1572    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1573    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1574    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1575    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1576    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1577    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1578    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1579    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1580    /* sizes used for input/output modifiers and constants */
1581    const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
1582    const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
1583 } Info;
1584
1585 extern const Info instr_info;
1586
1587 }
1588
1589 #endif /* ACO_IR_H */
1590