aco: improve 8/16-bit constants
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 struct radv_nir_compiler_options;
41 struct radv_shader_args;
42 struct radv_shader_info;
43
44 namespace aco {
45
/* Bitmask of the DEBUG_* flags below. */
extern uint64_t debug_flags;

/* Flag bits for debug_flags. */
enum {
   DEBUG_VALIDATE = 0x1,    /* IR validation */
   DEBUG_VALIDATE_RA = 0x2, /* register-allocation validation */
   DEBUG_PERFWARN = 0x4,    /* performance warnings */
};
53
54 /**
55 * Representation of the instruction's microcode encoding format
56 * Note: Some Vector ALU Formats can be combined, such that:
57 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
58 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
59 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
60 *
61 * (*) The same is applicable for VOP1 and VOPC instructions.
62 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats. These occupy dedicated bits (instead of sequential
    * values) so that VOP1/VOP2/VOPC can be OR'd with VOP3/DPP/SDWA, as
    * described in the comment above this enum. */
   VOP3P = 19,
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B share the same format bit as VOP3. */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
105
/* Bitmask describing which kinds of memory an instruction interacts with;
 * stored in the barrier fields of the memory instruction structs below. */
enum barrier_interaction : uint8_t {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* used for geometry shaders to ensure vertex data writes are before the
    * GS_DONE s_sendmsg. */
   barrier_gs_data = 0x10,
   /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
   barrier_gs_sendmsg = 0x20,
   /* used by barriers. created by s_barrier */
   barrier_barrier = 0x40,
   /* number of flag bits above, not itself a flag */
   barrier_count = 7,
};
121
/* Floating-point rounding modes; values match the MODE register encoding
 * used by float_mode below. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero */
};
128
/* Denormal handling modes; values match the MODE register denorm fields
 * used by float_mode below. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
135
/* Per-instruction floating-point environment: the hardware MODE register
 * bits plus flags restricting which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;      /* rounding mode for 32-bit ops */
         fp_round round16_64:2;   /* rounding mode for 16/64-bit ops */
         unsigned denorm32:2;     /* fp_denorm for 32-bit ops */
         unsigned denorm16_64:2;  /* fp_denorm for 16/64-bit ops */
      };
      uint8_t val = 0;            /* all four fields as one byte */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and every
    * restriction flag set in "other" must also be set here. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
168
169 constexpr Format asVOP3(Format format) {
170 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
171 };
172
173 constexpr Format asSDWA(Format format) {
174 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
175 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
176 }
177
/* Register file a temporary lives in. linear_vgpr marks VGPRs with linear
 * live ranges (used for WWM and spilling, per the RegClass comment below). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
184
/* Size + type of a register. Encoding of RC: bits [4:0] hold the size
 * (in dwords, or in bytes for sub-dword classes), bit 5 = vgpr,
 * bit 6 = linear, bit 7 = sub-dword. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgpr classes are exactly the values up to s16; everything else is vgpr */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* sgprs are always linear; vgprs only if bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Class for "bytes" bytes of storage: sgprs round up to whole dwords,
    * vgprs become sub-dword classes when not dword-aligned. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                             RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
245
/* transitional helper expressions: shorthand constants for the common
 * register classes (note: no s6 helper is declared here) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
267
268 /**
269 * Temp Class
270 * Each temporary virtual register has a
271 * register class (i.e. size and type)
272 * and SSA id.
273 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(uint8_t(cls)) {}

   /* SSA id; 0 denotes "no temporary" (see Operand/Definition) */
   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparisons are on the SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_: 24;        /* SSA id packed into 24 bits */
   uint32_t reg_class : 8;  /* RegClass::RC encoding */
};
295
296 /**
297 * PhysReg
298 * Represents the physical register for each
299 * Operand and Definition.
300 */
struct PhysReg {
   constexpr PhysReg() = default;
   /* "r" is a register index; internally stored as a byte address
    * (index * 4) so sub-dword locations can be represented */
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   /* comparisons use the full byte address, so same register but different
    * byte offsets compare unequal */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* returns a copy advanced by "bytes" bytes */
   constexpr PhysReg advance(unsigned bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }

   uint16_t reg_b = 0; /* byte address: register index * 4 + byte offset */
};
314
/* helper expressions for special registers (hardware register indices) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
326
327 /**
328 * Operand Class
329 * Initially, each Operand refers to either
330 * a temporary virtual register
331 * or to a constant value
332 * Temporary registers get mapped to physical register during RA
333 * Constant values are inlined into the instruction sequence.
334 */
335 class Operand final
336 {
337 public:
338 constexpr Operand()
339 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
340 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
341 isLateKill_(false) {}
342
343 explicit Operand(Temp r) noexcept
344 {
345 data_.temp = r;
346 if (r.id()) {
347 isTemp_ = true;
348 } else {
349 isUndef_ = true;
350 setFixed(PhysReg{128});
351 }
352 };
353 explicit Operand(uint8_t v) noexcept
354 {
355 /* 8-bit constants are only used for copies and copies from any 8-bit
356 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
357 * to be inline constants. */
358 data_.i = v;
359 isConstant_ = true;
360 constSize = 0;
361 setFixed(PhysReg{0u});
362 };
363 explicit Operand(uint16_t v) noexcept
364 {
365 data_.i = v;
366 isConstant_ = true;
367 constSize = 1;
368 if (v <= 64)
369 setFixed(PhysReg{128u + v});
370 else if (v >= 0xFFF0) /* [-16 .. -1] */
371 setFixed(PhysReg{192u + (0xFFFF - v)});
372 else if (v == 0x3800) /* 0.5 */
373 setFixed(PhysReg{240});
374 else if (v == 0xB800) /* -0.5 */
375 setFixed(PhysReg{241});
376 else if (v == 0x3C00) /* 1.0 */
377 setFixed(PhysReg{242});
378 else if (v == 0xBC00) /* -1.0 */
379 setFixed(PhysReg{243});
380 else if (v == 0x4000) /* 2.0 */
381 setFixed(PhysReg{244});
382 else if (v == 0xC000) /* -2.0 */
383 setFixed(PhysReg{245});
384 else if (v == 0x4400) /* 4.0 */
385 setFixed(PhysReg{246});
386 else if (v == 0xC400) /* -4.0 */
387 setFixed(PhysReg{247});
388 else if (v == 0x3118) /* 1/2 PI */
389 setFixed(PhysReg{248});
390 else /* Literal Constant */
391 setFixed(PhysReg{255});
392 };
393 explicit Operand(uint32_t v, bool is64bit = false) noexcept
394 {
395 data_.i = v;
396 isConstant_ = true;
397 constSize = is64bit ? 3 : 2;
398 if (v <= 64)
399 setFixed(PhysReg{128 + v});
400 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
401 setFixed(PhysReg{192 - v});
402 else if (v == 0x3f000000) /* 0.5 */
403 setFixed(PhysReg{240});
404 else if (v == 0xbf000000) /* -0.5 */
405 setFixed(PhysReg{241});
406 else if (v == 0x3f800000) /* 1.0 */
407 setFixed(PhysReg{242});
408 else if (v == 0xbf800000) /* -1.0 */
409 setFixed(PhysReg{243});
410 else if (v == 0x40000000) /* 2.0 */
411 setFixed(PhysReg{244});
412 else if (v == 0xc0000000) /* -2.0 */
413 setFixed(PhysReg{245});
414 else if (v == 0x40800000) /* 4.0 */
415 setFixed(PhysReg{246});
416 else if (v == 0xc0800000) /* -4.0 */
417 setFixed(PhysReg{247});
418 else { /* Literal Constant */
419 assert(!is64bit && "attempt to create a 64-bit literal constant");
420 setFixed(PhysReg{255});
421 }
422 };
423 explicit Operand(uint64_t v) noexcept
424 {
425 isConstant_ = true;
426 constSize = 3;
427 if (v <= 64) {
428 data_.i = (uint32_t) v;
429 setFixed(PhysReg{128 + (uint32_t) v});
430 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
431 data_.i = (uint32_t) v;
432 setFixed(PhysReg{192 - (uint32_t) v});
433 } else if (v == 0x3FE0000000000000) { /* 0.5 */
434 data_.i = 0x3f000000;
435 setFixed(PhysReg{240});
436 } else if (v == 0xBFE0000000000000) { /* -0.5 */
437 data_.i = 0xbf000000;
438 setFixed(PhysReg{241});
439 } else if (v == 0x3FF0000000000000) { /* 1.0 */
440 data_.i = 0x3f800000;
441 setFixed(PhysReg{242});
442 } else if (v == 0xBFF0000000000000) { /* -1.0 */
443 data_.i = 0xbf800000;
444 setFixed(PhysReg{243});
445 } else if (v == 0x4000000000000000) { /* 2.0 */
446 data_.i = 0x40000000;
447 setFixed(PhysReg{244});
448 } else if (v == 0xC000000000000000) { /* -2.0 */
449 data_.i = 0xc0000000;
450 setFixed(PhysReg{245});
451 } else if (v == 0x4010000000000000) { /* 4.0 */
452 data_.i = 0x40800000;
453 setFixed(PhysReg{246});
454 } else if (v == 0xC010000000000000) { /* -4.0 */
455 data_.i = 0xc0800000;
456 setFixed(PhysReg{247});
457 } else { /* Literal Constant: we don't know if it is a long or double.*/
458 isConstant_ = 0;
459 assert(false && "attempt to create a 64-bit literal constant");
460 }
461 };
462 explicit Operand(RegClass type) noexcept
463 {
464 isUndef_ = true;
465 data_.temp = Temp(0, type);
466 setFixed(PhysReg{128});
467 };
468 explicit Operand(PhysReg reg, RegClass type) noexcept
469 {
470 data_.temp = Temp(0, type);
471 setFixed(reg);
472 }
473
474 constexpr bool isTemp() const noexcept
475 {
476 return isTemp_;
477 }
478
479 constexpr void setTemp(Temp t) noexcept {
480 assert(!isConstant_);
481 isTemp_ = true;
482 data_.temp = t;
483 }
484
485 constexpr Temp getTemp() const noexcept
486 {
487 return data_.temp;
488 }
489
490 constexpr uint32_t tempId() const noexcept
491 {
492 return data_.temp.id();
493 }
494
495 constexpr bool hasRegClass() const noexcept
496 {
497 return isTemp() || isUndefined();
498 }
499
500 constexpr RegClass regClass() const noexcept
501 {
502 return data_.temp.regClass();
503 }
504
505 constexpr unsigned bytes() const noexcept
506 {
507 if (isConstant())
508 return 1 << constSize;
509 else
510 return data_.temp.bytes();
511 }
512
513 constexpr unsigned size() const noexcept
514 {
515 if (isConstant())
516 return constSize > 2 ? 2 : 1;
517 else
518 return data_.temp.size();
519 }
520
521 constexpr bool isFixed() const noexcept
522 {
523 return isFixed_;
524 }
525
526 constexpr PhysReg physReg() const noexcept
527 {
528 return reg_;
529 }
530
531 constexpr void setFixed(PhysReg reg) noexcept
532 {
533 isFixed_ = reg != unsigned(-1);
534 reg_ = reg;
535 }
536
537 constexpr bool isConstant() const noexcept
538 {
539 return isConstant_;
540 }
541
542 constexpr bool isLiteral() const noexcept
543 {
544 return isConstant() && reg_ == 255;
545 }
546
547 constexpr bool isUndefined() const noexcept
548 {
549 return isUndef_;
550 }
551
552 constexpr uint32_t constantValue() const noexcept
553 {
554 return data_.i;
555 }
556
557 constexpr bool constantEquals(uint32_t cmp) const noexcept
558 {
559 return isConstant() && constantValue() == cmp;
560 }
561
562 constexpr uint64_t constantValue64(bool signext=false) const noexcept
563 {
564 if (constSize == 3) {
565 if (reg_ <= 192)
566 return reg_ - 128;
567 else if (reg_ <= 208)
568 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
569
570 switch (reg_) {
571 case 240:
572 return 0x3FE0000000000000;
573 case 241:
574 return 0xBFE0000000000000;
575 case 242:
576 return 0x3FF0000000000000;
577 case 243:
578 return 0xBFF0000000000000;
579 case 244:
580 return 0x4000000000000000;
581 case 245:
582 return 0xC000000000000000;
583 case 246:
584 return 0x4010000000000000;
585 case 247:
586 return 0xC010000000000000;
587 }
588 } else if (constSize == 1) {
589 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
590 } else if (constSize == 0) {
591 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
592 }
593 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
594 }
595
596 constexpr bool isOfType(RegType type) const noexcept
597 {
598 return hasRegClass() && regClass().type() == type;
599 }
600
601 /* Indicates that the killed operand's live range intersects with the
602 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
603 * not set by liveness analysis. */
604 constexpr void setLateKill(bool flag) noexcept
605 {
606 isLateKill_ = flag;
607 }
608
609 constexpr bool isLateKill() const noexcept
610 {
611 return isLateKill_;
612 }
613
614 constexpr void setKill(bool flag) noexcept
615 {
616 isKill_ = flag;
617 if (!flag)
618 setFirstKill(false);
619 }
620
621 constexpr bool isKill() const noexcept
622 {
623 return isKill_ || isFirstKill();
624 }
625
626 constexpr void setFirstKill(bool flag) noexcept
627 {
628 isFirstKill_ = flag;
629 if (flag)
630 setKill(flag);
631 }
632
633 /* When there are multiple operands killing the same temporary,
634 * isFirstKill() is only returns true for the first one. */
635 constexpr bool isFirstKill() const noexcept
636 {
637 return isFirstKill_;
638 }
639
640 constexpr bool isKillBeforeDef() const noexcept
641 {
642 return isKill() && !isLateKill();
643 }
644
645 constexpr bool isFirstKillBeforeDef() const noexcept
646 {
647 return isFirstKill() && !isLateKill();
648 }
649
650 constexpr bool operator == (Operand other) const noexcept
651 {
652 if (other.size() != size())
653 return false;
654 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
655 return false;
656 if (isFixed() && other.isFixed() && physReg() != other.physReg())
657 return false;
658 if (isLiteral())
659 return other.isLiteral() && other.constantValue() == constantValue();
660 else if (isConstant())
661 return other.isConstant() && other.physReg() == physReg();
662 else if (isUndefined())
663 return other.isUndefined() && other.regClass() == regClass();
664 else
665 return other.isTemp() && other.getTemp() == getTemp();
666 }
667 private:
668 union {
669 uint32_t i;
670 float f;
671 Temp temp = Temp(0, s1);
672 } data_;
673 PhysReg reg_;
674 union {
675 struct {
676 uint8_t isTemp_:1;
677 uint8_t isFixed_:1;
678 uint8_t isConstant_:1;
679 uint8_t isKill_:1;
680 uint8_t isUndef_:1;
681 uint8_t isFirstKill_:1;
682 uint8_t constSize:2;
683 uint8_t isLateKill_:1;
684 };
685 /* can't initialize bit-fields in c++11, so work around using a union */
686 uint16_t control_ = 0;
687 };
688 };
689
690 /**
691 * Definition Class
692 * Definitions are the results of Instructions
693 * and refer to temporary virtual registers
694 * which are later mapped to physical registers
695 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition pre-assigned to a physical register (temp id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* a non-zero temp id means this defines an actual temporary */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned bytes() const noexcept
   {
      return temp.bytes();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* a hint stores a preferred register without marking the definition as
    * fixed (note: it reuses reg_, so a later setFixed overwrites it) */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

   /* precise definitions restrict which value-changing optimizations are
    * allowed on the producing instruction */
   constexpr void setPrecise(bool precise) noexcept
   {
      isPrecise_ = precise;
   }

   constexpr bool isPrecise() const noexcept
   {
      return isPrecise_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
         uint8_t isPrecise_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
810
811 struct Block;
812
/* Base of all IR instructions; the format-specific structs below extend it.
 * The is*() helpers classify an instruction by testing Format bits. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch space for individual passes */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Vector ALU: any VOP1/VOP2/VOPC/VOP3/VOP3P bit (including combined
    * DPP/SDWA/VOP3 encodings, since those keep the base format bit set). */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory, not FLAT/GLOBAL/SCRATCH */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
880
/* Scalar ALU with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
886
/* Scalar program control with an immediate. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* presumably the target block index for branches — verify at callers */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
892
/* Scalar compare; no fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
896
/* Scalar ALU, one input; no fields beyond the base Instruction. */
struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
900
/* Scalar ALU, two inputs; no fields beyond the base Instruction. */
struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
904
905 /**
906 * Scalar Memory Format:
907 * For s_(buffer_)load_dword*:
908 * Operand(0): SBASE - SGPR-pair which provides base address
909 * Operand(1): Offset - immediate (un)signed offset or SGPR
910 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
911 * Operand(n-1): SOffset - SGPR offset (Vega only)
912 *
913 * Having no operands is also valid for instructions such as s_dcache_inv.
914 *
915 */
/* Scalar memory access; operand layout is documented in the comment above. */
struct SMEM_instruction : public Instruction {
   barrier_interaction barrier; /* memory kinds this access interacts with */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   bool can_reorder : 1;
   bool disable_wqm : 1;
   uint32_t padding: 19;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
926
/* Vector ALU, one input; no fields beyond the base Instruction. */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
930
/* Vector ALU, two inputs; no fields beyond the base Instruction. */
struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
934
/* Vector compare; no fields beyond the base Instruction. */
struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
938
/* VOP3A encoding: input/output modifiers for up to three sources. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];        /* per-source absolute value */
   bool neg[3];        /* per-source negation */
   uint8_t opsel : 4;  /* 16-bit operand selection bits */
   uint8_t omod : 2;   /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
948
/* VOP3P (packed 16-bit) encoding: per-half modifiers for up to three sources. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];       /* negate low halves */
   bool neg_hi[3];       /* negate high halves */
   uint8_t opsel_lo : 3; /* half-selection for the low result */
   uint8_t opsel_hi : 3; /* half-selection for the high result */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
958
959 /**
960 * Data Parallel Primitives Format:
961 * This format can be used for VOP1, VOP2 or VOPC instructions.
962 * The swizzle applies to the src0 operand.
963 *
964 */
/* DPP encoding (see comment above): the swizzle applies to src0. */
struct DPP_instruction : public Instruction {
   bool abs[2];         /* per-source absolute value */
   bool neg[2];         /* per-source negation */
   uint16_t dpp_ctrl;   /* hardware DPP control (swizzle) value */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
975
/* SDWA source/destination selection encoding. The low bits select the
 * byte/word, sdwa_isword/sdwa_sext/sdwa_isra are flag bits (see masks). */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
1011
1012 /**
1013 * Sub-Dword Addressing Format:
1014 * This format can be used for VOP1, VOP2 or VOPC instructions.
1015 *
1016 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1017 * the definition doesn't have to be VCC on GFX9+.
1018 *
1019 */
/* SDWA encoding (see comment above); sel values use the sdwa_sel enum. */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   uint8_t sel[2];   /* sub-dword selection per source (sdwa_sel) */
   uint8_t dst_sel;  /* sub-dword selection for the destination (sdwa_sel) */
   bool neg[2];
   bool abs[2];
   bool dst_preserve : 1;
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint32_t padding : 4;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1033
/* Vector parameter interpolation (VINTRP). */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index to interpolate */
   uint8_t component; /* component within the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1040
1041 /**
1042 * Local and Global Data Sharing instructions
1043 * Operand(0): ADDR - VGPR which supplies the address.
1044 * Operand(1): DATA0 - First data VGPR.
1045 * Operand(2): DATA1 - Second data VGPR.
1046 * Operand(n-1): M0 - LDS size.
1047 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1048 *
1049 */
/* LDS/GDS access; operand layout is documented in the comment above. */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds; /* access GDS instead of LDS */
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1056
1057 /**
1058 * Vector Memory Untyped-buffer Instructions
1059 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1060 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1061 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1062 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1063 *
1064 */
/* Untyped buffer access; operand layout is documented in the comment above. */
struct MUBUF_instruction : public Instruction {
   uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool lds : 1; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1;
   uint8_t padding : 2;
   barrier_interaction barrier; /* memory kinds this access interacts with */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1081
1082 /**
1083 * Vector Memory Typed-buffer Instructions
1084 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1085 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1086 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1087 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1088 *
1089 */
struct MTBUF_instruction : public Instruction {
   uint16_t offset; /* Unsigned byte offset - 12 bit (stored widened to 16 bits) */
   barrier_interaction barrier;
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   bool offen : 1; /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1; /* Supply an index from VGPR (VADDR) */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably lets the scheduler reorder this access — confirm against scheduler */
   uint32_t padding : 25; /* pads to 8 extra bytes, matching the static_assert below */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1106
1107 /**
1108 * Vector Memory Image Instructions
1109 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1110 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1111 * or VDATA - Vector GPR for write data.
1112 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1113 * Definition(0): VDATA - Vector GPR for read result.
1114 *
1115 */
struct MIMG_instruction : public Instruction {
   uint8_t dmask; /* Data VGPR enable mask */
   uint8_t dim : 3; /* NAVI: dimensionality */
   bool unrm : 1; /* Force address to be un-normalized */
   bool dlc : 1; /* NAVI: device level coherent */
   bool glc : 1; /* globally coherent */
   bool slc : 1; /* system level coherent */
   bool tfe : 1; /* texture fail enable */
   bool da : 1; /* declare an array */
   bool lwe : 1; /* LOD warning enable (per the ISA docs; original comment duplicated unrm's) */
   bool r128 : 1; /* NAVI: Texture resource size */
   bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably lets the scheduler reorder this access — confirm against scheduler */
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1135
1136 /**
1137 * Flat/Scratch/Global Instructions
1138 * Operand(0): ADDR
1139 * Operand(1): SADDR
1140 * Operand(2) / Definition(0): DATA/VDST
1141 *
1142 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega/Navi only */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* hardware LDS bit: return read-data to LDS instead of VGPRs */
   bool nv : 1;  /* hardware NV (non-volatile) bit of the FLAT encoding */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool can_reorder : 1; /* presumably lets the scheduler reorder this access — confirm against scheduler */
   uint8_t padding : 1;
   barrier_interaction barrier;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1156
/* EXP instruction: writes VGPR data to an export target (MRT/Z/position/parameter). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* EN field: which components of the operands are valid */
   uint8_t dest;         /* TGT field: export target */
   bool compressed : 1;  /* COMPR: operands hold 16-bit packed data */
   bool done : 1;        /* DONE: last export of this type for the wave */
   bool valid_mask : 1;  /* VM: exec mask communicates pixel validity */
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1166
/* Compiler-internal pseudo instruction, lowered before encoding. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;      /* presumably: a temporary occupies SCC during lowering — confirm in lower_to_hw_instr */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1173
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1182
/* Barrier pseudo instruction: carries no extra fields beyond the base Instruction. */
struct Pseudo_barrier_instruction : public Instruction {
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1186
/* Reduction operations for Pseudo_reduction_instruction.
 * Naming: <operation><element width in bits>, e.g. iadd16 = 16-bit integer add. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
1202
1203 /**
1204 * Subgroup Reduction Instructions, everything except for the data to be
1205 * reduced and the result as inserted by setup_reduce_temp().
1206 * Operand(0): data to be reduced
1207 * Operand(1): reduce temporary
1208 * Operand(2): vector temporary
1209 * Definition(0): result
1210 * Definition(1): scalar temporary
1211 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1212 * Definition(3): scc clobber
1213 * Definition(4): vcc clobber
1214 *
1215 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;     /* which operation/width to reduce with */
   uint16_t cluster_size;  // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1221
/* Deleter used by aco_ptr: instructions are plain calloc()'d buffers
 * (see create_instruction), so releasing them is a single free(). */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};
1227
/* Owning pointer for instructions; frees with free() instead of delete
 * because instructions are allocated with calloc() in create_instruction(). */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1230
/* Allocates an instruction of type T together with its operand and definition
 * arrays in one contiguous, zero-initialized buffer:
 *   [ T | Operand * num_operands | Definition * num_definitions ]
 * The result must be released with free() (instr_deleter_functor / aco_ptr).
 * NOTE(review): the calloc() result is not checked — allocation is assumed to succeed. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span is constructed from a 16-bit offset relative to the span member's
    * own address (hence the subtraction of &inst->operands / &inst->definitions),
    * pointing into the trailing arrays of the same allocation. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1248
1249 constexpr bool Instruction::usesModifiers() const noexcept
1250 {
1251 if (isDPP() || isSDWA())
1252 return true;
1253
1254 if (format == Format::VOP3P) {
1255 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1256 for (unsigned i = 0; i < operands.size(); i++) {
1257 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1258 return true;
1259 }
1260 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1261 } else if (isVOP3()) {
1262 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1263 for (unsigned i = 0; i < operands.size(); i++) {
1264 if (vop3->abs[i] || vop3->neg[i])
1265 return true;
1266 }
1267 return vop3->opsel || vop3->clamp || vop3->omod;
1268 }
1269 return false;
1270 }
1271
1272 constexpr bool is_phi(Instruction* instr)
1273 {
1274 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1275 }
1276
1277 static inline bool is_phi(aco_ptr<Instruction>& instr)
1278 {
1279 return is_phi(instr.get());
1280 }
1281
1282 barrier_interaction get_barrier_interaction(const Instruction* instr);
1283 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1284
1285 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1286 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1287 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1288 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1289
/* Bit flags describing a basic block's role in the CFG; stored in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1310
1311
1312 struct RegisterDemand {
1313 constexpr RegisterDemand() = default;
1314 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1315 : vgpr{v}, sgpr{s} {}
1316 int16_t vgpr = 0;
1317 int16_t sgpr = 0;
1318
1319 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1320 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1321 }
1322
1323 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1324 return vgpr > other.vgpr || sgpr > other.sgpr;
1325 }
1326
1327 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1328 if (t.type() == RegType::sgpr)
1329 return RegisterDemand( vgpr, sgpr + t.size() );
1330 else
1331 return RegisterDemand( vgpr + t.size(), sgpr );
1332 }
1333
1334 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1335 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1336 }
1337
1338 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1339 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1340 }
1341
1342 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1343 vgpr += other.vgpr;
1344 sgpr += other.sgpr;
1345 return *this;
1346 }
1347
1348 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1349 vgpr -= other.vgpr;
1350 sgpr -= other.sgpr;
1351 return *this;
1352 }
1353
1354 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1355 if (t.type() == RegType::sgpr)
1356 sgpr += t.size();
1357 else
1358 vgpr += t.size();
1359 return *this;
1360 }
1361
1362 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1363 if (t.type() == RegType::sgpr)
1364 sgpr -= t.size();
1365 else
1366 vgpr -= t.size();
1367 return *this;
1368 }
1369
1370 constexpr void update(const RegisterDemand other) noexcept {
1371 vgpr = std::max(vgpr, other.vgpr);
1372 sgpr = std::max(sgpr, other.sgpr);
1373 }
1374
1375 };
1376
1377 /* CFG */
/* CFG */
struct Block {
   float_mode fp_mode;          /* floating-point mode in effect in this block */
   unsigned index;              /* position in Program::blocks (see insert_block) */
   unsigned offset = 0;         /* presumably the byte offset in the emitted binary — confirm in emit_program */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;  /* predecessor block indices in the logical CFG */
   std::vector<unsigned> linear_preds;   /* predecessor block indices in the linear CFG */
   std::vector<unsigned> logical_succs;  /* successor block indices in the logical CFG */
   std::vector<unsigned> linear_succs;   /* successor block indices in the linear CFG */
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;           /* bitmask of block_kind flags */
   int logical_idom = -1;       /* immediate dominator in the logical CFG, -1 if none */
   int linear_idom = -1;        /* immediate dominator in the linear CFG, -1 if none */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1402
/* A Stage is a bitmask combining the software (API) shader types compiled into
 * a program with the single hardware stage they execute as. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1447
/* Indices into Program::statistics (filled when Program::collect_statistics is set). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
1462
/* Top-level compilation unit: the CFG, target description, resource limits and
 * compile-time state for one shader program. */
class Program final {
public:
   float_mode next_fp_mode;   /* fp_mode assigned to newly created blocks */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* Returns a fresh SSA id. Ids are limited to 24 bits (see the assert). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would hand out, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Resets the id counter, e.g. after renumbering temporaries. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block (index = current block count) inheriting next_fp_mode. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves an existing block to the end of the block list, re-indexing it. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next SSA id; 0 is never handed out */
};
1538
/* Hash functor for Temp: SSA ids are unique, so the id itself serves as the hash. */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;
1545
/* Result of live-variable analysis (see live_var_analysis). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1552
1553 void select_program(Program *program,
1554 unsigned shader_count,
1555 struct nir_shader *const *shaders,
1556 ac_shader_config* config,
1557 struct radv_shader_args *args);
1558 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1559 ac_shader_config* config,
1560 struct radv_shader_args *args);
1561
1562 void lower_wqm(Program* program, live& live_vars,
1563 const struct radv_nir_compiler_options *options);
1564 void lower_phis(Program* program);
1565 void calc_min_waves(Program* program);
1566 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1567 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1568 std::vector<uint16_t> dead_code_analysis(Program *program);
1569 void dominator_tree(Program* program);
1570 void insert_exec_mask(Program *program);
1571 void value_numbering(Program* program);
1572 void optimize(Program* program);
1573 void setup_reduce_temp(Program* program);
1574 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1575 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1576 void ssa_elimination(Program* program);
1577 void lower_to_hw_instr(Program* program);
1578 void schedule_program(Program* program, live& live_vars);
1579 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1580 void insert_wait_states(Program* program);
1581 void insert_NOPs(Program* program);
1582 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1583 void print_asm(Program *program, std::vector<uint32_t>& binary,
1584 unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): the release-build macro takes a leading `program` argument that
 * the debug-build function above does not — confirm call sites pass the same
 * argument list in both configurations. */
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif
1592
1593 void collect_presched_stats(Program *program);
1594 void collect_preasm_stats(Program *program);
1595 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1596
1597 void aco_print_instr(const Instruction *instr, FILE *output);
1598 void aco_print_program(const Program *program, FILE *output);
1599
1600 /* utilities for dealing with register demand */
1601 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1602 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1603 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1604
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1606 uint16_t get_extra_sgprs(Program *program);
1607
1608 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
1609 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1610 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1611
1612 /* return number of addressable sgprs/vgprs for max_waves */
1613 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1614 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1615
/* Per-opcode tables, indexed by aco_opcode; the single instance is instr_info. */
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];   /* hw opcode on GFX7, or a sentinel if unsupported */
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];   /* hw opcode on GFX9 */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];  /* hw opcode on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;
1629
1630 extern const Info instr_info;
1631
1632 }
1633
1634 #endif /* ACO_IR_H */
1635