aco: implement VK_KHR_shader_float_controls
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_info;
41
42 namespace aco {
43
44 extern uint64_t debug_flags;
45
/* Bit values tested against the global debug_flags bitmask above.
 * (Names suggest: IR validation, RA validation, perf warnings — confirm
 * against the code that parses the debug environment variable.) */
enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
};
51
52 /**
53 * Representation of the instruction's microcode encoding format
54 * Note: Some Vector ALU Formats can be combined, such that:
55 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
56 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
57 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
58 *
59 * (*) The same is applicable for VOP1 and VOPC instructions.
60 */
/* Encoding formats. Values < (1 << 8) are mutually exclusive base formats;
 * the Vector ALU formats occupy single high bits so they can be OR'd with
 * each other and with DPP/SDWA (see the comment block above). */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B deliberately alias VOP3: they share the encoding bit */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
103
/* Bitmask describing which memory classes a memory instruction interacts
 * with; used to constrain reordering (see get_barrier_interaction()).
 * barrier_count is the number of distinct bits above it. */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4,
};
112
/* FP rounding modes; the numeric values match the hardware MODE register
 * encoding (see float_mode below): nearest-even, +inf, -inf, toward zero. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};
119
/* FP denormal handling; values match the 2-bit MODE register denorm fields
 * (see float_mode below). Only the two extremes are used here. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
126
/* Per-block floating-point state required by the shader (driven by
 * VK_KHR_shader_float_controls). The first byte mirrors the hardware MODE
 * register; the remaining flags restrict which FP optimizations are legal. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;      /* rounding mode for 32-bit FP ops */
         fp_round round16_64:2;   /* rounding mode for 16-/64-bit FP ops */
         unsigned denorm32:2;     /* fp_denorm value for 32-bit FP ops */
         unsigned denorm16_64:2;  /* fp_denorm value for 16-/64-bit FP ops */
      };
      uint8_t val = 0;            /* the four fields above as one byte, for cheap comparison */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   /* presumably: if false, optimizations that change rounding are allowed —
    * confirm against the passes that read these flags */
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE byte must match exactly, and this mode
    * must be at least as strict as "other" for every requirement flag. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
159
160 constexpr Format asVOP3(Format format) {
161 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
162 };
163
/* Register file of a temporary. linear_vgpr marks VGPRs with linear (rather
 * than logical) live-ranges — used for WWM and VGPR spills (see RegClass). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
170
/* Register class of a temporary: register file + size in dwords, packed into
 * one byte. Bit layout: bits 0-4 = size (dwords), bit 5 = vgpr, bit 6 =
 * linear live-range. SGPR classes always count as linear (see is_linear()). */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5  | (1 << 5),
      v6 = 6  | (1 << 5),
      v7 = 7  | (1 << 5),
      v8 = 8  | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all sgpr classes compare <= s16; anything above has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
211
212 /* transitional helper expressions */
213 static constexpr RegClass s1{RegClass::s1};
214 static constexpr RegClass s2{RegClass::s2};
215 static constexpr RegClass s3{RegClass::s3};
216 static constexpr RegClass s4{RegClass::s4};
217 static constexpr RegClass s8{RegClass::s8};
218 static constexpr RegClass s16{RegClass::s16};
219 static constexpr RegClass v1{RegClass::v1};
220 static constexpr RegClass v2{RegClass::v2};
221 static constexpr RegClass v3{RegClass::v3};
222 static constexpr RegClass v4{RegClass::v4};
223 static constexpr RegClass v5{RegClass::v5};
224 static constexpr RegClass v6{RegClass::v6};
225 static constexpr RegClass v7{RegClass::v7};
226 static constexpr RegClass v8{RegClass::v8};
227
228 /**
229 * Temp Class
230 * Each temporary virtual register has a
231 * register class (i.e. size and type)
232 * and SSA id.
233 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* comparisons are by SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24;      /* SSA id; 24 bits keeps Temp small (4 bytes on common ABIs) */
   RegClass reg_class;   /* one byte, see RegClass */
};
254
255 /**
256 * PhysReg
257 * Represents the physical register for each
258 * Operand and Definition.
259 */
/* Thin wrapper around a 16-bit physical register number; implicitly
 * convertible to unsigned so it can be compared/used as an index. */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
267
/* helper expressions for special registers
 * (numbers follow the hardware scalar-operand encoding) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};   /* same number as exec: low half on wave64 */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
276
277 /**
278 * Operand Class
279 * Initially, each Operand refers to either
280 * a temporary virtual register
281 * or to a constant value
282 * Temporary registers get mapped to physical register during RA
283 * Constant values are inlined into the instruction sequence.
284 */
class Operand final
{
public:
   /* Default: an undefined operand, fixed to PhysReg 128 (the hardware
    * encoding of inline constant 0, used throughout as the "no register"
    * placeholder). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}

   /* Operand referencing a temporary; id 0 means undefined. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant operand. The fixed PhysReg mirrors the hardware inline
    * constant encoding: 128+v for integers 0..64, 193..208 for -1..-16
    * (the unsigned wrap-around of 192-v yields exactly that range),
    * 240..248 for the supported float constants, and 255 for a literal that
    * must be emitted as an extra dword. */
   explicit Operand(uint32_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3e22f983) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   /* 64-bit constant operand; only values with an inline encoding are
    * representable (the double-precision bit patterns below). Anything else
    * asserts: 64-bit literals cannot be encoded. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + (uint32_t) v});
      else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - (uint32_t) v});
      else if (v == 0x3FE0000000000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xBFE0000000000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3FF0000000000000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBFF0000000000000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000000000000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000000000000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4010000000000000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC010000000000000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of a given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a specific physical register (no temporary). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* Undefined operands still carry a RegClass (see the RegClass ctor). */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* Size in dwords: constants are 1 or 2 dwords, otherwise the temp's size. */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* NOTE(review): PhysReg holds a uint16_t, so `reg != unsigned(-1)` is
    * always true and isFixed_ is always set here — confirm this is intended
    * or whether a sentinel check was meant. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* A literal is a constant without an inline encoding (PhysReg 255 above);
    * it costs an extra instruction dword. */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Kill flags (set by liveness analysis): clearing kill also clears
    * firstKill; setting firstKill also sets kill. */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);  /* also aliases the constant value */
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
501
502 /**
503 * Definition Class
504 * Definitions are the results of Instructions
505 * and refer to temporary virtual registers
506 * which are later mapped to physical registers
507 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   /* Definition of a fresh temporary with SSA id `index`. */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* Definition fixed to a physical register, with no temporary (id 0). */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* Definition of temporary `tmpId`, already fixed to `reg`. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 is reserved for "no temporary" (see the PhysReg ctor above) */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* A hint is a preferred (not mandatory) register for RA; note it shares
    * reg_ with setFixed(), so a later setFixed() overwrites the hint. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   /* Kill flag (set by liveness analysis). */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
606
607 class Block;
608
/* Base of all instructions. The operand/definition spans point into trailing
 * storage allocated together with the instruction (see create_instruction()).
 * The is*() predicates test the Format bit-encoding, so e.g. a VOP2
 * instruction promoted to VOP3A still reports isVALU() and isVOP3(). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;  /* scratch bits whose meaning depends on the current pass — confirm per pass */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   /* SALU formats are plain enum values (< 1<<8), so exact compares suffice */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory only; FLAT/GLOBAL are separate (isFlatOrGlobal) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   /* defined after VOP3A_instruction below */
   constexpr bool usesModifiers() const noexcept;

   /* only considers operands explicitly fixed to exec, not implicit reads */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
676
/* SOPK: scalar op with a 16-bit inline immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
};

/* SOPP: scalar program control (branches, waitcnt, sendmsg, ...). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;  /* branch target block index; presumably -1 when not a branch — confirm with emitter */
};

/* SOPC/SOP1/SOP2 carry no extra fields beyond the base Instruction. */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
694
695 /**
696 * Scalar Memory Format:
697 * For s_(buffer_)load_dword*:
698 * Operand(0): SBASE - SGPR-pair which provides base address
699 * Operand(1): Offset - immediate (un)signed offset or SGPR
700 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
701 * Operand(n-1): SOffset - SGPR offset (Vega only)
702 *
703 * Having no operands is also valid for instructions such as s_dcache_inv.
704 *
705 */
struct SMEM_instruction : public Instruction {
   bool glc; /* VI+: globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool nv; /* VEGA only: Non-volatile */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   bool disable_wqm; /* require an exec mask without helper invocations */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
714
/* VOP1/VOP2/VOPC carry no extra fields; modifiers live in the VOP3A/DPP/SDWA
 * variants of these encodings. */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
723
/* VOP3A: VALU encoding with source modifiers (per-source abs/neg), opsel,
 * clamp and output modifier. opsel has 4 entries — presumably 3 sources plus
 * the destination (see usesModifiers()); confirm against the assembler. */
struct VOP3A_instruction : public Instruction {
   bool abs[3];
   bool opsel[4];
   bool clamp;
   unsigned omod;
   bool neg[3];
};
731
732 /**
733 * Data Parallel Primitives Format:
734 * This format can be used for VOP1, VOP2 or VOPC instructions.
735 * The swizzle applies to the src0 operand.
736 *
737 */
struct DPP_instruction : public Instruction {
   uint16_t dpp_ctrl;  /* hardware DPP control word (swizzle selection) */
   uint8_t row_mask;   /* which rows participate */
   uint8_t bank_mask;  /* which banks participate */
   bool abs[2];        /* src0/src1 modifiers */
   bool neg[2];
   bool bound_ctrl;    /* behavior for lanes reading from inactive lanes */
};
746
/* VINTRP: parameter interpolation — selects the attribute and component. */
struct Interp_instruction : public Instruction {
   unsigned attribute;
   unsigned component;
};
751
752 /**
753 * Local and Global Data Sharing instructions
754 * Operand(0): ADDR - VGPR which supplies the address.
755 * Operand(1): DATA0 - First data VGPR.
756 * Operand(2): DATA1 - Second data VGPR.
757 * Operand(n-1): M0 - LDS size.
758 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
759 *
760 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   /* NOTE(review): the hardware offset1 field is an unsigned 8-bit byte
    * offset; int8_t would sign-extend values >= 128 — confirm intended. */
   int8_t offset1;
   bool gds; /* operate on GDS instead of LDS */
};
766
767 /**
768 * Vector Memory Untyped-buffer Instructions
769 * Operand(0): VADDR - Address source. Can carry an index and/or offset
770 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
771 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
772 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
773 *
774 */
struct MUBUF_instruction : public Instruction {
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool lds; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
788
789 /**
790 * Vector Memory Typed-buffer Instructions
791 * Operand(0): VADDR - Address source. Can carry an index and/or offset
792 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
793 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
794 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
795 *
796 */
struct MTBUF_instruction : public Instruction {
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
811
812 /**
813 * Vector Memory Image Instructions
814 * Operand(0): VADDR - Address source. Can carry an offset or an index.
815 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
816 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
817 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
818 *
819 */
struct MIMG_instruction : public Instruction {
   unsigned dmask; /* Data VGPR enable mask */
   unsigned dim; /* NAVI: dimensionality */
   bool unrm; /* Force address to be un-normalized */
   bool dlc; /* NAVI: device level coherent */
   bool glc; /* globally coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool da; /* declare an array */
   /* NOTE(review): this comment duplicates unrm's; lwe is the hardware
    * "LOD warning enable" bit — verify against the ISA docs. */
   bool lwe; /* Force data to be un-normalized */
   bool r128; /* NAVI: Texture resource size */
   bool a16; /* VEGA, NAVI: Address components are 16-bits */
   bool d16; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered w.r.t. others — confirm with scheduler */
   barrier_interaction barrier; /* memory classes this access interacts with */
};
837
838 /**
839 * Flat/Scratch/Global Instructions
840 * Operand(0): ADDR
841 * Operand(1): SADDR
842 * Operand(2) / Definition(0): DATA/VDST
843 *
844 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega only */
   bool slc; /* system level coherent */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool lds; /* return read-data to LDS — confirm against ISA */
   bool nv;  /* non-volatile — confirm against ISA */
};
853
struct Export_instruction : public Instruction {
   unsigned enabled_mask; /* which of the 4 components are written */
   unsigned dest;         /* export target (MRT/position/parameter) */
   bool compressed;       /* two 16-bit values per component pair */
   bool done;             /* last export of the program for this target type */
   bool valid_mask;       /* export the exec mask — confirm against ISA */
};
861
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;       /* lowering must preserve/restore scc around this instruction */
   PhysReg scratch_sgpr;  /* might not be valid if it's not needed */
};
866
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
874
/* Barrier pseudo instruction; the opcode alone identifies the barrier kind. */
struct Pseudo_barrier_instruction : public Instruction {
};
877
/* Operations for Pseudo_reduction_instruction; each reduction exists in a
 * 32-bit and a 64-bit variant. gfx10_wave64_bpermute is not a reduction —
 * it reuses this machinery on GFX10 wave64 (confirm with its lowering). */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute
};
894
895 /**
896 * Subgroup Reduction Instructions, everything except for the data to be
897 * reduced and the result as inserted by setup_reduce_temp().
898 * Operand(0): data to be reduced
899 * Operand(1): reduce temporary
900 * Operand(2): vector temporary
901 * Definition(0): result
902 * Definition(1): scalar temporary
903 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
904 * Definition(3): scc clobber
905 * Definition(4): vcc clobber
906 *
907 */
/* See the operand/definition layout documented in the comment above. */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
912
/* Deleter for instructions: they come from create_instruction()'s calloc()
 * (one allocation holding the instruction plus its operand/definition
 * arrays), so they must be released with free(), never delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) {
      free(ptr);
   }
};

/* Owning pointer to an instruction, using the free()-based deleter above. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
921
/* Allocates a zero-initialized instruction of derived type T with trailing
 * storage for its operands and definitions in a single calloc() block:
 *   [T][Operand x num_operands][Definition x num_definitions]
 * The spans point into that trailing storage, so the whole instruction is
 * released with one free() (see instr_deleter_functor). Presumably relies on
 * Operand/Definition being trivially destructible and on the alignment of T
 * being sufficient for Operand — confirm if new instruction types are added. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
937
/* Whether this instruction carries any VALU modifier state (abs/neg/opsel/
 * clamp/omod, or a DPP/SDWA encoding). Only meaningful for VALU formats. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   /* DPP and SDWA encodings always count as modified */
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      /* NOTE(review): abs/neg have 3 entries, opsel has 4 — this reads out
       * of bounds if a VOP3 instruction ever has > 3 operands; presumably
       * that never happens — confirm. */
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   /* opsel[3] presumably covers the destination */
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}
951
/* True for both logical (p_phi) and linear (p_linear_phi) phis. */
constexpr bool is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

/* Convenience overload for owning pointers. */
static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
961
/* Which memory classes the instruction interacts with — presumably consumed
 * by passes that limit memory-access reordering (confirm callers). Formats
 * with an explicit barrier field return it; FLAT/GLOBAL conservatively count
 * as buffer accesses, and DS (including GDS) as shared memory. */
constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
      return barrier_buffer;
   case Format::DS:
      return barrier_shared;
   default:
      return barrier_none;
   }
}
980
/* Bitmask OR'd into Block::kind describing the block's role in the CFG. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1000
1001
1002 struct RegisterDemand {
1003 constexpr RegisterDemand() = default;
1004 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1005 : vgpr{v}, sgpr{s} {}
1006 int16_t vgpr = 0;
1007 int16_t sgpr = 0;
1008
1009 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1010 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1011 }
1012
1013 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1014 return vgpr > other.vgpr || sgpr > other.sgpr;
1015 }
1016
1017 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1018 if (t.type() == RegType::sgpr)
1019 return RegisterDemand( vgpr, sgpr + t.size() );
1020 else
1021 return RegisterDemand( vgpr + t.size(), sgpr );
1022 }
1023
1024 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1025 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1026 }
1027
1028 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1029 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1030 }
1031
1032 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1033 vgpr += other.vgpr;
1034 sgpr += other.sgpr;
1035 return *this;
1036 }
1037
1038 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1039 vgpr -= other.vgpr;
1040 sgpr -= other.sgpr;
1041 return *this;
1042 }
1043
1044 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1045 if (t.type() == RegType::sgpr)
1046 sgpr += t.size();
1047 else
1048 vgpr += t.size();
1049 return *this;
1050 }
1051
1052 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1053 if (t.type() == RegType::sgpr)
1054 sgpr -= t.size();
1055 else
1056 vgpr -= t.size();
1057 return *this;
1058 }
1059
1060 constexpr void update(const RegisterDemand other) noexcept {
1061 vgpr = std::max(vgpr, other.vgpr);
1062 sgpr = std::max(sgpr, other.sgpr);
1063 }
1064
1065 };
1066
1067 /* CFG */
/* A basic block of the control-flow graph. ACO tracks two CFGs: the logical
 * one (lanes/divergence) and the linear one (actual instruction order), each
 * with its own predecessor/successor lists and immediate dominator. */
struct Block {
   float_mode fp_mode;  /* FP mode in effect for instructions in this block */
   unsigned index;
   unsigned offset = 0; /* presumably the binary offset once assembled — confirm with emitter */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();  /* peak demand within the block */
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;            /* bitmask of block_kind */
   int logical_idom = -1;        /* immediate dominator index, -1 if none */
   int linear_idom = -1;
   Temp live_out_exec = Temp();  /* exec mask temporary live at the block end */

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1092
/* A Stage is the OR of one or more software (API-level) stage bits and
 * exactly one hardware stage bit, describing how shaders are merged onto
 * hardware stages per generation. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;  /* all six sw bits */

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;  /* all seven hw bits */

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1134
1135 class Program final {
1136 public:
1137 float_mode next_fp_mode;
1138 std::vector<Block> blocks;
1139 RegisterDemand max_reg_demand = RegisterDemand();
1140 uint16_t num_waves = 0;
1141 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1142 ac_shader_config* config;
1143 struct radv_shader_info *info;
1144 enum chip_class chip_class;
1145 enum radeon_family family;
1146 unsigned wave_size;
1147 Stage stage; /* Stage */
1148 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1149 bool needs_wqm = false; /* there exists a p_wqm instruction */
1150 bool wb_smem_l1_on_end = false;
1151
1152 std::vector<uint8_t> constant_data;
1153 Temp private_segment_buffer;
1154 Temp scratch_offset;
1155
1156 uint16_t lds_alloc_granule;
1157 uint32_t lds_limit; /* in bytes */
1158 uint16_t vgpr_limit;
1159 uint16_t sgpr_limit;
1160 uint16_t physical_sgprs;
1161 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1162
1163 bool needs_vcc = false;
1164 bool needs_xnack_mask = false;
1165 bool needs_flat_scr = false;
1166
1167 uint32_t allocateId()
1168 {
1169 assert(allocationID <= 16777215);
1170 return allocationID++;
1171 }
1172
1173 uint32_t peekAllocationId()
1174 {
1175 return allocationID;
1176 }
1177
1178 void setAllocationId(uint32_t id)
1179 {
1180 allocationID = id;
1181 }
1182
1183 Block* create_and_insert_block() {
1184 blocks.emplace_back(blocks.size());
1185 blocks.back().fp_mode = next_fp_mode;
1186 return &blocks.back();
1187 }
1188
1189 Block* insert_block(Block&& block) {
1190 block.index = blocks.size();
1191 block.fp_mode = next_fp_mode;
1192 blocks.emplace_back(std::move(block));
1193 return &blocks.back();
1194 }
1195
1196 private:
1197 uint32_t allocationID = 1;
1198 };
1199
/* Result of liveness analysis (produced by live_var_analysis), indexed by
 * Block::index. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1206
1207 void select_program(Program *program,
1208 unsigned shader_count,
1209 struct nir_shader *const *shaders,
1210 ac_shader_config* config,
1211 struct radv_shader_info *info,
1212 struct radv_nir_compiler_options *options);
1213
1214 void lower_wqm(Program* program, live& live_vars,
1215 const struct radv_nir_compiler_options *options);
1216 void lower_bool_phis(Program* program);
1217 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1218 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1219 std::vector<uint16_t> dead_code_analysis(Program *program);
1220 void dominator_tree(Program* program);
1221 void insert_exec_mask(Program *program);
1222 void value_numbering(Program* program);
1223 void optimize(Program* program);
1224 void setup_reduce_temp(Program* program);
1225 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1226 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1227 void ssa_elimination(Program* program);
1228 void lower_to_hw_instr(Program* program);
1229 void schedule_program(Program* program, live& live_vars);
1230 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1231 void insert_wait_states(Program* program);
1232 void insert_NOPs(Program* program);
1233 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1234 void print_asm(Program *program, std::vector<uint32_t>& binary,
1235 unsigned exec_size, std::ostream& out);
1236 void validate(Program* program, FILE *output);
1237 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1238 #ifndef NDEBUG
1239 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1240 #else
1241 #define perfwarn(program, cond, msg, ...)
1242 #endif
1243
1244 void aco_print_instr(Instruction *instr, FILE *output);
1245 void aco_print_program(Program *program, FILE *output);
1246
1247 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
1248 uint16_t get_extra_sgprs(Program *program);
1249
1250 /* get number of sgprs allocated required to address a number of sgprs */
1251 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1252
1253 /* return number of addressable SGPRs for max_waves */
1254 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1255
/* Static per-opcode information tables, indexed by aco_opcode. Filled in by
 * generated code elsewhere; accessed through the single instance instr_info. */
typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)]; /* hardware encoding of each opcode on GFX9 — presumably a sentinel marks "unsupported"; confirm where the table is generated */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; /* hardware encoding of each opcode on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers; /* set bit = opcode accepts VALU input modifiers (abs/neg) */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers; /* set bit = opcode accepts VALU output modifiers (omod/clamp) */
   const char *name[static_cast<int>(aco_opcode::num_opcodes)]; /* printable mnemonic per opcode */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; /* encoding format per opcode */
} Info;

extern const Info instr_info; /* the single global instance of the tables above */
1266
1267 }
1268
1269 #endif /* ACO_IR_H */
1270