aco: Initial commit of independent AMD compiler
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_info;
41
42 namespace aco {
43
/* Global mask of enabled debug features, tested against the DEBUG_*
 * bits below (storage is defined elsewhere). */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,    /* run validate() on the IR */
   DEBUG_VALIDATE_RA = 0x2, /* run validate_ra() after register allocation */
   DEBUG_PERFWARN = 0x4,    /* enable perfwarn() performance warnings */
};
51
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats.
    * Each occupies a single bit in the high byte so that they can be
    * OR'd with each other and with the low-byte formats above.
    * VOP3A/VOP3B intentionally alias VOP3: all share encoding bit 11. */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
103
/* Classes of memory an instruction may interact with; queried via
 * get_barrier_interaction() below and stored on memory instructions. */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   /* number of flag bits above — not itself a flag */
   barrier_count = 4,
};
112
113 constexpr Format asVOP3(Format format) {
114 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
115 };
116
/* Which register file a value lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar GPR */
   vgpr,        /* vector GPR */
   linear_vgpr, /* VGPR treated as linear — used for WWM and spills (see RegClass) */
};
123
/* Register class: packs the register file and the size of a value
 * into one byte:
 *   bits [4:0] - size in dwords
 *   bit  5     - set for vgpr classes
 *   bit  6     - set for linear vgpr classes (WWM / spills to vgpr)
 * sgpr classes are always considered linear (see is_linear()). */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* Build a class from a type and a size in dwords. */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   /* deleted to avoid accidental truthiness tests on a register class */
   explicit operator bool() = delete;

   /* linear_vgpr classes also report RegType::vgpr here */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   /* size in dwords */
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   /* sgprs are always linear; vgprs only if bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
164
165 /* transitional helper expressions */
166 static constexpr RegClass s1{RegClass::s1};
167 static constexpr RegClass s2{RegClass::s2};
168 static constexpr RegClass s3{RegClass::s3};
169 static constexpr RegClass s4{RegClass::s4};
170 static constexpr RegClass s8{RegClass::s8};
171 static constexpr RegClass s16{RegClass::s16};
172 static constexpr RegClass v1{RegClass::v1};
173 static constexpr RegClass v2{RegClass::v2};
174 static constexpr RegClass v3{RegClass::v3};
175 static constexpr RegClass v4{RegClass::v4};
176 static constexpr RegClass v5{RegClass::v5};
177 static constexpr RegClass v6{RegClass::v6};
178 static constexpr RegClass v7{RegClass::v7};
179 static constexpr RegClass v8{RegClass::v8};
180
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   /* size in dwords */
   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* comparisons are by SSA id only — the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* 24-bit id: matches the 16777215 cap asserted in Program::allocateId() */
   uint32_t id_:24;
   RegClass reg_class;
};
207
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   /* implicit conversion so PhysReg can be compared/used as a plain index */
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
220
/* helper expressions for special registers (values follow the
 * hardware's scalar operand numbering) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126}; /* low half of exec — same slot as exec */
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
228
/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   /* Default: an undefined operand, fixed to slot 128 (the hardware
    * encoding of inline constant 0). */
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}

   /* Operand referring to temporary 'r'. An id of 0 produces an
    * undefined operand instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* 32-bit constant. Values with a hardware inline-constant encoding
    * are fixed to the matching scalar source slot (128..248); anything
    * else becomes a 32-bit literal (slot 255). */
   explicit Operand(uint32_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         /* unsigned wrap-around: 192 - v == 192 + (-v), i.e. slots 193..208 */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3e22f983) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   /* 64-bit constant. Only inline-encodable values are representable;
    * any other value triggers the assert below because 64-bit literals
    * do not exist in the encoding. */
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + (uint32_t) v});
      else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - (uint32_t) v});
      else if (v == 0x3FE0000000000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xBFE0000000000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3FF0000000000000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBFF0000000000000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000000000000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000000000000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4010000000000000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC010000000000000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else { /* Literal Constant: we don't know if it is a long or double.*/
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   /* Undefined operand of a given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Operand fixed to a physical register without an SSA temporary
    * (id 0), e.g. for reading exec or scc directly. */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   /* undefined operands also carry a register class (see Operand(RegClass)) */
   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   /* reg == unsigned(-1) acts as a sentinel meaning "not fixed" */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   /* a constant without inline encoding, emitted as an extra dword (slot 255) */
   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
453
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   /* definition of a fresh temporary with the given SSA id */
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* definition fixed to a physical register without an SSA temporary (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   /* definition of temporary 'tmpId' already assigned to 'reg' */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 marks a non-temporary (fixed-register-only) definition */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* a hint shares reg_ with a fixed assignment — a later setFixed()
    * overwrites the hinted register */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
558
class Block;

/* Base of all instruction types: opcode, encoding format and the
 * operand/definition arrays, which live in the same allocation as the
 * instruction itself (see create_instruction()). */
struct Instruction {
   aco_opcode opcode;
   Format format;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* Tests the per-format encoding bits, so combined formats
    * (e.g. VOP2 | SDWA) still report true. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   /* scalar ALU formats are plain values, so exact comparison suffices */
   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory (does not include FLAT/GLOBAL/SCRATCH) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   /* VOP3A and VOP3B share the same bit, so both branches test bit 11 */
   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }
};
616
/* SOPK: scalar ALU with a 16-bit immediate in the encoding */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
};
620
/* SOPP: scalar program control (branches, waitcnt, ...) */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* block index for branches; presumably -1 when unused — confirm at call sites */
};
625
/* remaining scalar ALU formats carry no extra fields beyond Instruction */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
634
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc; /* VI+: globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool nv; /* VEGA only: Non-volatile */
   bool can_reorder; /* whether this access may be reordered against others — see barrier */
   bool disable_wqm; /* require an exec mask without helper invocations */
   barrier_interaction barrier;
};
654
/* basic vector ALU formats carry no extra fields beyond Instruction */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
663
/* VOP3A: vector ALU with up to three sources and modifiers */
struct VOP3A_instruction : public Instruction {
   bool abs[3];   /* per-source absolute-value input modifier */
   bool opsel[3]; /* per-source operand select — assumed to pick 16-bit halves; confirm */
   bool clamp;    /* clamp output */
   unsigned omod; /* output modifier field */
   bool neg[3];   /* per-source negate input modifier */
};
671
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   uint16_t dpp_ctrl;  /* hardware DPP control (swizzle) field */
   uint8_t row_mask;
   uint8_t bank_mask;
   bool abs[2]; /* per-source absolute-value modifier */
   bool neg[2]; /* per-source negate modifier */
   bool bound_ctrl;
};
686
/* VINTRP: vector parameter interpolation */
struct Interp_instruction : public Instruction {
   unsigned attribute; /* attribute index to interpolate */
   unsigned component; /* component within the attribute */
};
691
692 /**
693 * Local and Global Data Sharing instructions
694 * Operand(0): ADDR - VGPR which supplies the address.
695 * Operand(1): DATA0 - First data VGPR.
696 * Operand(2): DATA1 - Second data VGPR.
697 * Operand(n-1): M0 - LDS size.
698 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
699 *
700 */
701 struct DS_instruction : public Instruction {
702 int16_t offset0;
703 int8_t offset1;
704 bool gds;
705 };
706
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool lds; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered against others — see barrier */
   barrier_interaction barrier;
};
728
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   /* pre-GFX10 split dfmt/nfmt fields alias the GFX10 combined format byte */
   union {
      struct {
         uint8_t dfmt : 4; /* Data Format of data in memory buffer */
         uint8_t nfmt : 3; /* Numeric format of data in memory */
      };
      uint8_t img_format; /* Buffer or image format as used by GFX10 */
   };
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered against others — see barrier */
   barrier_interaction barrier;
};
756
/**
 * Vector Memory Image Instructions
 * Operand(0): VADDR - Address source. Can carry an offset or an index.
 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 *
 */
struct MIMG_instruction : public Instruction {
   unsigned dmask; /* Data VGPR enable mask */
   bool unrm; /* Force address to be un-normalized */
   bool dlc; /* NAVI: device level coherent */
   bool glc; /* globally coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool da; /* declare an array */
   bool lwe; /* LOD warning enable (previous comment was a copy-paste of unrm's) */
   bool r128; /* NAVI: Texture resource size */
   bool a16; /* VEGA, NAVI: Address components are 16-bits */
   bool d16; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* whether this access may be reordered against others — see barrier */
   barrier_interaction barrier;
};
781
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega only */
   bool slc; /* system level coherent */
   bool glc; /* globally coherent */
   bool lds; /* return read-data to LDS instead of VGPRs */
   bool nv;  /* non-volatile */
};
796
/* EXP: export to render targets / position / parameters */
struct Export_instruction : public Instruction {
   unsigned enabled_mask; /* which components are written */
   unsigned dest;         /* export target */
   bool compressed;       /* data is 16-bit compressed */
   bool done;             /* last export of the shader */
   bool valid_mask;
};
804
/* generic pseudo instruction (lowered in lower_to_hw_instr()) */
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
809
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
817
/* pseudo barrier; carries no fields beyond Instruction */
struct Pseudo_barrier_instruction : public Instruction {
};
820
/* Reduction operations for Pseudo_reduction_instruction,
 * each in a 32-bit and a 64-bit variant. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
};
836
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
854
/* Instructions are allocated with calloc (see create_instruction()), so
 * they must be released with free() rather than delete — no destructor
 * is run. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* owning pointer for instructions allocated by create_instruction() */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
863
/* Allocates a zero-initialized instruction of type T together with its
 * operand and definition arrays in one calloc'd block; the arrays are
 * placed directly behind the T object so a single free() (see
 * instr_deleter_functor) releases everything.
 * NOTE(review): neither constructors nor destructors ever run for T or
 * the array elements — this assumes all instruction types are trivial
 * enough that zero-initialization is a valid state; confirm. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* operands first, definitions directly after them */
   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
879
880 constexpr bool is_phi(Instruction* instr)
881 {
882 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
883 }
884
885 static inline bool is_phi(aco_ptr<Instruction>& instr)
886 {
887 return is_phi(instr.get());
888 }
889
/* Returns the memory-hazard class(es) of an instruction: formats that
 * carry an explicit barrier field report it, FLAT/GLOBAL count as
 * buffer accesses, DS as shared-memory accesses, everything else as
 * none. */
constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
      return barrier_buffer;
   case Format::DS:
      return barrier_shared;
   default:
      return barrier_none;
   }
}
908
/* Flags describing the role of a basic block in the CFG; a mask of
 * these is stored in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
};
927
928
929 struct RegisterDemand {
930 constexpr RegisterDemand() = default;
931 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
932 : vgpr{v}, sgpr{s} {}
933 int16_t vgpr = 0;
934 int16_t sgpr = 0;
935
936 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
937 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
938 }
939
940 constexpr bool exceeds(const RegisterDemand other) const noexcept {
941 return vgpr > other.vgpr || sgpr > other.sgpr;
942 }
943
944 constexpr RegisterDemand operator+(const Temp t) const noexcept {
945 if (t.type() == RegType::sgpr)
946 return RegisterDemand( vgpr, sgpr + t.size() );
947 else
948 return RegisterDemand( vgpr + t.size(), sgpr );
949 }
950
951 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
952 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
953 }
954
955 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
956 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
957 }
958
959 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
960 vgpr += other.vgpr;
961 sgpr += other.sgpr;
962 return *this;
963 }
964
965 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
966 vgpr -= other.vgpr;
967 sgpr -= other.sgpr;
968 return *this;
969 }
970
971 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
972 if (t.type() == RegType::sgpr)
973 sgpr += t.size();
974 else
975 vgpr += t.size();
976 return *this;
977 }
978
979 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
980 if (t.type() == RegType::sgpr)
981 sgpr -= t.size();
982 else
983 vgpr -= t.size();
984 return *this;
985 }
986
987 constexpr void update(const RegisterDemand other) noexcept {
988 vgpr = std::max(vgpr, other.vgpr);
989 sgpr = std::max(sgpr, other.sgpr);
990 }
991
992 };
993
/* CFG */
/* A basic block: its instructions plus predecessor/successor edges in
 * both the logical (per-thread) and linear (whole-wave) CFGs. */
struct Block {
   unsigned index;
   unsigned offset = 0; /* presumably the code offset, filled during assembly — confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;          /* mask of block_kind flags */
   int logical_idom = -1;      /* immediate dominator block index; -1 = not computed */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1018
/* A Stage is an OR of one-or-more software stages (bits 0-5, the APIs'
 * view) with exactly one hardware stage (bits 6-12, what the chip runs),
 * since several API stages can be merged onto one hardware stage. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7;
static constexpr Stage hw_gs = 1 << 8; /* not on GFX9. combined into ES on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_ls = 1 << 9;
static constexpr Stage hw_hs = 1 << 10; /* not on GFX9. combined into LS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_es = sw_vs | sw_gs | hw_es;
static constexpr Stage vertex_tess_control_ls = sw_vs | sw_tcs | hw_ls;
static constexpr Stage tess_eval_geometry_es = sw_tes | sw_gs | hw_es;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before GS */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1059
/* The whole shader program: its blocks plus global compilation state
 * shared by the passes declared at the end of this header. */
class Program final {
public:
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t sgpr_limit = 0;
   uint16_t num_waves = 0;
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;

   /* Returns a fresh SSA id. Ids must fit Temp's 24-bit id field,
    * hence the 16777215 cap. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* next id that allocateId() would return, without consuming it */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* appends a new empty block; its index is its position in 'blocks' */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      return &blocks.back();
   }

   /* appends an existing block, re-indexing it to its new position */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* id 0 is reserved for "no temporary" */
};
1107
/* Result of live_var_analysis(), indexed by block. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1114
/* Entry points of the individual compiler passes; presumably each is
 * implemented in its own aco_*.cpp — confirm against the build files. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_info *info,
                    struct radv_nir_compiler_options *options);

void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary, unsigned exec_size,
               enum radeon_family family, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
/* NOTE(review): the debug function takes (cond, msg, instr) but the
 * NDEBUG macro swallows (program, cond, msg, ...) — call sites cannot
 * satisfy both arities; confirm which signature is intended. */
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);
1154
/* Static per-opcode property tables: hardware opcode numbers for GFX9
 * and GFX10, which opcodes accept input/output modifiers, a printable
 * name and the encoding Format. The single instance is instr_info. */
typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;
1165
1166 }
1167
1168 #endif /* ACO_IR_H */
1169