src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <bitset>
  31 #include <memory>
  32
  33 #include "nir.h"
  34 #include "ac_binary.h"
  35 #include "amd_family.h"
  36 #include "aco_opcodes.h"
  37 #include "aco_util.h"
  38
  39 struct radv_nir_compiler_options;
  40 struct radv_shader_info;
  41
  42 namespace aco {
  43
  44 extern uint64_t debug_flags;
  45
  46 enum {
  47    DEBUG_VALIDATE = 0x1,
  48    DEBUG_VALIDATE_RA = 0x2,
  49    DEBUG_PERFWARN = 0x4,
  50 };
  51
  52 /**
  53  * Representation of the instruction's microcode encoding format
  54  * Note: Some Vector ALU Formats can be combined, such that:
  55  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  56  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  57  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  58  *
  59  * (*) The same is applicable for VOP1 and VOPC instructions.
  60  */
  61 enum class Format : std::uint16_t {
  62    /* Pseudo Instruction Format */
  63    PSEUDO = 0,
  64    /* Scalar ALU & Control Formats */
  65    SOP1 = 1,
  66    SOP2 = 2,
  67    SOPK = 3,
  68    SOPP = 4,
  69    SOPC = 5,
  70    /* Scalar Memory Format */
  71    SMEM = 6,
  72    /* LDS/GDS Format */
  73    DS = 8,
  74    /* Vector Memory Buffer Formats */
  75    MTBUF = 9,
  76    MUBUF = 10,
  77    /* Vector Memory Image Format */
  78    MIMG = 11,
  79    /* Export Format */
  80    EXP = 12,
  81    /* Flat Formats */
  82    FLAT = 13,
  83    GLOBAL = 14,
  84    SCRATCH = 15,
  85
  86    PSEUDO_BRANCH = 16,
  87    PSEUDO_BARRIER = 17,
  88    PSEUDO_REDUCTION = 18,
  89
  90    /* Vector ALU Formats */
  91    VOP1 = 1 << 8,
  92    VOP2 = 1 << 9,
  93    VOPC = 1 << 10,
  94    VOP3 = 1 << 11,
  95    VOP3A = 1 << 11,
  96    VOP3B = 1 << 11,
  97    VOP3P = 1 << 12,
  98    /* Vector Parameter Interpolation Format */
  99    VINTRP = 1 << 13,
 100    DPP = 1 << 14,
 101    SDWA = 1 << 15,
 102 };
 103
 104 enum barrier_interaction {
 105    barrier_none = 0,
 106    barrier_buffer = 0x1,
 107    barrier_image = 0x2,
 108    barrier_atomic = 0x4,
 109    barrier_shared = 0x8,
 110    barrier_count = 4,
 111 };
 112
 113 constexpr Format asVOP3(Format format) {
 114    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 115 };
 116
 117 enum class RegType {
 118    none = 0,
 119    sgpr,
 120    vgpr,
 121    linear_vgpr,
 122 };
 123
 124 struct RegClass {
 125
 126    enum RC : uint8_t {
 127       s1 = 1,
 128       s2 = 2,
 129       s3 = 3,
 130       s4 = 4,
 131       s6 = 6,
 132       s8 = 8,
 133       s16 = 16,
 134       v1 = s1 | (1 << 5),
 135       v2 = s2 | (1 << 5),
 136       v3 = s3 | (1 << 5),
 137       v4 = s4 | (1 << 5),
 138       v5 = 5  | (1 << 5),
 139       v6 = 6  | (1 << 5),
 140       v7 = 7  | (1 << 5),
 141       v8 = 8  | (1 << 5),
 142       /* these are used for WWM and spills to vgpr */
 143       v1_linear = v1 | (1 << 6),
 144       v2_linear = v2 | (1 << 6),
 145    };
 146
 147    RegClass() = default;
 148    constexpr RegClass(RC rc)
 149       : rc(rc) {}
 150    constexpr RegClass(RegType type, unsigned size)
 151       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 152
 153    constexpr operator RC() const { return rc; }
 154    explicit operator bool() = delete;
 155
 156    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 157    constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
 158    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 159    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 160
 161 private:
 162    RC rc;
 163 };
 164
 165 /* transitional helper expressions */
 166 static constexpr RegClass s1{RegClass::s1};
 167 static constexpr RegClass s2{RegClass::s2};
 168 static constexpr RegClass s3{RegClass::s3};
 169 static constexpr RegClass s4{RegClass::s4};
 170 static constexpr RegClass s8{RegClass::s8};
 171 static constexpr RegClass s16{RegClass::s16};
 172 static constexpr RegClass v1{RegClass::v1};
 173 static constexpr RegClass v2{RegClass::v2};
 174 static constexpr RegClass v3{RegClass::v3};
 175 static constexpr RegClass v4{RegClass::v4};
 176 static constexpr RegClass v5{RegClass::v5};
 177 static constexpr RegClass v6{RegClass::v6};
 178 static constexpr RegClass v7{RegClass::v7};
 179 static constexpr RegClass v8{RegClass::v8};
 180
 181 /**
 182  * Temp Class
 183  * Each temporary virtual register has a
 184  * register class (i.e. size and type)
 185  * and SSA id.
 186  */
 187 struct Temp {
 188    Temp() = default;
 189    constexpr Temp(uint32_t id, RegClass cls) noexcept
 190       : id_(id), reg_class(cls) {}
 191
 192    constexpr uint32_t id() const noexcept { return id_; }
 193    constexpr RegClass regClass() const noexcept { return reg_class; }
 194
 195    constexpr unsigned size() const noexcept { return reg_class.size(); }
 196    constexpr RegType type() const noexcept { return reg_class.type(); }
 197    constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
 198
 199    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 200    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 201    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 202
 203 private:
 204    uint32_t id_:24;
 205    RegClass reg_class;
 206 };
 207
 208 /**
 209  * PhysReg
 210  * Represents the physical register for each
 211  * Operand and Definition.
 212  */
 213 struct PhysReg {
 214    constexpr PhysReg() = default;
 215    explicit constexpr PhysReg(unsigned r) : reg(r) {}
 216    constexpr operator unsigned() const { return reg; }
 217
 218    uint16_t reg = 0;
 219 };
 220
 221 /* helper expressions for special registers */
 222 static constexpr PhysReg m0{124};
 223 static constexpr PhysReg vcc{106};
 224 static constexpr PhysReg exec{126};
 225 static constexpr PhysReg exec_lo{126};
 226 static constexpr PhysReg exec_hi{127};
 227 static constexpr PhysReg scc{253};
 228
 229 /**
 230  * Operand Class
 231  * Initially, each Operand refers to either
 232  * a temporary virtual register
 233  * or to a constant value
 234  * Temporary registers get mapped to physical register during RA
 235  * Constant values are inlined into the instruction sequence.
 236  */
 237 class Operand final
 238 {
 239 public:
 240    constexpr Operand()
 241       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 242         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
 243
 244    explicit Operand(Temp r) noexcept
 245    {
 246       data_.temp = r;
 247       if (r.id()) {
 248          isTemp_ = true;
 249       } else {
 250          isUndef_ = true;
 251          setFixed(PhysReg{128});
 252       }
 253    };
 254    explicit Operand(uint32_t v) noexcept
 255    {
 256       data_.i = v;
 257       isConstant_ = true;
 258       if (v <= 64)
 259          setFixed(PhysReg{128 + v});
 260       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 261          setFixed(PhysReg{192 - v});
 262       else if (v == 0x3f000000) /* 0.5 */
 263          setFixed(PhysReg{240});
 264       else if (v == 0xbf000000) /* -0.5 */
 265          setFixed(PhysReg{241});
 266       else if (v == 0x3f800000) /* 1.0 */
 267          setFixed(PhysReg{242});
 268       else if (v == 0xbf800000) /* -1.0 */
 269          setFixed(PhysReg{243});
 270       else if (v == 0x40000000) /* 2.0 */
 271          setFixed(PhysReg{244});
 272       else if (v == 0xc0000000) /* -2.0 */
 273          setFixed(PhysReg{245});
 274       else if (v == 0x40800000) /* 4.0 */
 275          setFixed(PhysReg{246});
 276       else if (v == 0xc0800000) /* -4.0 */
 277          setFixed(PhysReg{247});
 278       else if (v == 0x3e22f983) /* 1/(2*PI) */
 279          setFixed(PhysReg{248});
 280       else /* Literal Constant */
 281          setFixed(PhysReg{255});
 282    };
 283    explicit Operand(uint64_t v) noexcept
 284    {
 285       isConstant_ = true;
 286       is64BitConst_ = true;
 287       if (v <= 64)
 288          setFixed(PhysReg{128 + (uint32_t) v});
 289       else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
 290          setFixed(PhysReg{192 - (uint32_t) v});
 291       else if (v == 0x3FE0000000000000) /* 0.5 */
 292          setFixed(PhysReg{240});
 293       else if (v == 0xBFE0000000000000) /* -0.5 */
 294          setFixed(PhysReg{241});
 295       else if (v == 0x3FF0000000000000) /* 1.0 */
 296          setFixed(PhysReg{242});
 297       else if (v == 0xBFF0000000000000) /* -1.0 */
 298          setFixed(PhysReg{243});
 299       else if (v == 0x4000000000000000) /* 2.0 */
 300          setFixed(PhysReg{244});
 301       else if (v == 0xC000000000000000) /* -2.0 */
 302          setFixed(PhysReg{245});
 303       else if (v == 0x4010000000000000) /* 4.0 */
 304          setFixed(PhysReg{246});
 305       else if (v == 0xC010000000000000) /* -4.0 */
 306          setFixed(PhysReg{247});
 307       else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
 308          setFixed(PhysReg{248});
 309       else { /* Literal Constant: we don't know if it is a long or double.*/
 310          isConstant_ = 0;
 311          assert(false && "attempt to create a 64-bit literal constant");
 312       }
 313    };
 314    explicit Operand(RegClass type) noexcept
 315    {
 316       isUndef_ = true;
 317       data_.temp = Temp(0, type);
 318       setFixed(PhysReg{128});
 319    };
 320    explicit Operand(PhysReg reg, RegClass type) noexcept
 321    {
 322       data_.temp = Temp(0, type);
 323       setFixed(reg);
 324    }
 325
 326    constexpr bool isTemp() const noexcept
 327    {
 328       return isTemp_;
 329    }
 330
 331    constexpr void setTemp(Temp t) noexcept {
 332       assert(!isConstant_);
 333       isTemp_ = true;
 334       data_.temp = t;
 335    }
 336
 337    constexpr Temp getTemp() const noexcept
 338    {
 339       return data_.temp;
 340    }
 341
 342    constexpr uint32_t tempId() const noexcept
 343    {
 344       return data_.temp.id();
 345    }
 346
 347    constexpr bool hasRegClass() const noexcept
 348    {
 349       return isTemp() || isUndefined();
 350    }
 351
 352    constexpr RegClass regClass() const noexcept
 353    {
 354       return data_.temp.regClass();
 355    }
 356
 357    constexpr unsigned size() const noexcept
 358    {
 359       if (isConstant())
 360          return is64BitConst_ ? 2 : 1;
 361       else
 362          return data_.temp.size();
 363    }
 364
 365    constexpr bool isFixed() const noexcept
 366    {
 367       return isFixed_;
 368    }
 369
 370    constexpr PhysReg physReg() const noexcept
 371    {
 372       return reg_;
 373    }
 374
 375    constexpr void setFixed(PhysReg reg) noexcept
 376    {
 377       isFixed_ = reg != unsigned(-1);
 378       reg_ = reg;
 379    }
 380
 381    constexpr bool isConstant() const noexcept
 382    {
 383       return isConstant_;
 384    }
 385
 386    constexpr bool isLiteral() const noexcept
 387    {
 388       return isConstant() && reg_ == 255;
 389    }
 390
 391    constexpr bool isUndefined() const noexcept
 392    {
 393       return isUndef_;
 394    }
 395
 396    constexpr uint32_t constantValue() const noexcept
 397    {
 398       return data_.i;
 399    }
 400
 401    constexpr bool constantEquals(uint32_t cmp) const noexcept
 402    {
 403       return isConstant() && constantValue() == cmp;
 404    }
 405
 406    constexpr void setKill(bool flag) noexcept
 407    {
 408       isKill_ = flag;
 409       if (!flag)
 410          setFirstKill(false);
 411    }
 412
 413    constexpr bool isKill() const noexcept
 414    {
 415       return isKill_ || isFirstKill();
 416    }
 417
 418    constexpr void setFirstKill(bool flag) noexcept
 419    {
 420       isFirstKill_ = flag;
 421       if (flag)
 422          setKill(flag);
 423    }
 424
 425    /* When there are multiple operands killing the same temporary,
 426     * isFirstKill() is only returns true for the first one. */
 427    constexpr bool isFirstKill() const noexcept
 428    {
 429       return isFirstKill_;
 430    }
 431
 432 private:
 433    union {
 434       uint32_t i;
 435       float f;
 436       Temp temp = Temp(0, s1);
 437    } data_;
 438    PhysReg reg_;
 439    union {
 440       struct {
 441          uint8_t isTemp_:1;
 442          uint8_t isFixed_:1;
 443          uint8_t isConstant_:1;
 444          uint8_t isKill_:1;
 445          uint8_t isUndef_:1;
 446          uint8_t isFirstKill_:1;
 447          uint8_t is64BitConst_:1;
 448       };
 449       /* can't initialize bit-fields in c++11, so work around using a union */
 450       uint8_t control_ = 0;
 451    };
 452 };
 453
 454 /**
 455  * Definition Class
 456  * Definitions are the results of Instructions
 457  * and refer to temporary virtual registers
 458  * which are later mapped to physical registers
 459  */
 460 class Definition final
 461 {
 462 public:
 463    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
 464    Definition(uint32_t index, RegClass type) noexcept
 465       : temp(index, type) {}
 466    explicit Definition(Temp tmp) noexcept
 467       : temp(tmp) {}
 468    Definition(PhysReg reg, RegClass type) noexcept
 469       : temp(Temp(0, type))
 470    {
 471       setFixed(reg);
 472    }
 473    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 474       : temp(Temp(tmpId, type))
 475    {
 476       setFixed(reg);
 477    }
 478
 479    constexpr bool isTemp() const noexcept
 480    {
 481       return tempId() > 0;
 482    }
 483
 484    constexpr Temp getTemp() const noexcept
 485    {
 486       return temp;
 487    }
 488
 489    constexpr uint32_t tempId() const noexcept
 490    {
 491       return temp.id();
 492    }
 493
 494    constexpr void setTemp(Temp t) noexcept {
 495       temp = t;
 496    }
 497
 498    constexpr RegClass regClass() const noexcept
 499    {
 500       return temp.regClass();
 501    }
 502
 503    constexpr unsigned size() const noexcept
 504    {
 505       return temp.size();
 506    }
 507
 508    constexpr bool isFixed() const noexcept
 509    {
 510       return isFixed_;
 511    }
 512
 513    constexpr PhysReg physReg() const noexcept
 514    {
 515       return reg_;
 516    }
 517
 518    constexpr void setFixed(PhysReg reg) noexcept
 519    {
 520       isFixed_ = 1;
 521       reg_ = reg;
 522    }
 523
 524    constexpr void setHint(PhysReg reg) noexcept
 525    {
 526       hasHint_ = 1;
 527       reg_ = reg;
 528    }
 529
 530    constexpr bool hasHint() const noexcept
 531    {
 532       return hasHint_;
 533    }
 534
 535    constexpr void setKill(bool flag) noexcept
 536    {
 537       isKill_ = flag;
 538    }
 539
 540    constexpr bool isKill() const noexcept
 541    {
 542       return isKill_;
 543    }
 544
 545 private:
 546    Temp temp = Temp(0, s1);
 547    PhysReg reg_;
 548    union {
 549       struct {
 550          uint8_t isFixed_:1;
 551          uint8_t hasHint_:1;
 552          uint8_t isKill_:1;
 553       };
 554       /* can't initialize bit-fields in c++11, so work around using a union */
 555       uint8_t control_ = 0;
 556    };
 557 };
 558
 559 class Block;
 560
 561 struct Instruction {
 562    aco_opcode opcode;
 563    Format format;
 564    uint32_t pass_flags;
 565
 566    aco::span<Operand> operands;
 567    aco::span<Definition> definitions;
 568
 569    constexpr bool isVALU() const noexcept
 570    {
 571       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 572           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 573           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 574           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 575           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 576           || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
 577    }
 578
 579    constexpr bool isSALU() const noexcept
 580    {
 581       return format == Format::SOP1 ||
 582              format == Format::SOP2 ||
 583              format == Format::SOPC ||
 584              format == Format::SOPK ||
 585              format == Format::SOPP;
 586    }
 587
 588    constexpr bool isVMEM() const noexcept
 589    {
 590       return format == Format::MTBUF ||
 591              format == Format::MUBUF ||
 592              format == Format::MIMG;
 593    }
 594
 595    constexpr bool isDPP() const noexcept
 596    {
 597       return (uint16_t) format & (uint16_t) Format::DPP;
 598    }
 599
 600    constexpr bool isVOP3() const noexcept
 601    {
 602       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 603              ((uint16_t) format & (uint16_t) Format::VOP3B) ||
 604              format == Format::VOP3P;
 605    }
 606
 607    constexpr bool isSDWA() const noexcept
 608    {
 609       return (uint16_t) format & (uint16_t) Format::SDWA;
 610    }
 611
 612    constexpr bool isFlatOrGlobal() const noexcept
 613    {
 614       return format == Format::FLAT || format == Format::GLOBAL;
 615    }
 616 };
 617
 618 struct SOPK_instruction : public Instruction {
 619    uint16_t imm;
 620 };
 621
 622 struct SOPP_instruction : public Instruction {
 623    uint32_t imm;
 624    int block;
 625 };
 626
 627 struct SOPC_instruction : public Instruction {
 628 };
 629
 630 struct SOP1_instruction : public Instruction {
 631 };
 632
 633 struct SOP2_instruction : public Instruction {
 634 };
 635
 636 /**
 637  * Scalar Memory Format:
 638  * For s_(buffer_)load_dword*:
 639  * Operand(0): SBASE - SGPR-pair which provides base address
 640  * Operand(1): Offset - immediate (un)signed offset or SGPR
 641  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 642  * Operand(n-1): SOffset - SGPR offset (Vega only)
 643  *
 644  * Having no operands is also valid for instructions such as s_dcache_inv.
 645  *
 646  */
 647 struct SMEM_instruction : public Instruction {
 648    bool glc; /* VI+: globally coherent */
 649    bool dlc; /* NAVI: device level coherent */
 650    bool nv; /* VEGA only: Non-volatile */
 651    bool can_reorder;
 652    bool disable_wqm;
 653    barrier_interaction barrier;
 654 };
 655
 656 struct VOP1_instruction : public Instruction {
 657 };
 658
 659 struct VOP2_instruction : public Instruction {
 660 };
 661
 662 struct VOPC_instruction : public Instruction {
 663 };
 664
 665 struct VOP3A_instruction : public Instruction {
 666    bool abs[3];
 667    bool opsel[3];
 668    bool clamp;
 669    unsigned omod;
 670    bool neg[3];
 671 };
 672
 673 /**
 674  * Data Parallel Primitives Format:
 675  * This format can be used for VOP1, VOP2 or VOPC instructions.
 676  * The swizzle applies to the src0 operand.
 677  *
 678  */
 679 struct DPP_instruction : public Instruction {
 680    uint16_t dpp_ctrl;
 681    uint8_t row_mask;
 682    uint8_t bank_mask;
 683    bool abs[2];
 684    bool neg[2];
 685    bool bound_ctrl;
 686 };
 687
 688 struct Interp_instruction : public Instruction {
 689    unsigned attribute;
 690    unsigned component;
 691 };
 692
 693 /**
 694  * Local and Global Data Sharing instructions
 695  * Operand(0): ADDR - VGPR which supplies the address.
 696  * Operand(1): DATA0 - First data VGPR.
 697  * Operand(2): DATA1 - Second data VGPR.
 698  * Operand(n-1): M0 - LDS size.
 699  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 700  *
 701  */
 702 struct DS_instruction : public Instruction {
 703    int16_t offset0;
 704    int8_t offset1;
 705    bool gds;
 706 };
 707
 708 /**
 709  * Vector Memory Untyped-buffer Instructions
 710  * Operand(0): VADDR - Address source. Can carry an index and/or offset
 711  * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 712  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 713  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 714  *
 715  */
 716 struct MUBUF_instruction : public Instruction {
 717    unsigned offset; /* Unsigned byte offset - 12 bit */
 718    bool offen; /* Supply an offset from VGPR (VADDR) */
 719    bool idxen; /* Supply an index from VGPR (VADDR) */
 720    bool glc; /* globally coherent */
 721    bool dlc; /* NAVI: device level coherent */
 722    bool slc; /* system level coherent */
 723    bool tfe; /* texture fail enable */
 724    bool lds; /* Return read-data to LDS instead of VGPRs */
 725    bool disable_wqm; /* Require an exec mask without helper invocations */
 726    bool can_reorder;
 727    barrier_interaction barrier;
 728 };
 729
 730 /**
 731  * Vector Memory Typed-buffer Instructions
 732  * Operand(0): VADDR - Address source. Can carry an index and/or offset
 733  * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 734  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 735  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 736  *
 737  */
 738 struct MTBUF_instruction : public Instruction {
 739    union {
 740       struct {
 741          uint8_t dfmt : 4; /* Data Format of data in memory buffer */
 742          uint8_t nfmt : 3; /* Numeric format of data in memory */
 743       };
 744       uint8_t img_format; /* Buffer or image format as used by GFX10 */
 745    };
 746    unsigned offset; /* Unsigned byte offset - 12 bit */
 747    bool offen; /* Supply an offset from VGPR (VADDR) */
 748    bool idxen; /* Supply an index from VGPR (VADDR) */
 749    bool glc; /* globally coherent */
 750    bool dlc; /* NAVI: device level coherent */
 751    bool slc; /* system level coherent */
 752    bool tfe; /* texture fail enable */
 753    bool disable_wqm; /* Require an exec mask without helper invocations */
 754    bool can_reorder;
 755    barrier_interaction barrier;
 756 };
 757
 758 /**
 759  * Vector Memory Image Instructions
 760  * Operand(0): VADDR - Address source. Can carry an offset or an index.
 761  * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 762  * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 763  * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 764  *
 765  */
 766 struct MIMG_instruction : public Instruction {
 767    unsigned dmask; /* Data VGPR enable mask */
 768    bool unrm; /* Force address to be un-normalized */
 769    bool dlc; /* NAVI: device level coherent */
 770    bool glc; /* globally coherent */
 771    bool slc; /* system level coherent */
 772    bool tfe; /* texture fail enable */
 773    bool da; /* declare an array */
 774    bool lwe; /* Force data to be un-normalized */
 775    bool r128; /* NAVI: Texture resource size */
 776    bool a16; /* VEGA, NAVI: Address components are 16-bits */
 777    bool d16; /* Convert 32-bit data to 16-bit data */
 778    bool disable_wqm; /* Require an exec mask without helper invocations */
 779    bool can_reorder;
 780    barrier_interaction barrier;
 781 };
 782
 783 /**
 784  * Flat/Scratch/Global Instructions
 785  * Operand(0): ADDR
 786  * Operand(1): SADDR
 787  * Operand(2) / Definition(0): DATA/VDST
 788  *
 789  */
 790 struct FLAT_instruction : public Instruction {
 791    uint16_t offset; /* Vega only */
 792    bool slc;
 793    bool glc;
 794    bool lds;
 795    bool nv;
 796 };
 797
 798 struct Export_instruction : public Instruction {
 799    unsigned enabled_mask;
 800    unsigned dest;
 801    bool compressed;
 802    bool done;
 803    bool valid_mask;
 804 };
 805
 806 struct Pseudo_instruction : public Instruction {
 807    bool tmp_in_scc;
 808    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
 809 };
 810
 811 struct Pseudo_branch_instruction : public Instruction {
 812    /* target[0] is the block index of the branch target.
 813     * For conditional branches, target[1] contains the fall-through alternative.
 814     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
 815     */
 816    uint32_t target[2];
 817 };
 818
 819 struct Pseudo_barrier_instruction : public Instruction {
 820 };
 821
 822 enum ReduceOp {
 823    iadd32, iadd64,
 824    imul32, imul64,
 825    fadd32, fadd64,
 826    fmul32, fmul64,
 827    imin32, imin64,
 828    imax32, imax64,
 829    umin32, umin64,
 830    umax32, umax64,
 831    fmin32, fmin64,
 832    fmax32, fmax64,
 833    iand32, iand64,
 834    ior32, ior64,
 835    ixor32, ixor64,
 836 };
 837
 838 /**
 839  * Subgroup Reduction Instructions, everything except for the data to be
 840  * reduced and the result as inserted by setup_reduce_temp().
 841  * Operand(0): data to be reduced
 842  * Operand(1): reduce temporary
 843  * Operand(2): vector temporary
 844  * Definition(0): result
 845  * Definition(1): scalar temporary
 846  * Definition(2): scalar identity temporary
 847  * Definition(3): scc clobber
 848  * Definition(4): vcc clobber
 849  *
 850  */
 851 struct Pseudo_reduction_instruction : public Instruction {
 852    ReduceOp reduce_op;
 853    unsigned cluster_size; // must be 0 for scans
 854 };
 855
 856 struct instr_deleter_functor {
 857    void operator()(void* p) {
 858       free(p);
 859    }
 860 };
 861
 862 template<typename T>
 863 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
 864
 865 template<typename T>
 866 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
 867 {
 868    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
 869    char *data = (char*) calloc(1, size);
 870    T* inst = (T*) data;
 871
 872    inst->opcode = opcode;
 873    inst->format = format;
 874
 875    inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
 876    inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);
 877
 878    return inst;
 879 }
 880
 881 constexpr bool is_phi(Instruction* instr)
 882 {
 883    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
 884 }
 885
 886 static inline bool is_phi(aco_ptr<Instruction>& instr)
 887 {
 888    return is_phi(instr.get());
 889 }
 890
 891 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
 892 {
 893    switch (instr->format) {
 894    case Format::SMEM:
 895       return static_cast<SMEM_instruction*>(instr)->barrier;
 896    case Format::MUBUF:
 897       return static_cast<MUBUF_instruction*>(instr)->barrier;
 898    case Format::MIMG:
 899       return static_cast<MIMG_instruction*>(instr)->barrier;
 900    case Format::FLAT:
 901    case Format::GLOBAL:
 902       return barrier_buffer;
 903    case Format::DS:
 904       return barrier_shared;
 905    default:
 906       return barrier_none;
 907    }
 908 }
 909
 910 enum block_kind {
 911    /* uniform indicates that leaving this block,
 912     * all actives lanes stay active */
 913    block_kind_uniform = 1 << 0,
 914    block_kind_top_level = 1 << 1,
 915    block_kind_loop_preheader = 1 << 2,
 916    block_kind_loop_header = 1 << 3,
 917    block_kind_loop_exit = 1 << 4,
 918    block_kind_continue = 1 << 5,
 919    block_kind_break = 1 << 6,
 920    block_kind_continue_or_break = 1 << 7,
 921    block_kind_discard = 1 << 8,
 922    block_kind_branch = 1 << 9,
 923    block_kind_merge = 1 << 10,
 924    block_kind_invert = 1 << 11,
 925    block_kind_uses_discard_if = 1 << 12,
 926    block_kind_needs_lowering = 1 << 13,
 927    block_kind_uses_demote = 1 << 14,
 928 };
 929
 930
 931 struct RegisterDemand {
 932    constexpr RegisterDemand() = default;
 933    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
 934       : vgpr{v}, sgpr{s} {}
 935    int16_t vgpr = 0;
 936    int16_t sgpr = 0;
 937
 938    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
 939       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
 940    }
 941
 942    constexpr bool exceeds(const RegisterDemand other) const noexcept {
 943       return vgpr > other.vgpr || sgpr > other.sgpr;
 944    }
 945
 946    constexpr RegisterDemand operator+(const Temp t) const noexcept {
 947       if (t.type() == RegType::sgpr)
 948          return RegisterDemand( vgpr, sgpr + t.size() );
 949       else
 950          return RegisterDemand( vgpr + t.size(), sgpr );
 951    }
 952
 953    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
 954       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
 955    }
 956
 957    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
 958       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
 959    }
 960
 961    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
 962       vgpr += other.vgpr;
 963       sgpr += other.sgpr;
 964       return *this;
 965    }
 966
 967    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
 968       vgpr -= other.vgpr;
 969       sgpr -= other.sgpr;
 970       return *this;
 971    }
 972
 973    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
 974       if (t.type() == RegType::sgpr)
 975          sgpr += t.size();
 976       else
 977          vgpr += t.size();
 978       return *this;
 979    }
 980
 981    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
 982       if (t.type() == RegType::sgpr)
 983          sgpr -= t.size();
 984       else
 985          vgpr -= t.size();
 986       return *this;
 987    }
 988
 989    constexpr void update(const RegisterDemand other) noexcept {
 990       vgpr = std::max(vgpr, other.vgpr);
 991       sgpr = std::max(sgpr, other.sgpr);
 992    }
 993
 994 };
 995
 996 /* CFG */
 997 struct Block {
 998    unsigned index;
 999    unsigned offset = 0;
1000    std::vector<aco_ptr<Instruction>> instructions;
1001    std::vector<unsigned> logical_preds;
1002    std::vector<unsigned> linear_preds;
1003    std::vector<unsigned> logical_succs;
1004    std::vector<unsigned> linear_succs;
1005    RegisterDemand register_demand = RegisterDemand();
1006    uint16_t loop_nest_depth = 0;
1007    uint16_t kind = 0;
1008    int logical_idom = -1;
1009    int linear_idom = -1;
1010    Temp live_out_exec = Temp();
1011
1012    /* this information is needed for predecessors to blocks with phis when
1013     * moving out of ssa */
1014    bool scc_live_out = false;
1015    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1016
1017    Block(unsigned idx) : index(idx) {}
1018    Block() : index(0) {}
1019 };
1020
1021 using Stage = uint16_t;
1022
1023 /* software stages */
1024 static constexpr Stage sw_vs = 1 << 0;
1025 static constexpr Stage sw_gs = 1 << 1;
1026 static constexpr Stage sw_tcs = 1 << 2;
1027 static constexpr Stage sw_tes = 1 << 3;
1028 static constexpr Stage sw_fs = 1 << 4;
1029 static constexpr Stage sw_cs = 1 << 5;
1030 static constexpr Stage sw_mask = 0x3f;
1031
1032 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1033 static constexpr Stage hw_vs = 1 << 6;
1034 static constexpr Stage hw_es = 1 << 7;
1035 static constexpr Stage hw_gs = 1 << 8; /* not on GFX9. combined into ES on GFX9 (and GFX10/legacy). */
1036 static constexpr Stage hw_ls = 1 << 9;
1037 static constexpr Stage hw_hs = 1 << 10; /* not on GFX9. combined into LS on GFX9 (and GFX10/legacy). */
1038 static constexpr Stage hw_fs = 1 << 11;
1039 static constexpr Stage hw_cs = 1 << 12;
1040 static constexpr Stage hw_mask = 0x7f << 6;
1041
1042 /* possible settings of Program::stage */
1043 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1044 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1045 static constexpr Stage compute_cs = sw_cs | hw_cs;
1046 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1047 /* GFX10/NGG */
1048 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1049 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1050 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1051 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1052 /* GFX9 (and GFX10 if NGG isn't used) */
1053 static constexpr Stage vertex_geometry_es = sw_vs | sw_gs | hw_es;
1054 static constexpr Stage vertex_tess_control_ls = sw_vs | sw_tcs | hw_ls;
1055 static constexpr Stage tess_eval_geometry_es = sw_tes | sw_gs | hw_es;
1056 /* pre-GFX9 */
1057 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
1058 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1059 static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before GS */
1060 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1061
1062 class Program final {
1063 public:
1064    std::vector<Block> blocks;
1065    RegisterDemand max_reg_demand = RegisterDemand();
1066    uint16_t sgpr_limit = 0;
1067    uint16_t num_waves = 0;
1068    ac_shader_config* config;
1069    struct radv_shader_info *info;
1070    enum chip_class chip_class;
1071    enum radeon_family family;
1072    Stage stage; /* Stage */
1073    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1074    bool needs_wqm = false; /* there exists a p_wqm instruction */
1075    bool wb_smem_l1_on_end = false;
1076
1077    std::vector<uint8_t> constant_data;
1078
1079    uint32_t allocateId()
1080    {
1081       assert(allocationID <= 16777215);
1082       return allocationID++;
1083    }
1084
1085    uint32_t peekAllocationId()
1086    {
1087       return allocationID;
1088    }
1089
1090    void setAllocationId(uint32_t id)
1091    {
1092       allocationID = id;
1093    }
1094
1095    Block* create_and_insert_block() {
1096       blocks.emplace_back(blocks.size());
1097       return &blocks.back();
1098    }
1099
1100    Block* insert_block(Block&& block) {
1101       block.index = blocks.size();
1102       blocks.emplace_back(std::move(block));
1103       return &blocks.back();
1104    }
1105
1106 private:
1107    uint32_t allocationID = 1;
1108 };
1109
1110 struct live {
1111    /* live temps out per block */
1112    std::vector<std::set<Temp>> live_out;
1113    /* register demand (sgpr/vgpr) per instruction per block */
1114    std::vector<std::vector<RegisterDemand>> register_demand;
1115 };
1116
1117 void select_program(Program *program,
1118                     unsigned shader_count,
1119                     struct nir_shader *const *shaders,
1120                     ac_shader_config* config,
1121                     struct radv_shader_info *info,
1122                     struct radv_nir_compiler_options *options);
1123
1124 void lower_wqm(Program* program, live& live_vars,
1125                const struct radv_nir_compiler_options *options);
1126 void lower_bool_phis(Program* program);
1127 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1128 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1129 std::vector<uint16_t> dead_code_analysis(Program *program);
1130 void dominator_tree(Program* program);
1131 void insert_exec_mask(Program *program);
1132 void value_numbering(Program* program);
1133 void optimize(Program* program);
1134 void setup_reduce_temp(Program* program);
1135 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1136 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1137 void ssa_elimination(Program* program);
1138 void lower_to_hw_instr(Program* program);
1139 void schedule_program(Program* program, live& live_vars);
1140 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1141 void insert_wait_states(Program* program);
1142 void insert_NOPs(Program* program);
1143 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1144 void print_asm(Program *program, std::vector<uint32_t>& binary, unsigned exec_size,
1145                enum radeon_family family, std::ostream& out);
1146 void validate(Program* program, FILE *output);
1147 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1148 #ifndef NDEBUG
1149 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1150 #else
1151 #define perfwarn(program, cond, msg, ...)
1152 #endif
1153
1154 void aco_print_instr(Instruction *instr, FILE *output);
1155 void aco_print_program(Program *program, FILE *output);
1156
1157 typedef struct {
1158    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1159    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1160    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1161    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1162    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1163    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1164 } Info;
1165
1166 extern const Info instr_info;
1167
1168 }
1169
1170 #endif /* ACO_IR_H */
1171