src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <bitset>
  31 #include <memory>
  32
  33 #include "nir.h"
  34 #include "ac_binary.h"
  35 #include "amd_family.h"
  36 #include "aco_opcodes.h"
  37 #include "aco_util.h"
  38
  39 struct radv_nir_compiler_options;
  40 struct radv_shader_info;
  41
  42 namespace aco {
  43
  44 extern uint64_t debug_flags;
  45
  46 enum {
  47    DEBUG_VALIDATE = 0x1,
  48    DEBUG_VALIDATE_RA = 0x2,
  49    DEBUG_PERFWARN = 0x4,
  50 };
  51
  52 /**
  53  * Representation of the instruction's microcode encoding format
  54  * Note: Some Vector ALU Formats can be combined, such that:
  55  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  56  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  57  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  58  *
  59  * (*) The same is applicable for VOP1 and VOPC instructions.
  60  */
  61 enum class Format : std::uint16_t {
  62    /* Pseudo Instruction Format */
  63    PSEUDO = 0,
  64    /* Scalar ALU & Control Formats */
  65    SOP1 = 1,
  66    SOP2 = 2,
  67    SOPK = 3,
  68    SOPP = 4,
  69    SOPC = 5,
  70    /* Scalar Memory Format */
  71    SMEM = 6,
  72    /* LDS/GDS Format */
  73    DS = 8,
  74    /* Vector Memory Buffer Formats */
  75    MTBUF = 9,
  76    MUBUF = 10,
  77    /* Vector Memory Image Format */
  78    MIMG = 11,
  79    /* Export Format */
  80    EXP = 12,
  81    /* Flat Formats */
  82    FLAT = 13,
  83    GLOBAL = 14,
  84    SCRATCH = 15,
  85
  86    PSEUDO_BRANCH = 16,
  87    PSEUDO_BARRIER = 17,
  88    PSEUDO_REDUCTION = 18,
  89
  90    /* Vector ALU Formats */
  91    VOP1 = 1 << 8,
  92    VOP2 = 1 << 9,
  93    VOPC = 1 << 10,
  94    VOP3 = 1 << 11,
  95    VOP3A = 1 << 11,
  96    VOP3B = 1 << 11,
  97    VOP3P = 1 << 12,
  98    /* Vector Parameter Interpolation Format */
  99    VINTRP = 1 << 13,
 100    DPP = 1 << 14,
 101    SDWA = 1 << 15,
 102 };
 103
 104 enum barrier_interaction {
 105    barrier_none = 0,
 106    barrier_buffer = 0x1,
 107    barrier_image = 0x2,
 108    barrier_atomic = 0x4,
 109    barrier_shared = 0x8,
 110    barrier_count = 4,
 111 };
 112
 113 constexpr Format asVOP3(Format format) {
 114    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 115 };
 116
 117 enum class RegType {
 118    none = 0,
 119    sgpr,
 120    vgpr,
 121    linear_vgpr,
 122 };
 123
 124 struct RegClass {
 125
 126    enum RC : uint8_t {
 127       s1 = 1,
 128       s2 = 2,
 129       s3 = 3,
 130       s4 = 4,
 131       s6 = 6,
 132       s8 = 8,
 133       s16 = 16,
 134       v1 = s1 | (1 << 5),
 135       v2 = s2 | (1 << 5),
 136       v3 = s3 | (1 << 5),
 137       v4 = s4 | (1 << 5),
 138       v5 = 5  | (1 << 5),
 139       v6 = 6  | (1 << 5),
 140       v7 = 7  | (1 << 5),
 141       v8 = 8  | (1 << 5),
 142       /* these are used for WWM and spills to vgpr */
 143       v1_linear = v1 | (1 << 6),
 144       v2_linear = v2 | (1 << 6),
 145    };
 146
 147    RegClass() = default;
 148    constexpr RegClass(RC rc)
 149       : rc(rc) {}
 150    constexpr RegClass(RegType type, unsigned size)
 151       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 152
 153    constexpr operator RC() const { return rc; }
 154    explicit operator bool() = delete;
 155
 156    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 157    constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
 158    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 159    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 160
 161 private:
 162    RC rc;
 163 };
 164
 165 /* transitional helper expressions */
 166 static constexpr RegClass s1{RegClass::s1};
 167 static constexpr RegClass s2{RegClass::s2};
 168 static constexpr RegClass s3{RegClass::s3};
 169 static constexpr RegClass s4{RegClass::s4};
 170 static constexpr RegClass s8{RegClass::s8};
 171 static constexpr RegClass s16{RegClass::s16};
 172 static constexpr RegClass v1{RegClass::v1};
 173 static constexpr RegClass v2{RegClass::v2};
 174 static constexpr RegClass v3{RegClass::v3};
 175 static constexpr RegClass v4{RegClass::v4};
 176 static constexpr RegClass v5{RegClass::v5};
 177 static constexpr RegClass v6{RegClass::v6};
 178 static constexpr RegClass v7{RegClass::v7};
 179 static constexpr RegClass v8{RegClass::v8};
 180
 181 /**
 182  * Temp Class
 183  * Each temporary virtual register has a
 184  * register class (i.e. size and type)
 185  * and SSA id.
 186  */
 187 struct Temp {
 188    Temp() = default;
 189    constexpr Temp(uint32_t id, RegClass cls) noexcept
 190       : id_(id), reg_class(cls) {}
 191
 192    constexpr uint32_t id() const noexcept { return id_; }
 193    constexpr RegClass regClass() const noexcept { return reg_class; }
 194
 195    constexpr unsigned size() const noexcept { return reg_class.size(); }
 196    constexpr RegType type() const noexcept { return reg_class.type(); }
 197    constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
 198
 199    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 200    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 201    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 202
 203 private:
 204    uint32_t id_:24;
 205    RegClass reg_class;
 206 };
 207
 208 /**
 209  * PhysReg
 210  * Represents the physical register for each
 211  * Operand and Definition.
 212  */
 213 struct PhysReg {
 214    constexpr PhysReg() = default;
 215    explicit constexpr PhysReg(unsigned r) : reg(r) {}
 216    constexpr operator unsigned() const { return reg; }
 217
 218    uint16_t reg = 0;
 219 };
 220
 221 /* helper expressions for special registers */
 222 static constexpr PhysReg m0{124};
 223 static constexpr PhysReg vcc{106};
 224 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 225 static constexpr PhysReg exec{126};
 226 static constexpr PhysReg exec_lo{126};
 227 static constexpr PhysReg exec_hi{127};
 228 static constexpr PhysReg scc{253};
 229
 230 /**
 231  * Operand Class
 232  * Initially, each Operand refers to either
 233  * a temporary virtual register
 234  * or to a constant value
 235  * Temporary registers get mapped to physical register during RA
 236  * Constant values are inlined into the instruction sequence.
 237  */
 238 class Operand final
 239 {
 240 public:
 241    constexpr Operand()
 242       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 243         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
 244
 245    explicit Operand(Temp r) noexcept
 246    {
 247       data_.temp = r;
 248       if (r.id()) {
 249          isTemp_ = true;
 250       } else {
 251          isUndef_ = true;
 252          setFixed(PhysReg{128});
 253       }
 254    };
 255    explicit Operand(uint32_t v) noexcept
 256    {
 257       data_.i = v;
 258       isConstant_ = true;
 259       if (v <= 64)
 260          setFixed(PhysReg{128 + v});
 261       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 262          setFixed(PhysReg{192 - v});
 263       else if (v == 0x3f000000) /* 0.5 */
 264          setFixed(PhysReg{240});
 265       else if (v == 0xbf000000) /* -0.5 */
 266          setFixed(PhysReg{241});
 267       else if (v == 0x3f800000) /* 1.0 */
 268          setFixed(PhysReg{242});
 269       else if (v == 0xbf800000) /* -1.0 */
 270          setFixed(PhysReg{243});
 271       else if (v == 0x40000000) /* 2.0 */
 272          setFixed(PhysReg{244});
 273       else if (v == 0xc0000000) /* -2.0 */
 274          setFixed(PhysReg{245});
 275       else if (v == 0x40800000) /* 4.0 */
 276          setFixed(PhysReg{246});
 277       else if (v == 0xc0800000) /* -4.0 */
 278          setFixed(PhysReg{247});
 279       else if (v == 0x3e22f983) /* 1/(2*PI) */
 280          setFixed(PhysReg{248});
 281       else /* Literal Constant */
 282          setFixed(PhysReg{255});
 283    };
 284    explicit Operand(uint64_t v) noexcept
 285    {
 286       isConstant_ = true;
 287       is64BitConst_ = true;
 288       if (v <= 64)
 289          setFixed(PhysReg{128 + (uint32_t) v});
 290       else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
 291          setFixed(PhysReg{192 - (uint32_t) v});
 292       else if (v == 0x3FE0000000000000) /* 0.5 */
 293          setFixed(PhysReg{240});
 294       else if (v == 0xBFE0000000000000) /* -0.5 */
 295          setFixed(PhysReg{241});
 296       else if (v == 0x3FF0000000000000) /* 1.0 */
 297          setFixed(PhysReg{242});
 298       else if (v == 0xBFF0000000000000) /* -1.0 */
 299          setFixed(PhysReg{243});
 300       else if (v == 0x4000000000000000) /* 2.0 */
 301          setFixed(PhysReg{244});
 302       else if (v == 0xC000000000000000) /* -2.0 */
 303          setFixed(PhysReg{245});
 304       else if (v == 0x4010000000000000) /* 4.0 */
 305          setFixed(PhysReg{246});
 306       else if (v == 0xC010000000000000) /* -4.0 */
 307          setFixed(PhysReg{247});
 308       else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
 309          setFixed(PhysReg{248});
 310       else { /* Literal Constant: we don't know if it is a long or double.*/
 311          isConstant_ = 0;
 312          assert(false && "attempt to create a 64-bit literal constant");
 313       }
 314    };
 315    explicit Operand(RegClass type) noexcept
 316    {
 317       isUndef_ = true;
 318       data_.temp = Temp(0, type);
 319       setFixed(PhysReg{128});
 320    };
 321    explicit Operand(PhysReg reg, RegClass type) noexcept
 322    {
 323       data_.temp = Temp(0, type);
 324       setFixed(reg);
 325    }
 326
 327    constexpr bool isTemp() const noexcept
 328    {
 329       return isTemp_;
 330    }
 331
 332    constexpr void setTemp(Temp t) noexcept {
 333       assert(!isConstant_);
 334       isTemp_ = true;
 335       data_.temp = t;
 336    }
 337
 338    constexpr Temp getTemp() const noexcept
 339    {
 340       return data_.temp;
 341    }
 342
 343    constexpr uint32_t tempId() const noexcept
 344    {
 345       return data_.temp.id();
 346    }
 347
 348    constexpr bool hasRegClass() const noexcept
 349    {
 350       return isTemp() || isUndefined();
 351    }
 352
 353    constexpr RegClass regClass() const noexcept
 354    {
 355       return data_.temp.regClass();
 356    }
 357
 358    constexpr unsigned size() const noexcept
 359    {
 360       if (isConstant())
 361          return is64BitConst_ ? 2 : 1;
 362       else
 363          return data_.temp.size();
 364    }
 365
 366    constexpr bool isFixed() const noexcept
 367    {
 368       return isFixed_;
 369    }
 370
 371    constexpr PhysReg physReg() const noexcept
 372    {
 373       return reg_;
 374    }
 375
 376    constexpr void setFixed(PhysReg reg) noexcept
 377    {
 378       isFixed_ = reg != unsigned(-1);
 379       reg_ = reg;
 380    }
 381
 382    constexpr bool isConstant() const noexcept
 383    {
 384       return isConstant_;
 385    }
 386
 387    constexpr bool isLiteral() const noexcept
 388    {
 389       return isConstant() && reg_ == 255;
 390    }
 391
 392    constexpr bool isUndefined() const noexcept
 393    {
 394       return isUndef_;
 395    }
 396
 397    constexpr uint32_t constantValue() const noexcept
 398    {
 399       return data_.i;
 400    }
 401
 402    constexpr bool constantEquals(uint32_t cmp) const noexcept
 403    {
 404       return isConstant() && constantValue() == cmp;
 405    }
 406
 407    constexpr void setKill(bool flag) noexcept
 408    {
 409       isKill_ = flag;
 410       if (!flag)
 411          setFirstKill(false);
 412    }
 413
 414    constexpr bool isKill() const noexcept
 415    {
 416       return isKill_ || isFirstKill();
 417    }
 418
 419    constexpr void setFirstKill(bool flag) noexcept
 420    {
 421       isFirstKill_ = flag;
 422       if (flag)
 423          setKill(flag);
 424    }
 425
 426    /* When there are multiple operands killing the same temporary,
 427     * isFirstKill() is only returns true for the first one. */
 428    constexpr bool isFirstKill() const noexcept
 429    {
 430       return isFirstKill_;
 431    }
 432
 433 private:
 434    union {
 435       uint32_t i;
 436       float f;
 437       Temp temp = Temp(0, s1);
 438    } data_;
 439    PhysReg reg_;
 440    union {
 441       struct {
 442          uint8_t isTemp_:1;
 443          uint8_t isFixed_:1;
 444          uint8_t isConstant_:1;
 445          uint8_t isKill_:1;
 446          uint8_t isUndef_:1;
 447          uint8_t isFirstKill_:1;
 448          uint8_t is64BitConst_:1;
 449       };
 450       /* can't initialize bit-fields in c++11, so work around using a union */
 451       uint8_t control_ = 0;
 452    };
 453 };
 454
 455 /**
 456  * Definition Class
 457  * Definitions are the results of Instructions
 458  * and refer to temporary virtual registers
 459  * which are later mapped to physical registers
 460  */
 461 class Definition final
 462 {
 463 public:
 464    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
 465    Definition(uint32_t index, RegClass type) noexcept
 466       : temp(index, type) {}
 467    explicit Definition(Temp tmp) noexcept
 468       : temp(tmp) {}
 469    Definition(PhysReg reg, RegClass type) noexcept
 470       : temp(Temp(0, type))
 471    {
 472       setFixed(reg);
 473    }
 474    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 475       : temp(Temp(tmpId, type))
 476    {
 477       setFixed(reg);
 478    }
 479
 480    constexpr bool isTemp() const noexcept
 481    {
 482       return tempId() > 0;
 483    }
 484
 485    constexpr Temp getTemp() const noexcept
 486    {
 487       return temp;
 488    }
 489
 490    constexpr uint32_t tempId() const noexcept
 491    {
 492       return temp.id();
 493    }
 494
 495    constexpr void setTemp(Temp t) noexcept {
 496       temp = t;
 497    }
 498
 499    constexpr RegClass regClass() const noexcept
 500    {
 501       return temp.regClass();
 502    }
 503
 504    constexpr unsigned size() const noexcept
 505    {
 506       return temp.size();
 507    }
 508
 509    constexpr bool isFixed() const noexcept
 510    {
 511       return isFixed_;
 512    }
 513
 514    constexpr PhysReg physReg() const noexcept
 515    {
 516       return reg_;
 517    }
 518
 519    constexpr void setFixed(PhysReg reg) noexcept
 520    {
 521       isFixed_ = 1;
 522       reg_ = reg;
 523    }
 524
 525    constexpr void setHint(PhysReg reg) noexcept
 526    {
 527       hasHint_ = 1;
 528       reg_ = reg;
 529    }
 530
 531    constexpr bool hasHint() const noexcept
 532    {
 533       return hasHint_;
 534    }
 535
 536    constexpr void setKill(bool flag) noexcept
 537    {
 538       isKill_ = flag;
 539    }
 540
 541    constexpr bool isKill() const noexcept
 542    {
 543       return isKill_;
 544    }
 545
 546 private:
 547    Temp temp = Temp(0, s1);
 548    PhysReg reg_;
 549    union {
 550       struct {
 551          uint8_t isFixed_:1;
 552          uint8_t hasHint_:1;
 553          uint8_t isKill_:1;
 554       };
 555       /* can't initialize bit-fields in c++11, so work around using a union */
 556       uint8_t control_ = 0;
 557    };
 558 };
 559
 560 class Block;
 561
 562 struct Instruction {
 563    aco_opcode opcode;
 564    Format format;
 565    uint32_t pass_flags;
 566
 567    aco::span<Operand> operands;
 568    aco::span<Definition> definitions;
 569
 570    constexpr bool isVALU() const noexcept
 571    {
 572       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 573           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 574           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 575           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 576           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 577           || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
 578    }
 579
 580    constexpr bool isSALU() const noexcept
 581    {
 582       return format == Format::SOP1 ||
 583              format == Format::SOP2 ||
 584              format == Format::SOPC ||
 585              format == Format::SOPK ||
 586              format == Format::SOPP;
 587    }
 588
 589    constexpr bool isVMEM() const noexcept
 590    {
 591       return format == Format::MTBUF ||
 592              format == Format::MUBUF ||
 593              format == Format::MIMG;
 594    }
 595
 596    constexpr bool isDPP() const noexcept
 597    {
 598       return (uint16_t) format & (uint16_t) Format::DPP;
 599    }
 600
 601    constexpr bool isVOP3() const noexcept
 602    {
 603       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 604              ((uint16_t) format & (uint16_t) Format::VOP3B) ||
 605              format == Format::VOP3P;
 606    }
 607
 608    constexpr bool isSDWA() const noexcept
 609    {
 610       return (uint16_t) format & (uint16_t) Format::SDWA;
 611    }
 612
 613    constexpr bool isFlatOrGlobal() const noexcept
 614    {
 615       return format == Format::FLAT || format == Format::GLOBAL;
 616    }
 617 };
 618
 619 struct SOPK_instruction : public Instruction {
 620    uint16_t imm;
 621 };
 622
 623 struct SOPP_instruction : public Instruction {
 624    uint32_t imm;
 625    int block;
 626 };
 627
 628 struct SOPC_instruction : public Instruction {
 629 };
 630
 631 struct SOP1_instruction : public Instruction {
 632 };
 633
 634 struct SOP2_instruction : public Instruction {
 635 };
 636
 637 /**
 638  * Scalar Memory Format:
 639  * For s_(buffer_)load_dword*:
 640  * Operand(0): SBASE - SGPR-pair which provides base address
 641  * Operand(1): Offset - immediate (un)signed offset or SGPR
 642  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 643  * Operand(n-1): SOffset - SGPR offset (Vega only)
 644  *
 645  * Having no operands is also valid for instructions such as s_dcache_inv.
 646  *
 647  */
 648 struct SMEM_instruction : public Instruction {
 649    bool glc; /* VI+: globally coherent */
 650    bool dlc; /* NAVI: device level coherent */
 651    bool nv; /* VEGA only: Non-volatile */
 652    bool can_reorder;
 653    bool disable_wqm;
 654    barrier_interaction barrier;
 655 };
 656
 657 struct VOP1_instruction : public Instruction {
 658 };
 659
 660 struct VOP2_instruction : public Instruction {
 661 };
 662
 663 struct VOPC_instruction : public Instruction {
 664 };
 665
 666 struct VOP3A_instruction : public Instruction {
 667    bool abs[3];
 668    bool opsel[4];
 669    bool clamp;
 670    unsigned omod;
 671    bool neg[3];
 672 };
 673
 674 /**
 675  * Data Parallel Primitives Format:
 676  * This format can be used for VOP1, VOP2 or VOPC instructions.
 677  * The swizzle applies to the src0 operand.
 678  *
 679  */
 680 struct DPP_instruction : public Instruction {
 681    uint16_t dpp_ctrl;
 682    uint8_t row_mask;
 683    uint8_t bank_mask;
 684    bool abs[2];
 685    bool neg[2];
 686    bool bound_ctrl;
 687 };
 688
 689 struct Interp_instruction : public Instruction {
 690    unsigned attribute;
 691    unsigned component;
 692 };
 693
 694 /**
 695  * Local and Global Data Sharing instructions
 696  * Operand(0): ADDR - VGPR which supplies the address.
 697  * Operand(1): DATA0 - First data VGPR.
 698  * Operand(2): DATA1 - Second data VGPR.
 699  * Operand(n-1): M0 - LDS size.
 700  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 701  *
 702  */
 703 struct DS_instruction : public Instruction {
 704    int16_t offset0;
 705    int8_t offset1;
 706    bool gds;
 707 };
 708
 709 /**
 710  * Vector Memory Untyped-buffer Instructions
 711  * Operand(0): VADDR - Address source. Can carry an index and/or offset
 712  * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 713  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 714  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 715  *
 716  */
 717 struct MUBUF_instruction : public Instruction {
 718    unsigned offset; /* Unsigned byte offset - 12 bit */
 719    bool offen; /* Supply an offset from VGPR (VADDR) */
 720    bool idxen; /* Supply an index from VGPR (VADDR) */
 721    bool glc; /* globally coherent */
 722    bool dlc; /* NAVI: device level coherent */
 723    bool slc; /* system level coherent */
 724    bool tfe; /* texture fail enable */
 725    bool lds; /* Return read-data to LDS instead of VGPRs */
 726    bool disable_wqm; /* Require an exec mask without helper invocations */
 727    bool can_reorder;
 728    barrier_interaction barrier;
 729 };
 730
 731 /**
 732  * Vector Memory Typed-buffer Instructions
 733  * Operand(0): VADDR - Address source. Can carry an index and/or offset
 734  * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 735  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 736  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 737  *
 738  */
 739 struct MTBUF_instruction : public Instruction {
 740    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
 741    uint8_t nfmt : 3; /* Numeric format of data in memory */
 742    unsigned offset; /* Unsigned byte offset - 12 bit */
 743    bool offen; /* Supply an offset from VGPR (VADDR) */
 744    bool idxen; /* Supply an index from VGPR (VADDR) */
 745    bool glc; /* globally coherent */
 746    bool dlc; /* NAVI: device level coherent */
 747    bool slc; /* system level coherent */
 748    bool tfe; /* texture fail enable */
 749    bool disable_wqm; /* Require an exec mask without helper invocations */
 750    bool can_reorder;
 751    barrier_interaction barrier;
 752 };
 753
 754 /**
 755  * Vector Memory Image Instructions
 756  * Operand(0): VADDR - Address source. Can carry an offset or an index.
 757  * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 758  * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 759  * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 760  *
 761  */
 762 struct MIMG_instruction : public Instruction {
 763    unsigned dmask; /* Data VGPR enable mask */
 764    unsigned dim; /* NAVI: dimensionality */
 765    bool unrm; /* Force address to be un-normalized */
 766    bool dlc; /* NAVI: device level coherent */
 767    bool glc; /* globally coherent */
 768    bool slc; /* system level coherent */
 769    bool tfe; /* texture fail enable */
 770    bool da; /* declare an array */
 771    bool lwe; /* Force data to be un-normalized */
 772    bool r128; /* NAVI: Texture resource size */
 773    bool a16; /* VEGA, NAVI: Address components are 16-bits */
 774    bool d16; /* Convert 32-bit data to 16-bit data */
 775    bool disable_wqm; /* Require an exec mask without helper invocations */
 776    bool can_reorder;
 777    barrier_interaction barrier;
 778 };
 779
 780 /**
 781  * Flat/Scratch/Global Instructions
 782  * Operand(0): ADDR
 783  * Operand(1): SADDR
 784  * Operand(2) / Definition(0): DATA/VDST
 785  *
 786  */
 787 struct FLAT_instruction : public Instruction {
 788    uint16_t offset; /* Vega only */
 789    bool slc; /* system level coherent */
 790    bool glc; /* globally coherent */
 791    bool dlc; /* NAVI: device level coherent */
 792    bool lds;
 793    bool nv;
 794 };
 795
 796 struct Export_instruction : public Instruction {
 797    unsigned enabled_mask;
 798    unsigned dest;
 799    bool compressed;
 800    bool done;
 801    bool valid_mask;
 802 };
 803
 804 struct Pseudo_instruction : public Instruction {
 805    bool tmp_in_scc;
 806    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
 807 };
 808
 809 struct Pseudo_branch_instruction : public Instruction {
 810    /* target[0] is the block index of the branch target.
 811     * For conditional branches, target[1] contains the fall-through alternative.
 812     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
 813     */
 814    uint32_t target[2];
 815 };
 816
 817 struct Pseudo_barrier_instruction : public Instruction {
 818 };
 819
 820 enum ReduceOp {
 821    iadd32, iadd64,
 822    imul32, imul64,
 823    fadd32, fadd64,
 824    fmul32, fmul64,
 825    imin32, imin64,
 826    imax32, imax64,
 827    umin32, umin64,
 828    umax32, umax64,
 829    fmin32, fmin64,
 830    fmax32, fmax64,
 831    iand32, iand64,
 832    ior32, ior64,
 833    ixor32, ixor64,
 834    gfx10_wave64_bpermute
 835 };
 836
 837 /**
 838  * Subgroup Reduction Instructions, everything except for the data to be
 839  * reduced and the result as inserted by setup_reduce_temp().
 840  * Operand(0): data to be reduced
 841  * Operand(1): reduce temporary
 842  * Operand(2): vector temporary
 843  * Definition(0): result
 844  * Definition(1): scalar temporary
 845  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 846  * Definition(3): scc clobber
 847  * Definition(4): vcc clobber
 848  *
 849  */
 850 struct Pseudo_reduction_instruction : public Instruction {
 851    ReduceOp reduce_op;
 852    unsigned cluster_size; // must be 0 for scans
 853 };
 854
 855 struct instr_deleter_functor {
 856    void operator()(void* p) {
 857       free(p);
 858    }
 859 };
 860
 861 template<typename T>
 862 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
 863
 864 template<typename T>
 865 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
 866 {
 867    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
 868    char *data = (char*) calloc(1, size);
 869    T* inst = (T*) data;
 870
 871    inst->opcode = opcode;
 872    inst->format = format;
 873
 874    inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
 875    inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);
 876
 877    return inst;
 878 }
 879
 880 constexpr bool is_phi(Instruction* instr)
 881 {
 882    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
 883 }
 884
 885 static inline bool is_phi(aco_ptr<Instruction>& instr)
 886 {
 887    return is_phi(instr.get());
 888 }
 889
 890 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
 891 {
 892    switch (instr->format) {
 893    case Format::SMEM:
 894       return static_cast<SMEM_instruction*>(instr)->barrier;
 895    case Format::MUBUF:
 896       return static_cast<MUBUF_instruction*>(instr)->barrier;
 897    case Format::MIMG:
 898       return static_cast<MIMG_instruction*>(instr)->barrier;
 899    case Format::FLAT:
 900    case Format::GLOBAL:
 901       return barrier_buffer;
 902    case Format::DS:
 903       return barrier_shared;
 904    default:
 905       return barrier_none;
 906    }
 907 }
 908
 909 enum block_kind {
 910    /* uniform indicates that leaving this block,
 911     * all actives lanes stay active */
 912    block_kind_uniform = 1 << 0,
 913    block_kind_top_level = 1 << 1,
 914    block_kind_loop_preheader = 1 << 2,
 915    block_kind_loop_header = 1 << 3,
 916    block_kind_loop_exit = 1 << 4,
 917    block_kind_continue = 1 << 5,
 918    block_kind_break = 1 << 6,
 919    block_kind_continue_or_break = 1 << 7,
 920    block_kind_discard = 1 << 8,
 921    block_kind_branch = 1 << 9,
 922    block_kind_merge = 1 << 10,
 923    block_kind_invert = 1 << 11,
 924    block_kind_uses_discard_if = 1 << 12,
 925    block_kind_needs_lowering = 1 << 13,
 926    block_kind_uses_demote = 1 << 14,
 927 };
 928
 929
 930 struct RegisterDemand {
 931    constexpr RegisterDemand() = default;
 932    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
 933       : vgpr{v}, sgpr{s} {}
 934    int16_t vgpr = 0;
 935    int16_t sgpr = 0;
 936
 937    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
 938       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
 939    }
 940
 941    constexpr bool exceeds(const RegisterDemand other) const noexcept {
 942       return vgpr > other.vgpr || sgpr > other.sgpr;
 943    }
 944
 945    constexpr RegisterDemand operator+(const Temp t) const noexcept {
 946       if (t.type() == RegType::sgpr)
 947          return RegisterDemand( vgpr, sgpr + t.size() );
 948       else
 949          return RegisterDemand( vgpr + t.size(), sgpr );
 950    }
 951
 952    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
 953       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
 954    }
 955
 956    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
 957       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
 958    }
 959
 960    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
 961       vgpr += other.vgpr;
 962       sgpr += other.sgpr;
 963       return *this;
 964    }
 965
 966    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
 967       vgpr -= other.vgpr;
 968       sgpr -= other.sgpr;
 969       return *this;
 970    }
 971
 972    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
 973       if (t.type() == RegType::sgpr)
 974          sgpr += t.size();
 975       else
 976          vgpr += t.size();
 977       return *this;
 978    }
 979
 980    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
 981       if (t.type() == RegType::sgpr)
 982          sgpr -= t.size();
 983       else
 984          vgpr -= t.size();
 985       return *this;
 986    }
 987
 988    constexpr void update(const RegisterDemand other) noexcept {
 989       vgpr = std::max(vgpr, other.vgpr);
 990       sgpr = std::max(sgpr, other.sgpr);
 991    }
 992
 993 };
 994
 995 /* CFG */
 996 struct Block {
        /* One block of the CFG: a list of instructions plus predecessor and
         * successor lists, each kept twice ("logical" and "linear").
         * NOTE(review): the preds/succs entries are presumably indices into
         * Program::blocks -- confirm at the use sites. */
 997    unsigned index; /* Program::create_and_insert_block()/insert_block() set this to the block's position in Program::blocks */
 998    unsigned offset = 0;
 999    std::vector<aco_ptr<Instruction>> instructions;
1000    std::vector<unsigned> logical_preds;
1001    std::vector<unsigned> linear_preds;
1002    std::vector<unsigned> logical_succs;
1003    std::vector<unsigned> linear_succs;
1004    RegisterDemand register_demand = RegisterDemand();
1005    uint16_t loop_nest_depth = 0;
1006    uint16_t kind = 0;
1007    int logical_idom = -1; /* starts at -1; presumably filled in by dominator_tree() -- confirm */
1008    int linear_idom = -1; /* starts at -1; presumably filled in by dominator_tree() -- confirm */
1009    Temp live_out_exec = Temp();
1010
1011    /* this information is needed for predecessors to blocks with phis when
1012     * moving out of ssa */
1013    bool scc_live_out = false;
1014    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1015
        /* Both constructors only choose `index`; every other member keeps its
         * default member initializer. The one-argument constructor is not
         * explicit, so an unsigned value converts implicitly to a Block. */
1016    Block(unsigned idx) : index(idx) {}
1017    Block() : index(0) {}
1018 };
1019
1020 using Stage = uint16_t; /* bit set: sw_* bits (0-5) name the software stage(s), one hw_* bit (6-12) names the hardware stage */
1021
1022 /* software stages */
1023 static constexpr Stage sw_vs = 1 << 0;
1024 static constexpr Stage sw_gs = 1 << 1;
1025 static constexpr Stage sw_tcs = 1 << 2;
1026 static constexpr Stage sw_tes = 1 << 3;
1027 static constexpr Stage sw_fs = 1 << 4;
1028 static constexpr Stage sw_cs = 1 << 5;
1029 static constexpr Stage sw_mask = 0x3f; /* covers bits 0-5, i.e. all sw_* bits */
1030
1031 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1032 static constexpr Stage hw_vs = 1 << 6;
1033 static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1034 static constexpr Stage hw_gs = 1 << 8;
1035 static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1036 static constexpr Stage hw_hs = 1 << 10;
1037 static constexpr Stage hw_fs = 1 << 11;
1038 static constexpr Stage hw_cs = 1 << 12;
1039 static constexpr Stage hw_mask = 0x7f << 6; /* covers bits 6-12, i.e. all hw_* bits */
1040
1041 /* possible settings of Program::stage */
1042 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1043 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1044 static constexpr Stage compute_cs = sw_cs | hw_cs;
1045 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1046 /* GFX10/NGG */
     /* Note: ngg_vertex_geometry_gs, ngg_tess_eval_geometry_gs and
      * ngg_vertex_tess_control_hs are built from the same bits as the GFX9
      * constants vertex_geometry_gs, tess_eval_geometry_gs and
      * vertex_tess_control_hs below, so they compare equal to them. */
1047 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1048 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1049 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1050 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1051 /* GFX9 (and GFX10 if NGG isn't used) */
1052 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1053 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1054 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1055 /* pre-GFX9 */
1056 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1057 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1058 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
     /* Runs on the ES hardware stage, exactly like vertex_es: both are the
      * stage placed in front of a separate geometry shader. */
1059 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1060 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1061
1062 class Program final {
        /* Top-level container for one program being compiled: the list of
         * blocks, target and limit fields, and a counter that hands out ids
         * (allocateId()). Several members below (config, info, chip_class,
         * family, wave_size, stage, the lds_* and *_limit fields, ...) have no
         * default initializer, so whoever creates a Program has to set them
         * before they are read. */
1063 public:
1064    std::vector<Block> blocks;
1065    RegisterDemand max_reg_demand = RegisterDemand();
1066    uint16_t num_waves = 0;
1067    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1068    ac_shader_config* config;
1069    struct radv_shader_info *info;
1070    enum chip_class chip_class;
1071    enum radeon_family family;
1072    unsigned wave_size;
1073    Stage stage; /* Stage */
1074    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1075    bool needs_wqm = false; /* there exists a p_wqm instruction */
1076    bool wb_smem_l1_on_end = false;
1077
1078    std::vector<uint8_t> constant_data;
1079    Temp private_segment_buffer;
1080    Temp scratch_offset;
1081
1082    uint16_t lds_alloc_granule;
1083    uint32_t lds_limit; /* in bytes */
1084    uint16_t vgpr_limit;
1085    uint16_t sgpr_limit;
1086    uint16_t physical_sgprs;
1087    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1088
1089    bool needs_vcc = false;
1090    bool needs_xnack_mask = false;
1091    bool needs_flat_scr = false;
1092
1093    uint32_t allocateId()
1094    {
           /* Returns the current id and then advances the counter. The assert
            * limits ids to 16777215 (2^24 - 1); presumably the field that stores
            * an id is 24 bits wide -- TODO confirm against the Temp definition.
            * When NDEBUG is defined the assert compiles away and nothing
            * enforces the limit. */
1095       assert(allocationID <= 16777215);
1096       return allocationID++;
1097    }
1098
1099    uint32_t peekAllocationId()
1100    {
           /* Returns the id that the next allocateId() call would hand out,
            * without consuming it. */
1101       return allocationID;
1102    }
1103
1104    void setAllocationId(uint32_t id)
1105    {
           /* Overwrites the counter; `id` is not validated here. */
1106       allocationID = id;
1107    }
1108
1109    Block* create_and_insert_block() {
           /* Appends a new Block whose index equals its position in `blocks`.
            * The returned pointer refers to an element of the vector, so it can
            * be invalidated when `blocks` grows again. */
1110       blocks.emplace_back(blocks.size());
1111       return &blocks.back();
1112    }
1113
1114    Block* insert_block(Block&& block) {
           /* Overwrites block.index with the position the block is about to
            * occupy, moves the block to the end of `blocks` and returns a
            * pointer to the stored copy. The same invalidation caveat as for
            * create_and_insert_block() applies. */
1115       block.index = blocks.size();
1116       blocks.emplace_back(std::move(block));
1117       return &blocks.back();
1118    }
1119
1120 private:
1121    uint32_t allocationID = 1; /* the first id handed out is 1; NOTE(review): 0 is presumably reserved -- confirm */
1122 };
1123
1124 struct live {
        /* Result type of live_var_analysis(); also passed by reference to
         * several of the passes declared in this header.
         * NOTE(review): the outer vectors are presumably indexed by block
         * index -- confirm. */
1125    /* live temps out per block */
1126    std::vector<std::set<Temp>> live_out;
1127    /* register demand (sgpr/vgpr) per instruction per block */
1128    std::vector<std::vector<RegisterDemand>> register_demand;
1129 };
1130
1131 void select_program(Program *program,
1132                     unsigned shader_count, /* presumably the number of entries in `shaders` -- confirm */
1133                     struct nir_shader *const *shaders,
1134                     ac_shader_config* config,
1135                     struct radv_shader_info *info,
1136                     struct radv_nir_compiler_options *options);
     /* select_program() is only declared here; its definition is not part of
      * this header. NOTE(review): judging by the signature it presumably fills
      * `program` from the given NIR shaders -- confirm in its source file. */
1137
1138 void lower_wqm(Program* program, live& live_vars,
1139                const struct radv_nir_compiler_options *options);
     /* lower_wqm() above and the functions below, down to validate_ra(), are
      * prototypes only; their definitions are not in this header, so what each
      * one does has to be read from its source file.
      * NOTE(review): the order of the declarations is not necessarily the
      * order in which the passes run -- confirm with the calling code. */
1140 void lower_bool_phis(Program* program);
1141 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1142 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1143 std::vector<uint16_t> dead_code_analysis(Program *program);
1144 void dominator_tree(Program* program);
1145 void insert_exec_mask(Program *program);
1146 void value_numbering(Program* program);
1147 void optimize(Program* program);
1148 void setup_reduce_temp(Program* program);
1149 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
     /* live_out_per_block is taken by value: the argument is copied unless the
      * caller moves it in. */
1150 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1151 void ssa_elimination(Program* program);
1152 void lower_to_hw_instr(Program* program);
1153 void schedule_program(Program* program, live& live_vars);
1154 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1155 void insert_wait_states(Program* program);
1156 void insert_NOPs(Program* program);
1157 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1158 void print_asm(Program *program, std::vector<uint32_t>& binary,
1159                unsigned exec_size, std::ostream& out);
1160 void validate(Program* program, FILE *output);
     /* NOTE(review): the meaning of validate_ra()'s bool result is not visible
      * from this header -- check the definition before relying on it. */
1161 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1162 #ifndef NDEBUG
     /* Debug builds: a real function, defined outside this header.
      * NOTE(review): presumably reports `msg` (and `instr`, if given) when
      * `cond` is true and DEBUG_PERFWARN is enabled -- confirm in the
      * definition. */
1163 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1164 #else
     /* Release builds: perfwarn(...) expands to nothing, so its arguments are
      * never evaluated. The macro is fully variadic so that it accepts exactly
      * the calls the debug-build function accepts, including the two-argument
      * form perfwarn(cond, msg) that relies on the default `instr`. */
1165 #define perfwarn(...)
1166 #endif
1167
1168 void aco_print_instr(Instruction *instr, FILE *output); /* declaration only; presumably writes a textual form of `instr` to `output` -- confirm */
1169 void aco_print_program(Program *program, FILE *output); /* declaration only; presumably writes a textual form of the whole program to `output` -- confirm */
1170
1171 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1172 uint16_t get_extra_sgprs(Program *program);
1173
1174 /* get the number of sgprs that have to be allocated in order to address
      * `addressable_sgprs` sgprs */
1175 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1176
1177 /* return number of addressable SGPRs for max_waves */
1178 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1179
1180 typedef struct {
        /* Static per-opcode tables. Every member is sized by
         * aco_opcode::num_opcodes; presumably each one is indexed by the
         * numeric value of an aco_opcode -- confirm at the use sites. */
1181    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)]; /* NOTE(review): presumably the GFX9 hardware opcode for each entry -- confirm */
1182    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; /* NOTE(review): presumably the GFX10 hardware opcode for each entry -- confirm */
1183    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1184    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1185    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1186    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1187 } Info;
1188
1189 extern const Info instr_info; /* declared here without an initializer; the definition lives in another translation unit */
1190
1191 }
1192
1193 #endif /* ACO_IR_H */
1194