aco: check if SALU instructions are preceded by exec when calculating WQM needs
[mesa.git] / src / amd / compiler / aco_ir.h
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include <vector>
#include <set>
#include <bitset>
#include <memory>

#include "nir.h"
#include "ac_binary.h"
#include "amd_family.h"
#include "aco_opcodes.h"
#include "aco_util.h"

struct radv_nir_compiler_options;
struct radv_shader_info;

namespace aco {

extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
};

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};

enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4,
};

constexpr Format asVOP3(Format format) {
   return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
};
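
/* Editor's example (illustrative only): a VOP2 instruction promoted to the
 * VOP3A encoding carries both format bits, so encoding checks reduce to
 * simple bit tests:
 *
 *   Format f = asVOP3(Format::VOP2);
 *   bool is_vop2 = (uint16_t) f & (uint16_t) Format::VOP2;  // true
 *   bool is_vop3 = (uint16_t) f & (uint16_t) Format::VOP3A; // true
 */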

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
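
/* Editor's usage sketch (illustrative only): RegClass packs the register file
 * and size into one byte, so the helpers above compare equal to explicitly
 * constructed classes:
 *
 *   RegClass(RegType::vgpr, 2) == v2;  // true
 *   v2.size() == 2;                    // two 32-bit registers
 *   v2.type() == RegType::vgpr;        // true
 *   v2.as_linear().is_linear();        // linear VGPRs are used for WWM/spills
 *   s2.is_linear();                    // SGPR classes are always linear
 */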

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24;
   RegClass reg_class;
};

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value.
 * Temporary registers get mapped to physical registers during RA;
 * constant values are inlined into the instruction sequence.
 */
class Operand final
{
public:
   constexpr Operand()
      : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
        isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(uint32_t v) noexcept
   {
      data_.i = v;
      isConstant_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3e22f983) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else /* Literal Constant */
         setFixed(PhysReg{255});
   };
   explicit Operand(uint64_t v) noexcept
   {
      isConstant_ = true;
      is64BitConst_ = true;
      if (v <= 64)
         setFixed(PhysReg{128 + (uint32_t) v});
      else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
         setFixed(PhysReg{192 - (uint32_t) v});
      else if (v == 0x3FE0000000000000) /* 0.5 */
         setFixed(PhysReg{240});
      else if (v == 0xBFE0000000000000) /* -0.5 */
         setFixed(PhysReg{241});
      else if (v == 0x3FF0000000000000) /* 1.0 */
         setFixed(PhysReg{242});
      else if (v == 0xBFF0000000000000) /* -1.0 */
         setFixed(PhysReg{243});
      else if (v == 0x4000000000000000) /* 2.0 */
         setFixed(PhysReg{244});
      else if (v == 0xC000000000000000) /* -2.0 */
         setFixed(PhysReg{245});
      else if (v == 0x4010000000000000) /* 4.0 */
         setFixed(PhysReg{246});
      else if (v == 0xC010000000000000) /* -4.0 */
         setFixed(PhysReg{247});
      else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
         setFixed(PhysReg{248});
      else { /* Literal Constant: we don't know if it is a long or double. */
         isConstant_ = 0;
         assert(false && "attempt to create a 64-bit literal constant");
      }
   };
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return isTemp_;
   }

   constexpr void setTemp(Temp t) noexcept {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept
   {
      return data_.temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return data_.temp.id();
   }

   constexpr bool hasRegClass() const noexcept
   {
      return isTemp() || isUndefined();
   }

   constexpr RegClass regClass() const noexcept
   {
      return data_.temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return is64BitConst_ ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept
   {
      return isConstant_;
   }

   constexpr bool isLiteral() const noexcept
   {
      return isConstant() && reg_ == 255;
   }

   constexpr bool isUndefined() const noexcept
   {
      return isUndef_;
   }

   constexpr uint32_t constantValue() const noexcept
   {
      return data_.i;
   }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_ || isFirstKill();
   }

   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept
   {
      return isFirstKill_;
   }

private:
   union {
      uint32_t i;
      float f;
      Temp temp = Temp(0, s1);
   } data_;
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_:1;
         uint8_t isFixed_:1;
         uint8_t isConstant_:1;
         uint8_t isKill_:1;
         uint8_t isUndef_:1;
         uint8_t isFirstKill_:1;
         uint8_t is64BitConst_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
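
/* Editor's example (illustrative only): how constant Operands are fixed to the
 * hardware's inline-constant "registers" (128..254) or flagged as a literal (255):
 *
 *   Operand a(Temp(7, v1));           // temporary operand, a.isTemp() == true
 *   Operand b(uint32_t(1));           // inline constant, b.physReg() == 129
 *   Operand c(uint32_t(0x3f800000));  // 1.0f -> inline constant 242
 *   Operand d(uint32_t(0x12345678));  // no inline encoding -> d.isLiteral()
 *   Operand e(v1);                    // undefined operand of class v1
 */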

/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};

class Block;

struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
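
/* Editor's example (illustrative only): the predicates above understand
 * combined formats, so given some Instruction *instr whose format was built
 * with asVOP3():
 *
 *   instr->format = asVOP3(Format::VOP2);
 *   instr->isVALU();  // true
 *   instr->isVOP3();  // true
 *   instr->isSALU();  // false
 */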

struct SOPK_instruction : public Instruction {
   uint16_t imm;
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};

struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc; /* VI+: globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool nv; /* VEGA only: Non-volatile */
   bool can_reorder;
   bool disable_wqm;
   barrier_interaction barrier;
};
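
/* Editor's construction sketch (illustrative only; uses the
 * create_instruction<> helper defined further below, and assumes the opcode
 * name s_load_dwordx2 plus hypothetical s2 temporaries `base` and `dst`):
 *
 *   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(
 *         aco_opcode::s_load_dwordx2, Format::SMEM, 2, 1)};
 *   load->operands[0] = Operand(base);          // SBASE
 *   load->operands[1] = Operand(uint32_t(16));  // immediate byte offset
 *   load->definitions[0] = Definition(dst);     // SDATA
 *   load->glc = false;
 */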

struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};

struct VOP3A_instruction : public Instruction {
   bool abs[3];
   bool opsel[4];
   bool clamp;
   unsigned omod;
   bool neg[3];
};

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   uint16_t dpp_ctrl;
   uint8_t row_mask;
   uint8_t bank_mask;
   bool abs[2];
   bool neg[2];
   bool bound_ctrl;
};

struct Interp_instruction : public Instruction {
   unsigned attribute;
   unsigned component;
};

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0;
   int8_t offset1;
   bool gds;
};

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool lds; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder;
   barrier_interaction barrier;
};

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder;
   barrier_interaction barrier;
};

/**
 * Vector Memory Image Instructions
 * Operand(0): VADDR - Address source. Can carry an offset or an index.
 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 *
 */
struct MIMG_instruction : public Instruction {
   unsigned dmask; /* Data VGPR enable mask */
   unsigned dim; /* NAVI: dimensionality */
   bool unrm; /* Force address to be un-normalized */
   bool dlc; /* NAVI: device level coherent */
   bool glc; /* globally coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool da; /* declare an array */
   bool lwe; /* LOD warning enable */
   bool r128; /* NAVI: Texture resource size */
   bool a16; /* VEGA, NAVI: Address components are 16-bits */
   bool d16; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder;
   barrier_interaction barrier;
};

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega only */
   bool slc; /* system level coherent */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool lds;
   bool nv;
};

struct Export_instruction : public Instruction {
   unsigned enabled_mask;
   unsigned dest;
   bool compressed;
   bool done;
   bool valid_mask;
};

struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};

struct Pseudo_barrier_instruction : public Instruction {
};

enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute
};

/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};

struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
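
/* Editor's usage sketch (illustrative only): create_instruction() allocates the
 * instruction and its operand/definition spans in one calloc'd block, so the
 * pointer is normally wrapped in an aco_ptr which releases it with free().
 * `dst_temp` (an s1 temporary) and `block` are hypothetical:
 *
 *   aco_ptr<SOP1_instruction> mov{create_instruction<SOP1_instruction>(
 *         aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
 *   mov->operands[0] = Operand(uint32_t(42));
 *   mov->definitions[0] = Definition(dst_temp);
 *   block->instructions.emplace_back(std::move(mov));
 */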

constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}

constexpr bool is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
      return barrier_buffer;
   case Format::DS:
      return barrier_shared;
   default:
      return barrier_none;
   }
}

enum block_kind {
   /* uniform indicates that when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};


struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
      : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept {
      if (t.type() == RegType::sgpr)
         return RegisterDemand( vgpr, sgpr + t.size() );
      else
         return RegisterDemand( vgpr + t.size(), sgpr );
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }

};
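
/* Editor's example (illustrative only): RegisterDemand tracks SGPR and VGPR
 * pressure separately; update() takes the component-wise maximum:
 *
 *   RegisterDemand demand(4, 10);           // 4 VGPRs, 10 SGPRs
 *   demand += Temp(17, v2);                 // some VGPR pair -> 6 VGPRs, 10 SGPRs
 *   demand.update(RegisterDemand(2, 16));   // -> 6 VGPRs, 16 SGPRs
 *   demand.exceeds(RegisterDemand(8, 16));  // false
 */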

/* CFG */
struct Block {
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};

using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not a separate stage on GFX9: merged into GS on GFX9 (and GFX10/legacy) */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not a separate stage on GFX9: merged into HS on GFX9 (and GFX10/legacy) */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;

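/* Editor's example (illustrative only): Program::stage combines software and
 * hardware stage bits, so both can be tested independently:
 *
 *   Stage stage = vertex_geometry_gs;    // merged VS+GS (GFX9+)
 *   bool has_gs_code = stage & sw_gs;    // true
 *   bool runs_on_hw_gs = stage & hw_gs;  // true
 *   bool runs_on_hw_vs = stage & hw_vs;  // false
 */
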
class Program final {
public:
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */

   bool needs_vcc = false;
   bool needs_xnack_mask = false;
   bool needs_flat_scr = false;

   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      return &blocks.back();
   }

   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
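
/* Editor's usage sketch (illustrative only): blocks are appended through the
 * Program so that Block::index always matches the position in Program::blocks,
 * and new SSA ids come from allocateId():
 *
 *   Block *entry = program->create_and_insert_block();
 *   entry->kind |= block_kind_top_level | block_kind_uniform;
 *   Temp tmp = Temp(program->allocateId(), s1);  // fresh s1 temporary
 */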

struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_info *info,
                    struct radv_nir_compiler_options *options);

void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
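
/* Editor's note (illustrative sketch, not normative): the authoritative pass
 * ordering lives in the compiler driver, not in this header. Roughly, the
 * passes above are run as: instruction selection, CFG/analysis passes,
 * optimizations, spilling, scheduling, register allocation, lowering, and
 * finally encoding, e.g.:
 *
 *   select_program(program, shader_count, shaders, config, info, options);
 *   dominator_tree(program);
 *   lower_bool_phis(program);
 *   insert_exec_mask(program);
 *   value_numbering(program);
 *   optimize(program);
 *   setup_reduce_temp(program);
 *   live live_vars = live_var_analysis(program, options);
 *   spill(program, live_vars, options);
 *   schedule_program(program, live_vars);
 *   register_allocation(program, live_vars.live_out);
 *   ssa_elimination(program);
 *   lower_to_hw_instr(program);
 *   insert_wait_states(program);
 *   insert_NOPs(program);
 *   std::vector<uint32_t> code;
 *   unsigned exec_size = emit_program(program, code);
 */
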
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of allocated sgprs required to address a given number of sgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);

/* return number of addressable SGPRs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);

typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

}

#endif /* ACO_IR_H */