aco: Initial GFX7 Support
src/amd/compiler/aco_ir.h (mesa.git)
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
47 enum {
48 DEBUG_VALIDATE = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
50 DEBUG_PERFWARN = 0x4,
51 };
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
105 enum barrier_interaction {
106 barrier_none = 0,
107 barrier_buffer = 0x1,
108 barrier_image = 0x2,
109 barrier_atomic = 0x4,
110 barrier_shared = 0x8,
111 barrier_count = 4,
112 };
113
114 enum fp_round {
115 fp_round_ne = 0,
116 fp_round_pi = 1,
117 fp_round_ni = 2,
118 fp_round_tz = 3,
119 };
120
121 enum fp_denorm {
122 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
123 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
124 fp_denorm_flush = 0x0,
125 fp_denorm_keep = 0x3,
126 };
127
128 struct float_mode {
129 /* matches encoding of the MODE register */
130 union {
131 struct {
132 fp_round round32:2;
133 fp_round round16_64:2;
134 unsigned denorm32:2;
135 unsigned denorm16_64:2;
136 };
137 uint8_t val = 0;
138 };
139 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
140 bool preserve_signed_zero_inf_nan32:1;
141 bool preserve_signed_zero_inf_nan16_64:1;
142 /* if false, optimizations which may remove denormal flushing can be done */
143 bool must_flush_denorms32:1;
144 bool must_flush_denorms16_64:1;
145 bool care_about_round32:1;
146 bool care_about_round16_64:1;
147
148 /* Returns true if instructions using the mode "other" can safely use the
149 * current one instead. */
150 bool canReplace(float_mode other) const noexcept {
151 return val == other.val &&
152 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
153 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
154 (must_flush_denorms32 || !other.must_flush_denorms32) &&
155 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
156 (care_about_round32 || !other.care_about_round32) &&
157 (care_about_round16_64 || !other.care_about_round16_64);
158 }
159 };
160
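/* A minimal usage sketch for canReplace() (the helper name below is illustrative):
 * a mode that preserves more properties can stand in for one that preserves fewer,
 * as long as the MODE register bits are identical, but not the other way around. */
inline bool float_mode_can_replace_example()
{
   float_mode strict = float_mode();
   strict.preserve_signed_zero_inf_nan32 = true;
   float_mode relaxed = float_mode();
   return strict.canReplace(relaxed) && !relaxed.canReplace(strict);
}
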
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
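/* Illustrative checks of the combined-format encoding described above: promoting a
 * VOP2/VOP1 instruction to VOP3 keeps the original format bit set, so tests against
 * either format still match. */
static_assert((uint32_t) asVOP3(Format::VOP2) ==
              ((uint32_t) Format::VOP2 | (uint32_t) Format::VOP3), "");
static_assert(((uint32_t) asVOP3(Format::VOP1) & (uint32_t) Format::VOP1) != 0, "");
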
165 enum class RegType {
166 none = 0,
167 sgpr,
168 vgpr,
169 linear_vgpr,
170 };
171
172 struct RegClass {
173
174 enum RC : uint8_t {
175 s1 = 1,
176 s2 = 2,
177 s3 = 3,
178 s4 = 4,
179 s6 = 6,
180 s8 = 8,
181 s16 = 16,
182 v1 = s1 | (1 << 5),
183 v2 = s2 | (1 << 5),
184 v3 = s3 | (1 << 5),
185 v4 = s4 | (1 << 5),
186 v5 = 5 | (1 << 5),
187 v6 = 6 | (1 << 5),
188 v7 = 7 | (1 << 5),
189 v8 = 8 | (1 << 5),
190 /* these are used for WWM and spills to vgpr */
191 v1_linear = v1 | (1 << 6),
192 v2_linear = v2 | (1 << 6),
193 };
194
195 RegClass() = default;
196 constexpr RegClass(RC rc)
197 : rc(rc) {}
198 constexpr RegClass(RegType type, unsigned size)
199 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
200
201 constexpr operator RC() const { return rc; }
202 explicit operator bool() = delete;
203
204 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
205 constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
206 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
207 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
208
209 private:
210 RC rc;
211 };
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
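/* Illustrative checks of the RegClass encoding: the low five bits hold the size in
 * 32-bit registers, bit 5 marks VGPRs and bit 6 marks linear (WWM/spill) VGPRs. */
static_assert(RegClass(RegType::vgpr, 2) == v2, "");
static_assert(v2.size() == 2 && v2.type() == RegType::vgpr, "");
static_assert(s2.is_linear() && !v2.is_linear() && v2.as_linear() == RegClass::v2_linear, "");
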
229 /**
230 * Temp Class
231 * Each temporary virtual register has a
232 * register class (i.e. size and type)
233 * and an SSA id.
234 */
235 struct Temp {
236 Temp() = default;
237 constexpr Temp(uint32_t id, RegClass cls) noexcept
238 : id_(id), reg_class(cls) {}
239
240 constexpr uint32_t id() const noexcept { return id_; }
241 constexpr RegClass regClass() const noexcept { return reg_class; }
242
243 constexpr unsigned size() const noexcept { return reg_class.size(); }
244 constexpr RegType type() const noexcept { return reg_class.type(); }
245 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
246
247 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
248 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
249 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
250
251 private:
252 uint32_t id_:24;
253 RegClass reg_class;
254 };
255
256 /**
257 * PhysReg
258 * Represents the physical register for each
259 * Operand and Definition.
260 */
261 struct PhysReg {
262 constexpr PhysReg() = default;
263 explicit constexpr PhysReg(unsigned r) : reg(r) {}
264 constexpr operator unsigned() const { return reg; }
265
266 uint16_t reg = 0;
267 };
268
269 /* helper expressions for special registers */
270 static constexpr PhysReg m0{124};
271 static constexpr PhysReg vcc{106};
272 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
273 static constexpr PhysReg exec{126};
274 static constexpr PhysReg exec_lo{126};
275 static constexpr PhysReg exec_hi{127};
276 static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value.
283 * Temporary registers get mapped to physical registers during RA.
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 if (v <= 64)
308 setFixed(PhysReg{128 + v});
309 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
310 setFixed(PhysReg{192 - v});
311 else if (v == 0x3f000000) /* 0.5 */
312 setFixed(PhysReg{240});
313 else if (v == 0xbf000000) /* -0.5 */
314 setFixed(PhysReg{241});
315 else if (v == 0x3f800000) /* 1.0 */
316 setFixed(PhysReg{242});
317 else if (v == 0xbf800000) /* -1.0 */
318 setFixed(PhysReg{243});
319 else if (v == 0x40000000) /* 2.0 */
320 setFixed(PhysReg{244});
321 else if (v == 0xc0000000) /* -2.0 */
322 setFixed(PhysReg{245});
323 else if (v == 0x40800000) /* 4.0 */
324 setFixed(PhysReg{246});
325 else if (v == 0xc0800000) /* -4.0 */
326 setFixed(PhysReg{247});
327 else if (v == 0x3e22f983) /* 1/(2*PI) */
328 setFixed(PhysReg{248});
329 else /* Literal Constant */
330 setFixed(PhysReg{255});
331 };
332 explicit Operand(uint64_t v) noexcept
333 {
334 isConstant_ = true;
335 is64BitConst_ = true;
336 if (v <= 64)
337 setFixed(PhysReg{128 + (uint32_t) v});
338 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
339 setFixed(PhysReg{192 - (uint32_t) v});
340 else if (v == 0x3FE0000000000000) /* 0.5 */
341 setFixed(PhysReg{240});
342 else if (v == 0xBFE0000000000000) /* -0.5 */
343 setFixed(PhysReg{241});
344 else if (v == 0x3FF0000000000000) /* 1.0 */
345 setFixed(PhysReg{242});
346 else if (v == 0xBFF0000000000000) /* -1.0 */
347 setFixed(PhysReg{243});
348 else if (v == 0x4000000000000000) /* 2.0 */
349 setFixed(PhysReg{244});
350 else if (v == 0xC000000000000000) /* -2.0 */
351 setFixed(PhysReg{245});
352 else if (v == 0x4010000000000000) /* 4.0 */
353 setFixed(PhysReg{246});
354 else if (v == 0xC010000000000000) /* -4.0 */
355 setFixed(PhysReg{247});
356 else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
357 setFixed(PhysReg{248});
358 else { /* Literal Constant: we don't know if it is a long or double.*/
359 isConstant_ = 0;
360 assert(false && "attempt to create a 64-bit literal constant");
361 }
362 };
363 explicit Operand(RegClass type) noexcept
364 {
365 isUndef_ = true;
366 data_.temp = Temp(0, type);
367 setFixed(PhysReg{128});
368 };
369 explicit Operand(PhysReg reg, RegClass type) noexcept
370 {
371 data_.temp = Temp(0, type);
372 setFixed(reg);
373 }
374
375 constexpr bool isTemp() const noexcept
376 {
377 return isTemp_;
378 }
379
380 constexpr void setTemp(Temp t) noexcept {
381 assert(!isConstant_);
382 isTemp_ = true;
383 data_.temp = t;
384 }
385
386 constexpr Temp getTemp() const noexcept
387 {
388 return data_.temp;
389 }
390
391 constexpr uint32_t tempId() const noexcept
392 {
393 return data_.temp.id();
394 }
395
396 constexpr bool hasRegClass() const noexcept
397 {
398 return isTemp() || isUndefined();
399 }
400
401 constexpr RegClass regClass() const noexcept
402 {
403 return data_.temp.regClass();
404 }
405
406 constexpr unsigned size() const noexcept
407 {
408 if (isConstant())
409 return is64BitConst_ ? 2 : 1;
410 else
411 return data_.temp.size();
412 }
413
414 constexpr bool isFixed() const noexcept
415 {
416 return isFixed_;
417 }
418
419 constexpr PhysReg physReg() const noexcept
420 {
421 return reg_;
422 }
423
424 constexpr void setFixed(PhysReg reg) noexcept
425 {
426 isFixed_ = reg != unsigned(-1);
427 reg_ = reg;
428 }
429
430 constexpr bool isConstant() const noexcept
431 {
432 return isConstant_;
433 }
434
435 constexpr bool isLiteral() const noexcept
436 {
437 return isConstant() && reg_ == 255;
438 }
439
440 constexpr bool isUndefined() const noexcept
441 {
442 return isUndef_;
443 }
444
445 constexpr uint32_t constantValue() const noexcept
446 {
447 return data_.i;
448 }
449
450 constexpr bool constantEquals(uint32_t cmp) const noexcept
451 {
452 return isConstant() && constantValue() == cmp;
453 }
454
455 constexpr void setKill(bool flag) noexcept
456 {
457 isKill_ = flag;
458 if (!flag)
459 setFirstKill(false);
460 }
461
462 constexpr bool isKill() const noexcept
463 {
464 return isKill_ || isFirstKill();
465 }
466
467 constexpr void setFirstKill(bool flag) noexcept
468 {
469 isFirstKill_ = flag;
470 if (flag)
471 setKill(flag);
472 }
473
474 /* When there are multiple operands killing the same temporary,
475 * isFirstKill() only returns true for the first one. */
476 constexpr bool isFirstKill() const noexcept
477 {
478 return isFirstKill_;
479 }
480
481 private:
482 union {
483 uint32_t i;
484 float f;
485 Temp temp = Temp(0, s1);
486 } data_;
487 PhysReg reg_;
488 union {
489 struct {
490 uint8_t isTemp_:1;
491 uint8_t isFixed_:1;
492 uint8_t isConstant_:1;
493 uint8_t isKill_:1;
494 uint8_t isUndef_:1;
495 uint8_t isFirstKill_:1;
496 uint8_t is64BitConst_:1;
497 };
498 /* can't initialize bit-fields in c++11, so work around using a union */
499 uint8_t control_ = 0;
500 };
501 };
502
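/* Usage sketch for constant Operands (the helper name is illustrative): values with a
 * hardware inline encoding are fixed to one of the constant registers, anything else
 * becomes a literal (register 255) that occupies an extra dword in the encoding. */
inline bool operand_constant_example()
{
   Operand small(uint32_t(4));        /* inline constant 4    -> fixed to reg 132 */
   Operand one(uint32_t(0x3f800000)); /* inline constant 1.0f -> fixed to reg 242 */
   Operand lit(uint32_t(0x12345678)); /* no inline encoding   -> literal          */
   return small.physReg() == 132 && one.physReg() == 242 && lit.isLiteral();
}
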
503 /**
504 * Definition Class
505 * Definitions are the results of Instructions
506 * and refer to temporary virtual registers
507 * which are later mapped to physical registers
508 */
509 class Definition final
510 {
511 public:
512 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
513 Definition(uint32_t index, RegClass type) noexcept
514 : temp(index, type) {}
515 explicit Definition(Temp tmp) noexcept
516 : temp(tmp) {}
517 Definition(PhysReg reg, RegClass type) noexcept
518 : temp(Temp(0, type))
519 {
520 setFixed(reg);
521 }
522 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
523 : temp(Temp(tmpId, type))
524 {
525 setFixed(reg);
526 }
527
528 constexpr bool isTemp() const noexcept
529 {
530 return tempId() > 0;
531 }
532
533 constexpr Temp getTemp() const noexcept
534 {
535 return temp;
536 }
537
538 constexpr uint32_t tempId() const noexcept
539 {
540 return temp.id();
541 }
542
543 constexpr void setTemp(Temp t) noexcept {
544 temp = t;
545 }
546
547 constexpr RegClass regClass() const noexcept
548 {
549 return temp.regClass();
550 }
551
552 constexpr unsigned size() const noexcept
553 {
554 return temp.size();
555 }
556
557 constexpr bool isFixed() const noexcept
558 {
559 return isFixed_;
560 }
561
562 constexpr PhysReg physReg() const noexcept
563 {
564 return reg_;
565 }
566
567 constexpr void setFixed(PhysReg reg) noexcept
568 {
569 isFixed_ = 1;
570 reg_ = reg;
571 }
572
573 constexpr void setHint(PhysReg reg) noexcept
574 {
575 hasHint_ = 1;
576 reg_ = reg;
577 }
578
579 constexpr bool hasHint() const noexcept
580 {
581 return hasHint_;
582 }
583
584 constexpr void setKill(bool flag) noexcept
585 {
586 isKill_ = flag;
587 }
588
589 constexpr bool isKill() const noexcept
590 {
591 return isKill_;
592 }
593
594 private:
595 Temp temp = Temp(0, s1);
596 PhysReg reg_;
597 union {
598 struct {
599 uint8_t isFixed_:1;
600 uint8_t hasHint_:1;
601 uint8_t isKill_:1;
602 };
603 /* can't initialize bit-fields in c++11, so work around using a union */
604 uint8_t control_ = 0;
605 };
606 };
607
608 class Block;
609
610 struct Instruction {
611 aco_opcode opcode;
612 Format format;
613 uint32_t pass_flags;
614
615 aco::span<Operand> operands;
616 aco::span<Definition> definitions;
617
618 constexpr bool isVALU() const noexcept
619 {
620 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
621 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
622 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
623 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
624 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
625 || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
626 }
627
628 constexpr bool isSALU() const noexcept
629 {
630 return format == Format::SOP1 ||
631 format == Format::SOP2 ||
632 format == Format::SOPC ||
633 format == Format::SOPK ||
634 format == Format::SOPP;
635 }
636
637 constexpr bool isVMEM() const noexcept
638 {
639 return format == Format::MTBUF ||
640 format == Format::MUBUF ||
641 format == Format::MIMG;
642 }
643
644 constexpr bool isDPP() const noexcept
645 {
646 return (uint16_t) format & (uint16_t) Format::DPP;
647 }
648
649 constexpr bool isVOP3() const noexcept
650 {
651 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
652 ((uint16_t) format & (uint16_t) Format::VOP3B) ||
653 format == Format::VOP3P;
654 }
655
656 constexpr bool isSDWA() const noexcept
657 {
658 return (uint16_t) format & (uint16_t) Format::SDWA;
659 }
660
661 constexpr bool isFlatOrGlobal() const noexcept
662 {
663 return format == Format::FLAT || format == Format::GLOBAL;
664 }
665
666 constexpr bool usesModifiers() const noexcept;
667
668 constexpr bool reads_exec() const noexcept
669 {
670 for (const Operand& op : operands) {
671 if (op.isFixed() && op.physReg() == exec)
672 return true;
673 }
674 return false;
675 }
676 };
677
678 struct SOPK_instruction : public Instruction {
679 uint16_t imm;
680 };
681
682 struct SOPP_instruction : public Instruction {
683 uint32_t imm;
684 int block;
685 };
686
687 struct SOPC_instruction : public Instruction {
688 };
689
690 struct SOP1_instruction : public Instruction {
691 };
692
693 struct SOP2_instruction : public Instruction {
694 };
695
696 /**
697 * Scalar Memory Format:
698 * For s_(buffer_)load_dword*:
699 * Operand(0): SBASE - SGPR-pair which provides base address
700 * Operand(1): Offset - immediate (un)signed offset or SGPR
701 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
702 * Operand(n-1): SOffset - SGPR offset (Vega only)
703 *
704 * Having no operands is also valid for instructions such as s_dcache_inv.
705 *
706 */
707 struct SMEM_instruction : public Instruction {
708 bool glc; /* VI+: globally coherent */
709 bool dlc; /* NAVI: device level coherent */
710 bool nv; /* VEGA only: Non-volatile */
711 bool can_reorder;
712 bool disable_wqm;
713 barrier_interaction barrier;
714 };
715
716 struct VOP1_instruction : public Instruction {
717 };
718
719 struct VOP2_instruction : public Instruction {
720 };
721
722 struct VOPC_instruction : public Instruction {
723 };
724
725 struct VOP3A_instruction : public Instruction {
726 bool abs[3];
727 bool opsel[4];
728 bool clamp;
729 unsigned omod;
730 bool neg[3];
731 };
732
733 /**
734 * Data Parallel Primitives Format:
735 * This format can be used for VOP1, VOP2 or VOPC instructions.
736 * The swizzle applies to the src0 operand.
737 *
738 */
739 struct DPP_instruction : public Instruction {
740 uint16_t dpp_ctrl;
741 uint8_t row_mask;
742 uint8_t bank_mask;
743 bool abs[2];
744 bool neg[2];
745 bool bound_ctrl;
746 };
747
748 struct Interp_instruction : public Instruction {
749 unsigned attribute;
750 unsigned component;
751 };
752
753 /**
754 * Local and Global Data Sharing instructions
755 * Operand(0): ADDR - VGPR which supplies the address.
756 * Operand(1): DATA0 - First data VGPR.
757 * Operand(2): DATA1 - Second data VGPR.
758 * Operand(n-1): M0 - LDS size.
759 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
760 *
761 */
762 struct DS_instruction : public Instruction {
763 int16_t offset0;
764 int8_t offset1;
765 bool gds;
766 };
767
768 /**
769 * Vector Memory Untyped-buffer Instructions
770 * Operand(0): VADDR - Address source. Can carry an index and/or offset
771 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
772 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
773 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
774 *
775 */
776 struct MUBUF_instruction : public Instruction {
777 unsigned offset; /* Unsigned byte offset - 12 bit */
778 bool offen; /* Supply an offset from VGPR (VADDR) */
779 bool idxen; /* Supply an index from VGPR (VADDR) */
780 bool glc; /* globally coherent */
781 bool dlc; /* NAVI: device level coherent */
782 bool slc; /* system level coherent */
783 bool tfe; /* texture fail enable */
784 bool lds; /* Return read-data to LDS instead of VGPRs */
785 bool disable_wqm; /* Require an exec mask without helper invocations */
786 bool can_reorder;
787 barrier_interaction barrier;
788 };
789
790 /**
791 * Vector Memory Typed-buffer Instructions
792 * Operand(0): VADDR - Address source. Can carry an index and/or offset
793 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
794 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
795 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
796 *
797 */
798 struct MTBUF_instruction : public Instruction {
799 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
800 uint8_t nfmt : 3; /* Numeric format of data in memory */
801 unsigned offset; /* Unsigned byte offset - 12 bit */
802 bool offen; /* Supply an offset from VGPR (VADDR) */
803 bool idxen; /* Supply an index from VGPR (VADDR) */
804 bool glc; /* globally coherent */
805 bool dlc; /* NAVI: device level coherent */
806 bool slc; /* system level coherent */
807 bool tfe; /* texture fail enable */
808 bool disable_wqm; /* Require an exec mask without helper invocations */
809 bool can_reorder;
810 barrier_interaction barrier;
811 };
812
813 /**
814 * Vector Memory Image Instructions
815 * Operand(0): VADDR - Address source. Can carry an offset or an index.
816 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
817 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
818 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
819 *
820 */
821 struct MIMG_instruction : public Instruction {
822 unsigned dmask; /* Data VGPR enable mask */
823 unsigned dim; /* NAVI: dimensionality */
824 bool unrm; /* Force address to be un-normalized */
825 bool dlc; /* NAVI: device level coherent */
826 bool glc; /* globally coherent */
827 bool slc; /* system level coherent */
828 bool tfe; /* texture fail enable */
829 bool da; /* declare an array */
830 bool lwe; /* LOD warning enable */
831 bool r128; /* NAVI: Texture resource size */
832 bool a16; /* VEGA, NAVI: Address components are 16-bits */
833 bool d16; /* Convert 32-bit data to 16-bit data */
834 bool disable_wqm; /* Require an exec mask without helper invocations */
835 bool can_reorder;
836 barrier_interaction barrier;
837 };
838
839 /**
840 * Flat/Scratch/Global Instructions
841 * Operand(0): ADDR
842 * Operand(1): SADDR
843 * Operand(2) / Definition(0): DATA/VDST
844 *
845 */
846 struct FLAT_instruction : public Instruction {
847 uint16_t offset; /* Vega/Navi only */
848 bool slc; /* system level coherent */
849 bool glc; /* globally coherent */
850 bool dlc; /* NAVI: device level coherent */
851 bool lds;
852 bool nv;
853 bool disable_wqm; /* Require an exec mask without helper invocations */
854 bool can_reorder;
855 barrier_interaction barrier;
856 };
857
858 struct Export_instruction : public Instruction {
859 unsigned enabled_mask;
860 unsigned dest;
861 bool compressed;
862 bool done;
863 bool valid_mask;
864 };
865
866 struct Pseudo_instruction : public Instruction {
867 bool tmp_in_scc;
868 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
869 };
870
871 struct Pseudo_branch_instruction : public Instruction {
872 /* target[0] is the block index of the branch target.
873 * For conditional branches, target[1] contains the fall-through alternative.
874 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
875 */
876 uint32_t target[2];
877 };
878
879 struct Pseudo_barrier_instruction : public Instruction {
880 };
881
882 enum ReduceOp {
883 iadd32, iadd64,
884 imul32, imul64,
885 fadd32, fadd64,
886 fmul32, fmul64,
887 imin32, imin64,
888 imax32, imax64,
889 umin32, umin64,
890 umax32, umax64,
891 fmin32, fmin64,
892 fmax32, fmax64,
893 iand32, iand64,
894 ior32, ior64,
895 ixor32, ixor64,
896 gfx10_wave64_bpermute
897 };
898
899 /**
900 * Subgroup Reduction Instructions. Everything except the data to be
901 * reduced and the result is inserted by setup_reduce_temp().
902 * Operand(0): data to be reduced
903 * Operand(1): reduce temporary
904 * Operand(2): vector temporary
905 * Definition(0): result
906 * Definition(1): scalar temporary
907 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
908 * Definition(3): scc clobber
909 * Definition(4): vcc clobber
910 *
911 */
912 struct Pseudo_reduction_instruction : public Instruction {
913 ReduceOp reduce_op;
914 unsigned cluster_size; // must be 0 for scans
915 };
916
917 struct instr_deleter_functor {
918 void operator()(void* p) {
919 free(p);
920 }
921 };
922
923 template<typename T>
924 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
925
926 template<typename T>
927 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
928 {
929 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
930 char *data = (char*) calloc(1, size);
931 T* inst = (T*) data;
932
933 inst->opcode = opcode;
934 inst->format = format;
935
936 inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
937 inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);
938
939 return inst;
940 }
941
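/* Allocation sketch for create_instruction(): the operand and definition spans live in
 * the same calloc'd block directly behind the instruction, so the caller only fills
 * them in. The opcode, temps and SSA id below are placeholders for illustration. */
inline aco_ptr<Instruction> create_v_add_example(Temp src0, Temp src1, uint32_t dst_id)
{
   aco_ptr<Instruction> add{create_instruction<VOP2_instruction>(
      aco_opcode::v_add_f32, Format::VOP2, 2, 1)};
   add->operands[0] = Operand(src0);
   add->operands[1] = Operand(src1);
   add->definitions[0] = Definition(dst_id, v1); /* id would come from Program::allocateId() */
   return add;
}
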
942 constexpr bool Instruction::usesModifiers() const noexcept
943 {
944 if (isDPP() || isSDWA())
945 return true;
946 if (!isVOP3())
947 return false;
948 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
949 for (unsigned i = 0; i < operands.size(); i++) {
950 if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
951 return true;
952 }
953 return vop3->opsel[3] || vop3->clamp || vop3->omod;
954 }
955
956 constexpr bool is_phi(Instruction* instr)
957 {
958 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
959 }
960
961 static inline bool is_phi(aco_ptr<Instruction>& instr)
962 {
963 return is_phi(instr.get());
964 }
965
966 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
967 {
968 switch (instr->format) {
969 case Format::SMEM:
970 return static_cast<SMEM_instruction*>(instr)->barrier;
971 case Format::MUBUF:
972 return static_cast<MUBUF_instruction*>(instr)->barrier;
973 case Format::MIMG:
974 return static_cast<MIMG_instruction*>(instr)->barrier;
975 case Format::FLAT:
976 case Format::GLOBAL:
977 case Format::SCRATCH:
978 return static_cast<FLAT_instruction*>(instr)->barrier;
979 case Format::DS:
980 return barrier_shared;
981 default:
982 return barrier_none;
983 }
984 }
985
986 enum block_kind {
987 /* uniform indicates that leaving this block,
988 * all active lanes stay active */
989 block_kind_uniform = 1 << 0,
990 block_kind_top_level = 1 << 1,
991 block_kind_loop_preheader = 1 << 2,
992 block_kind_loop_header = 1 << 3,
993 block_kind_loop_exit = 1 << 4,
994 block_kind_continue = 1 << 5,
995 block_kind_break = 1 << 6,
996 block_kind_continue_or_break = 1 << 7,
997 block_kind_discard = 1 << 8,
998 block_kind_branch = 1 << 9,
999 block_kind_merge = 1 << 10,
1000 block_kind_invert = 1 << 11,
1001 block_kind_uses_discard_if = 1 << 12,
1002 block_kind_needs_lowering = 1 << 13,
1003 block_kind_uses_demote = 1 << 14,
1004 };
1005
1006
1007 struct RegisterDemand {
1008 constexpr RegisterDemand() = default;
1009 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1010 : vgpr{v}, sgpr{s} {}
1011 int16_t vgpr = 0;
1012 int16_t sgpr = 0;
1013
1014 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1015 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1016 }
1017
1018 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1019 return vgpr > other.vgpr || sgpr > other.sgpr;
1020 }
1021
1022 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1023 if (t.type() == RegType::sgpr)
1024 return RegisterDemand( vgpr, sgpr + t.size() );
1025 else
1026 return RegisterDemand( vgpr + t.size(), sgpr );
1027 }
1028
1029 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1030 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1031 }
1032
1033 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1034 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1035 }
1036
1037 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1038 vgpr += other.vgpr;
1039 sgpr += other.sgpr;
1040 return *this;
1041 }
1042
1043 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1044 vgpr -= other.vgpr;
1045 sgpr -= other.sgpr;
1046 return *this;
1047 }
1048
1049 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1050 if (t.type() == RegType::sgpr)
1051 sgpr += t.size();
1052 else
1053 vgpr += t.size();
1054 return *this;
1055 }
1056
1057 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1058 if (t.type() == RegType::sgpr)
1059 sgpr -= t.size();
1060 else
1061 vgpr -= t.size();
1062 return *this;
1063 }
1064
1065 constexpr void update(const RegisterDemand other) noexcept {
1066 vgpr = std::max(vgpr, other.vgpr);
1067 sgpr = std::max(sgpr, other.sgpr);
1068 }
1069
1070 };
1071
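/* Sketch of per-instruction demand bookkeeping (the helper name is illustrative):
 * definitions add their size to the matching bank, killed operands subtract it. */
inline bool register_demand_example()
{
   RegisterDemand demand(4, 2);   /* 4 VGPRs and 2 SGPRs currently live */
   demand += Temp(1, v2);         /* a new v2 definition                */
   demand -= Temp(2, s1);         /* last use of an s1 temporary        */
   return demand == RegisterDemand(6, 1) && !demand.exceeds(RegisterDemand(6, 1));
}
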
1072 /* CFG */
1073 struct Block {
1074 float_mode fp_mode;
1075 unsigned index;
1076 unsigned offset = 0;
1077 std::vector<aco_ptr<Instruction>> instructions;
1078 std::vector<unsigned> logical_preds;
1079 std::vector<unsigned> linear_preds;
1080 std::vector<unsigned> logical_succs;
1081 std::vector<unsigned> linear_succs;
1082 RegisterDemand register_demand = RegisterDemand();
1083 uint16_t loop_nest_depth = 0;
1084 uint16_t kind = 0;
1085 int logical_idom = -1;
1086 int linear_idom = -1;
1087 Temp live_out_exec = Temp();
1088
1089 /* this information is needed for predecessors to blocks with phis when
1090 * moving out of ssa */
1091 bool scc_live_out = false;
1092 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1093
1094 Block(unsigned idx) : index(idx) {}
1095 Block() : index(0) {}
1096 };
1097
1098 using Stage = uint16_t;
1099
1100 /* software stages */
1101 static constexpr Stage sw_vs = 1 << 0;
1102 static constexpr Stage sw_gs = 1 << 1;
1103 static constexpr Stage sw_tcs = 1 << 2;
1104 static constexpr Stage sw_tes = 1 << 3;
1105 static constexpr Stage sw_fs = 1 << 4;
1106 static constexpr Stage sw_cs = 1 << 5;
1107 static constexpr Stage sw_mask = 0x3f;
1108
1109 /* hardware stages (at most one can be set; hw_mask is just for convenience when testing several at once) */
1110 static constexpr Stage hw_vs = 1 << 6;
1111 static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
1112 static constexpr Stage hw_gs = 1 << 8;
1113 static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
1114 static constexpr Stage hw_hs = 1 << 10;
1115 static constexpr Stage hw_fs = 1 << 11;
1116 static constexpr Stage hw_cs = 1 << 12;
1117 static constexpr Stage hw_mask = 0x7f << 6;
1118
1119 /* possible settings of Program::stage */
1120 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1121 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1122 static constexpr Stage compute_cs = sw_cs | hw_cs;
1123 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1124 /* GFX10/NGG */
1125 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
1126 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1127 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1128 static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1129 /* GFX9 (and GFX10 if NGG isn't used) */
1130 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1131 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1132 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1133 /* pre-GFX9 */
1134 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1135 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1136 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1137 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1138 static constexpr Stage geometry_gs = sw_gs | hw_gs;
1139
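/* Illustrative checks: a merged stage carries exactly one hardware stage bit plus the
 * bits of every software stage it contains, so both kinds of queries work on Program::stage. */
static_assert((vertex_geometry_gs & sw_vs) != 0 && (vertex_geometry_gs & sw_gs) != 0, "");
static_assert((vertex_geometry_gs & hw_mask) == hw_gs, "");
static_assert((vertex_ls & hw_mask) == hw_ls && (vertex_ls & sw_mask) == sw_vs, "");
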
1140 class Program final {
1141 public:
1142 float_mode next_fp_mode;
1143 std::vector<Block> blocks;
1144 RegisterDemand max_reg_demand = RegisterDemand();
1145 uint16_t num_waves = 0;
1146 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1147 ac_shader_config* config;
1148 struct radv_shader_info *info;
1149 enum chip_class chip_class;
1150 enum radeon_family family;
1151 unsigned wave_size;
1152 RegClass lane_mask;
1153 Stage stage; /* Stage */
1154 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1155 bool needs_wqm = false; /* there exists a p_wqm instruction */
1156 bool wb_smem_l1_on_end = false;
1157
1158 std::vector<uint8_t> constant_data;
1159 Temp private_segment_buffer;
1160 Temp scratch_offset;
1161
1162 uint16_t lds_alloc_granule;
1163 uint32_t lds_limit; /* in bytes */
1164 uint16_t vgpr_limit;
1165 uint16_t sgpr_limit;
1166 uint16_t physical_sgprs;
1167 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1168
1169 bool needs_vcc = false;
1170 bool needs_xnack_mask = false;
1171 bool needs_flat_scr = false;
1172
1173 uint32_t allocateId()
1174 {
1175 assert(allocationID <= 16777215);
1176 return allocationID++;
1177 }
1178
1179 uint32_t peekAllocationId()
1180 {
1181 return allocationID;
1182 }
1183
1184 void setAllocationId(uint32_t id)
1185 {
1186 allocationID = id;
1187 }
1188
1189 Block* create_and_insert_block() {
1190 blocks.emplace_back(blocks.size());
1191 blocks.back().fp_mode = next_fp_mode;
1192 return &blocks.back();
1193 }
1194
1195 Block* insert_block(Block&& block) {
1196 block.index = blocks.size();
1197 block.fp_mode = next_fp_mode;
1198 blocks.emplace_back(std::move(block));
1199 return &blocks.back();
1200 }
1201
1202 private:
1203 uint32_t allocationID = 1;
1204 };
1205
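/* Sketch of how CFG edges are recorded (the helper name is illustrative): predecessors
 * and successors are stored as block indices on both endpoints, separately for the
 * logical and the linear CFG. */
inline void add_linear_edge_example(Program* program, unsigned pred_idx, unsigned succ_idx)
{
   program->blocks[pred_idx].linear_succs.push_back(succ_idx);
   program->blocks[succ_idx].linear_preds.push_back(pred_idx);
}
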
1206 struct live {
1207 /* live temps out per block */
1208 std::vector<std::set<Temp>> live_out;
1209 /* register demand (sgpr/vgpr) per instruction per block */
1210 std::vector<std::vector<RegisterDemand>> register_demand;
1211 };
1212
1213 void select_program(Program *program,
1214 unsigned shader_count,
1215 struct nir_shader *const *shaders,
1216 ac_shader_config* config,
1217 struct radv_shader_args *args);
1218
1219 void lower_wqm(Program* program, live& live_vars,
1220 const struct radv_nir_compiler_options *options);
1221 void lower_bool_phis(Program* program);
1222 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1223 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1224 std::vector<uint16_t> dead_code_analysis(Program *program);
1225 void dominator_tree(Program* program);
1226 void insert_exec_mask(Program *program);
1227 void value_numbering(Program* program);
1228 void optimize(Program* program);
1229 void setup_reduce_temp(Program* program);
1230 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1231 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
1232 void ssa_elimination(Program* program);
1233 void lower_to_hw_instr(Program* program);
1234 void schedule_program(Program* program, live& live_vars);
1235 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1236 void insert_wait_states(Program* program);
1237 void insert_NOPs(Program* program);
1238 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1239 void print_asm(Program *program, std::vector<uint32_t>& binary,
1240 unsigned exec_size, std::ostream& out);
1241 void validate(Program* program, FILE *output);
1242 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1243 #ifndef NDEBUG
1244 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1245 #else
1246 #define perfwarn(cond, msg, ...)
1247 #endif
1248
1249 void aco_print_instr(Instruction *instr, FILE *output);
1250 void aco_print_program(Program *program, FILE *output);
1251
1252 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1253 uint16_t get_extra_sgprs(Program *program);
1254
1255 /* get the number of sgprs that must be allocated to address the given number of sgprs */
1256 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1257
1258 /* return number of addressable SGPRs for max_waves */
1259 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1260
1261 typedef struct {
1262 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1263 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1264 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1265 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1266 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1267 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1268 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1269 } Info;
1270
1271 extern const Info instr_info;
1272
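/* Lookup sketch (the helper name is illustrative): per-opcode properties are indexed
 * with the opcode value, e.g. the format table or the per-generation encoding tables. */
inline Format instr_format_example(aco_opcode op)
{
   return instr_info.format[static_cast<int>(op)];
}
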
1273 }
1274
1275 #endif /* ACO_IR_H */
1276