aco: don't enable store_global for helper invocations
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Bit flags for the global debug_flags bitmask declared above. */
enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2, /* validate register assignment */
   DEBUG_PERFWARN = 0x4,
};
52
53 /**
54 * Representation of the instruction's microcode encoding format
55 * Note: Some Vector ALU Formats can be combined, such that:
56 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
57 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
58 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
59 *
60 * (*) The same is applicable for VOP1 and VOPC instructions.
61 */
62 enum class Format : std::uint16_t {
63 /* Pseudo Instruction Format */
64 PSEUDO = 0,
65 /* Scalar ALU & Control Formats */
66 SOP1 = 1,
67 SOP2 = 2,
68 SOPK = 3,
69 SOPP = 4,
70 SOPC = 5,
71 /* Scalar Memory Format */
72 SMEM = 6,
73 /* LDS/GDS Format */
74 DS = 8,
75 /* Vector Memory Buffer Formats */
76 MTBUF = 9,
77 MUBUF = 10,
78 /* Vector Memory Image Format */
79 MIMG = 11,
80 /* Export Format */
81 EXP = 12,
82 /* Flat Formats */
83 FLAT = 13,
84 GLOBAL = 14,
85 SCRATCH = 15,
86
87 PSEUDO_BRANCH = 16,
88 PSEUDO_BARRIER = 17,
89 PSEUDO_REDUCTION = 18,
90
91 /* Vector ALU Formats */
92 VOP1 = 1 << 8,
93 VOP2 = 1 << 9,
94 VOPC = 1 << 10,
95 VOP3 = 1 << 11,
96 VOP3A = 1 << 11,
97 VOP3B = 1 << 11,
98 VOP3P = 1 << 12,
99 /* Vector Parameter Interpolation Format */
100 VINTRP = 1 << 13,
101 DPP = 1 << 14,
102 SDWA = 1 << 15,
103 };
104
/* Bitmask describing which kinds of memory an instruction interacts with;
 * used by instruction scheduling (see get_barrier_interaction()). */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4, /* number of flag bits above, not a flag itself */
};
113
/* Rounding modes, matching the encoding of the MODE register's round bits. */
enum fp_round {
   fp_round_ne = 0, /* nearest even */
   fp_round_pi = 1, /* toward +infinity */
   fp_round_ni = 2, /* toward -infinity */
   fp_round_tz = 3, /* toward zero */
};
120
/* Denormal handling, matching the encoding of the MODE register's denorm bits. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
127
/* Floating-point state: the hardware MODE register bits plus constraints that
 * restrict which FP optimizations the compiler may perform. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      uint8_t val = 0; /* all 8 MODE bits at once, for fast comparison */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the hardware MODE bits must match exactly and this
    * mode must be at least as strict in every software constraint. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
160
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
/* Register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr, /* used for WWM and spills to vgpr (see RegClass) */
};
171
/* Register class: type and size (in dwords) of a temporary.
 * Encoding: bits [4:0] = size in dwords, bit 5 = vgpr, bit 6 = linear vgpr.
 * Scalar classes (rc <= s16) are always considered linear. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete; /* avoid accidental truthiness tests */

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
229 /**
230 * Temp Class
231 * Each temporary virtual register has a
232 * register class (i.e. size and type)
233 * and SSA id.
234 */
235 struct Temp {
236 Temp() = default;
237 constexpr Temp(uint32_t id, RegClass cls) noexcept
238 : id_(id), reg_class(cls) {}
239
240 constexpr uint32_t id() const noexcept { return id_; }
241 constexpr RegClass regClass() const noexcept { return reg_class; }
242
243 constexpr unsigned size() const noexcept { return reg_class.size(); }
244 constexpr RegType type() const noexcept { return reg_class.type(); }
245 constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
246
247 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
248 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
249 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
250
251 private:
252 uint32_t id_:24;
253 RegClass reg_class;
254 };
255
256 /**
257 * PhysReg
258 * Represents the physical register for each
259 * Operand and Definition.
260 */
261 struct PhysReg {
262 constexpr PhysReg() = default;
263 explicit constexpr PhysReg(unsigned r) : reg(r) {}
264 constexpr operator unsigned() const { return reg; }
265
266 uint16_t reg = 0;
267 };
268
/* helper expressions for special registers; the numbers are the scalar
 * operand encodings of these architectural registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value
283 * Temporary registers get mapped to physical register during RA
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 if (v <= 64)
308 setFixed(PhysReg{128 + v});
309 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
310 setFixed(PhysReg{192 - v});
311 else if (v == 0x3f000000) /* 0.5 */
312 setFixed(PhysReg{240});
313 else if (v == 0xbf000000) /* -0.5 */
314 setFixed(PhysReg{241});
315 else if (v == 0x3f800000) /* 1.0 */
316 setFixed(PhysReg{242});
317 else if (v == 0xbf800000) /* -1.0 */
318 setFixed(PhysReg{243});
319 else if (v == 0x40000000) /* 2.0 */
320 setFixed(PhysReg{244});
321 else if (v == 0xc0000000) /* -2.0 */
322 setFixed(PhysReg{245});
323 else if (v == 0x40800000) /* 4.0 */
324 setFixed(PhysReg{246});
325 else if (v == 0xc0800000) /* -4.0 */
326 setFixed(PhysReg{247});
327 else if (v == 0x3e22f983) /* 1/(2*PI) */
328 setFixed(PhysReg{248});
329 else /* Literal Constant */
330 setFixed(PhysReg{255});
331 };
332 explicit Operand(uint64_t v) noexcept
333 {
334 isConstant_ = true;
335 is64BitConst_ = true;
336 if (v <= 64)
337 setFixed(PhysReg{128 + (uint32_t) v});
338 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
339 setFixed(PhysReg{192 - (uint32_t) v});
340 else if (v == 0x3FE0000000000000) /* 0.5 */
341 setFixed(PhysReg{240});
342 else if (v == 0xBFE0000000000000) /* -0.5 */
343 setFixed(PhysReg{241});
344 else if (v == 0x3FF0000000000000) /* 1.0 */
345 setFixed(PhysReg{242});
346 else if (v == 0xBFF0000000000000) /* -1.0 */
347 setFixed(PhysReg{243});
348 else if (v == 0x4000000000000000) /* 2.0 */
349 setFixed(PhysReg{244});
350 else if (v == 0xC000000000000000) /* -2.0 */
351 setFixed(PhysReg{245});
352 else if (v == 0x4010000000000000) /* 4.0 */
353 setFixed(PhysReg{246});
354 else if (v == 0xC010000000000000) /* -4.0 */
355 setFixed(PhysReg{247});
356 else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
357 setFixed(PhysReg{248});
358 else { /* Literal Constant: we don't know if it is a long or double.*/
359 isConstant_ = 0;
360 assert(false && "attempt to create a 64-bit literal constant");
361 }
362 };
363 explicit Operand(RegClass type) noexcept
364 {
365 isUndef_ = true;
366 data_.temp = Temp(0, type);
367 setFixed(PhysReg{128});
368 };
369 explicit Operand(PhysReg reg, RegClass type) noexcept
370 {
371 data_.temp = Temp(0, type);
372 setFixed(reg);
373 }
374
375 constexpr bool isTemp() const noexcept
376 {
377 return isTemp_;
378 }
379
380 constexpr void setTemp(Temp t) noexcept {
381 assert(!isConstant_);
382 isTemp_ = true;
383 data_.temp = t;
384 }
385
386 constexpr Temp getTemp() const noexcept
387 {
388 return data_.temp;
389 }
390
391 constexpr uint32_t tempId() const noexcept
392 {
393 return data_.temp.id();
394 }
395
396 constexpr bool hasRegClass() const noexcept
397 {
398 return isTemp() || isUndefined();
399 }
400
401 constexpr RegClass regClass() const noexcept
402 {
403 return data_.temp.regClass();
404 }
405
406 constexpr unsigned size() const noexcept
407 {
408 if (isConstant())
409 return is64BitConst_ ? 2 : 1;
410 else
411 return data_.temp.size();
412 }
413
414 constexpr bool isFixed() const noexcept
415 {
416 return isFixed_;
417 }
418
419 constexpr PhysReg physReg() const noexcept
420 {
421 return reg_;
422 }
423
424 constexpr void setFixed(PhysReg reg) noexcept
425 {
426 isFixed_ = reg != unsigned(-1);
427 reg_ = reg;
428 }
429
430 constexpr bool isConstant() const noexcept
431 {
432 return isConstant_;
433 }
434
435 constexpr bool isLiteral() const noexcept
436 {
437 return isConstant() && reg_ == 255;
438 }
439
440 constexpr bool isUndefined() const noexcept
441 {
442 return isUndef_;
443 }
444
445 constexpr uint32_t constantValue() const noexcept
446 {
447 return data_.i;
448 }
449
450 constexpr bool constantEquals(uint32_t cmp) const noexcept
451 {
452 return isConstant() && constantValue() == cmp;
453 }
454
455 constexpr void setKill(bool flag) noexcept
456 {
457 isKill_ = flag;
458 if (!flag)
459 setFirstKill(false);
460 }
461
462 constexpr bool isKill() const noexcept
463 {
464 return isKill_ || isFirstKill();
465 }
466
467 constexpr void setFirstKill(bool flag) noexcept
468 {
469 isFirstKill_ = flag;
470 if (flag)
471 setKill(flag);
472 }
473
474 /* When there are multiple operands killing the same temporary,
475 * isFirstKill() is only returns true for the first one. */
476 constexpr bool isFirstKill() const noexcept
477 {
478 return isFirstKill_;
479 }
480
481 private:
482 union {
483 uint32_t i;
484 float f;
485 Temp temp = Temp(0, s1);
486 } data_;
487 PhysReg reg_;
488 union {
489 struct {
490 uint8_t isTemp_:1;
491 uint8_t isFixed_:1;
492 uint8_t isConstant_:1;
493 uint8_t isKill_:1;
494 uint8_t isUndef_:1;
495 uint8_t isFirstKill_:1;
496 uint8_t is64BitConst_:1;
497 };
498 /* can't initialize bit-fields in c++11, so work around using a union */
499 uint8_t control_ = 0;
500 };
501 };
502
503 /**
504 * Definition Class
505 * Definitions are the results of Instructions
506 * and refer to temporary virtual registers
507 * which are later mapped to physical registers
508 */
509 class Definition final
510 {
511 public:
512 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
513 Definition(uint32_t index, RegClass type) noexcept
514 : temp(index, type) {}
515 explicit Definition(Temp tmp) noexcept
516 : temp(tmp) {}
517 Definition(PhysReg reg, RegClass type) noexcept
518 : temp(Temp(0, type))
519 {
520 setFixed(reg);
521 }
522 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
523 : temp(Temp(tmpId, type))
524 {
525 setFixed(reg);
526 }
527
528 constexpr bool isTemp() const noexcept
529 {
530 return tempId() > 0;
531 }
532
533 constexpr Temp getTemp() const noexcept
534 {
535 return temp;
536 }
537
538 constexpr uint32_t tempId() const noexcept
539 {
540 return temp.id();
541 }
542
543 constexpr void setTemp(Temp t) noexcept {
544 temp = t;
545 }
546
547 constexpr RegClass regClass() const noexcept
548 {
549 return temp.regClass();
550 }
551
552 constexpr unsigned size() const noexcept
553 {
554 return temp.size();
555 }
556
557 constexpr bool isFixed() const noexcept
558 {
559 return isFixed_;
560 }
561
562 constexpr PhysReg physReg() const noexcept
563 {
564 return reg_;
565 }
566
567 constexpr void setFixed(PhysReg reg) noexcept
568 {
569 isFixed_ = 1;
570 reg_ = reg;
571 }
572
573 constexpr void setHint(PhysReg reg) noexcept
574 {
575 hasHint_ = 1;
576 reg_ = reg;
577 }
578
579 constexpr bool hasHint() const noexcept
580 {
581 return hasHint_;
582 }
583
584 constexpr void setKill(bool flag) noexcept
585 {
586 isKill_ = flag;
587 }
588
589 constexpr bool isKill() const noexcept
590 {
591 return isKill_;
592 }
593
594 private:
595 Temp temp = Temp(0, s1);
596 PhysReg reg_;
597 union {
598 struct {
599 uint8_t isFixed_:1;
600 uint8_t hasHint_:1;
601 uint8_t isKill_:1;
602 };
603 /* can't initialize bit-fields in c++11, so work around using a union */
604 uint8_t control_ = 0;
605 };
606 };
607
608 class Block;
609
/* Base class of all IR instructions. Instances are created with
 * create_instruction(), which allocates the operand/definition arrays as
 * trailing storage of the same allocation. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch space for per-pass bookkeeping */

   /* Both spans point into the trailing storage allocated by
    * create_instruction(); they are not independently owned. */
   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* True for any VALU encoding. Plain (non-flag) formats are all below
    * (1 << 8) and therefore never match these bit tests. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* Buffer/image memory; FLAT/GLOBAL/SCRATCH are deliberately excluded. */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* Only detects operands explicitly fixed to the exec register; implicit
    * exec reads are not reported. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
677
/* SOPK: scalar ALU with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
};
681
/* SOPP: scalar program control (branches, waitcnt, etc.). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* NOTE(review): presumably the branch-target block index — confirm against users */
};
686
/* SOPC/SOP1/SOP2: scalar formats carrying no fields beyond the base class. */
struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
695
696 /**
697 * Scalar Memory Format:
698 * For s_(buffer_)load_dword*:
699 * Operand(0): SBASE - SGPR-pair which provides base address
700 * Operand(1): Offset - immediate (un)signed offset or SGPR
701 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
702 * Operand(n-1): SOffset - SGPR offset (Vega only)
703 *
704 * Having no operands is also valid for instructions such as s_dcache_inv.
705 *
706 */
707 struct SMEM_instruction : public Instruction {
708 bool glc; /* VI+: globally coherent */
709 bool dlc; /* NAVI: device level coherent */
710 bool nv; /* VEGA only: Non-volatile */
711 bool can_reorder;
712 bool disable_wqm;
713 barrier_interaction barrier;
714 };
715
/* VOP1/VOP2/VOPC: vector ALU formats carrying no fields beyond the base class. */
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};
724
/* VOP3A: VALU encoding with input modifiers, clamp and output modifiers. */
struct VOP3A_instruction : public Instruction {
   bool abs[3]; /* per-source absolute value */
   bool opsel[4]; /* 16-bit operand select; [3] applies to the destination */
   bool clamp;
   unsigned omod; /* output modifier: 0=none, otherwise *2/*4/*0.5 encodings */
   bool neg[3]; /* per-source negation */
};
732
733 /**
734 * Data Parallel Primitives Format:
735 * This format can be used for VOP1, VOP2 or VOPC instructions.
736 * The swizzle applies to the src0 operand.
737 *
738 */
739 struct DPP_instruction : public Instruction {
740 uint16_t dpp_ctrl;
741 uint8_t row_mask;
742 uint8_t bank_mask;
743 bool abs[2];
744 bool neg[2];
745 bool bound_ctrl;
746 };
747
/* VINTRP: parameter interpolation; selects attribute and component. */
struct Interp_instruction : public Instruction {
   unsigned attribute;
   unsigned component;
};
752
753 /**
754 * Local and Global Data Sharing instructions
755 * Operand(0): ADDR - VGPR which supplies the address.
756 * Operand(1): DATA0 - First data VGPR.
757 * Operand(2): DATA1 - Second data VGPR.
758 * Operand(n-1): M0 - LDS size.
759 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
760 *
761 */
762 struct DS_instruction : public Instruction {
763 int16_t offset0;
764 int8_t offset1;
765 bool gds;
766 };
767
768 /**
769 * Vector Memory Untyped-buffer Instructions
770 * Operand(0): VADDR - Address source. Can carry an index and/or offset
771 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
772 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
773 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
774 *
775 */
776 struct MUBUF_instruction : public Instruction {
777 unsigned offset; /* Unsigned byte offset - 12 bit */
778 bool offen; /* Supply an offset from VGPR (VADDR) */
779 bool idxen; /* Supply an index from VGPR (VADDR) */
780 bool glc; /* globally coherent */
781 bool dlc; /* NAVI: device level coherent */
782 bool slc; /* system level coherent */
783 bool tfe; /* texture fail enable */
784 bool lds; /* Return read-data to LDS instead of VGPRs */
785 bool disable_wqm; /* Require an exec mask without helper invocations */
786 bool can_reorder;
787 barrier_interaction barrier;
788 };
789
790 /**
791 * Vector Memory Typed-buffer Instructions
792 * Operand(0): VADDR - Address source. Can carry an index and/or offset
793 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
794 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
795 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
796 *
797 */
798 struct MTBUF_instruction : public Instruction {
799 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
800 uint8_t nfmt : 3; /* Numeric format of data in memory */
801 unsigned offset; /* Unsigned byte offset - 12 bit */
802 bool offen; /* Supply an offset from VGPR (VADDR) */
803 bool idxen; /* Supply an index from VGPR (VADDR) */
804 bool glc; /* globally coherent */
805 bool dlc; /* NAVI: device level coherent */
806 bool slc; /* system level coherent */
807 bool tfe; /* texture fail enable */
808 bool disable_wqm; /* Require an exec mask without helper invocations */
809 bool can_reorder;
810 barrier_interaction barrier;
811 };
812
813 /**
814 * Vector Memory Image Instructions
815 * Operand(0): VADDR - Address source. Can carry an offset or an index.
816 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
817 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
818 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
819 *
820 */
821 struct MIMG_instruction : public Instruction {
822 unsigned dmask; /* Data VGPR enable mask */
823 unsigned dim; /* NAVI: dimensionality */
824 bool unrm; /* Force address to be un-normalized */
825 bool dlc; /* NAVI: device level coherent */
826 bool glc; /* globally coherent */
827 bool slc; /* system level coherent */
828 bool tfe; /* texture fail enable */
829 bool da; /* declare an array */
830 bool lwe; /* Force data to be un-normalized */
831 bool r128; /* NAVI: Texture resource size */
832 bool a16; /* VEGA, NAVI: Address components are 16-bits */
833 bool d16; /* Convert 32-bit data to 16-bit data */
834 bool disable_wqm; /* Require an exec mask without helper invocations */
835 bool can_reorder;
836 barrier_interaction barrier;
837 };
838
839 /**
840 * Flat/Scratch/Global Instructions
841 * Operand(0): ADDR
842 * Operand(1): SADDR
843 * Operand(2) / Definition(0): DATA/VDST
844 *
845 */
846 struct FLAT_instruction : public Instruction {
847 uint16_t offset; /* Vega/Navi only */
848 bool slc; /* system level coherent */
849 bool glc; /* globally coherent */
850 bool dlc; /* NAVI: device level coherent */
851 bool lds;
852 bool nv;
853 bool disable_wqm;
854 };
855
/* EXP: export to color/position/parameter targets. */
struct Export_instruction : public Instruction {
   unsigned enabled_mask; /* which components of the operands are enabled */
   unsigned dest; /* export target index */
   bool compressed; /* data is 16-bit packed */
   bool done; /* last export of the shader */
   bool valid_mask;
};
863
/* Generic pseudo instruction (parallelcopies, create_vector, etc.),
 * lowered before assembly. */
struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc; /* lowering must preserve scc across this instruction */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
868
/* Branch pseudo instruction, lowered to real branches late. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
876
/* Barrier pseudo instruction; carries no fields beyond the base class. */
struct Pseudo_barrier_instruction : public Instruction {
};
879
/* Reduction operations, mostly in 32/64-bit pairs (integer add/mul/min/max,
 * float add/mul/min/max, bitwise and/or/xor). */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* special case: GFX10 wave64 lane permute helper */
};
896
897 /**
898 * Subgroup Reduction Instructions, everything except for the data to be
899 * reduced and the result as inserted by setup_reduce_temp().
900 * Operand(0): data to be reduced
901 * Operand(1): reduce temporary
902 * Operand(2): vector temporary
903 * Definition(0): result
904 * Definition(1): scalar temporary
905 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
906 * Definition(3): scc clobber
907 * Definition(4): vcc clobber
908 *
909 */
910 struct Pseudo_reduction_instruction : public Instruction {
911 ReduceOp reduce_op;
912 unsigned cluster_size; // must be 0 for scans
913 };
914
/* Deleter matching the calloc() allocation done by create_instruction(). */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* Owning instruction pointer; releases with free(), not delete. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
923
/* Allocates and zero-initializes an instruction of type T with trailing
 * storage for its operand and definition arrays (one allocation total).
 * The caller owns the result and must release it with free(), typically by
 * wrapping it in an aco_ptr.
 * NOTE(review): assumes Operand/Definition need no extra alignment padding
 * at `data + sizeof(T)`, and that calloc() does not fail — confirm. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* zero-init: all flags start cleared */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* point the spans into the trailing storage */
   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
939
/* Returns true if the instruction uses VALU modifiers (abs/neg/opsel/clamp/
 * omod) or an encoding (DPP/SDWA) that implies modifier support. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   /* all VOP3 variants are inspected through the VOP3A field layout */
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   /* opsel[3] is the destination opsel bit */
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}
953
954 constexpr bool is_phi(Instruction* instr)
955 {
956 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
957 }
958
959 static inline bool is_phi(aco_ptr<Instruction>& instr)
960 {
961 return is_phi(instr.get());
962 }
963
964 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
965 {
966 switch (instr->format) {
967 case Format::SMEM:
968 return static_cast<SMEM_instruction*>(instr)->barrier;
969 case Format::MUBUF:
970 return static_cast<MUBUF_instruction*>(instr)->barrier;
971 case Format::MIMG:
972 return static_cast<MIMG_instruction*>(instr)->barrier;
973 case Format::FLAT:
974 case Format::GLOBAL:
975 return barrier_buffer;
976 case Format::DS:
977 return barrier_shared;
978 default:
979 return barrier_none;
980 }
981 }
982
/* Flags stored in Block::kind describing a block's role in the CFG. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1002
1003
1004 struct RegisterDemand {
1005 constexpr RegisterDemand() = default;
1006 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1007 : vgpr{v}, sgpr{s} {}
1008 int16_t vgpr = 0;
1009 int16_t sgpr = 0;
1010
1011 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1012 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1013 }
1014
1015 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1016 return vgpr > other.vgpr || sgpr > other.sgpr;
1017 }
1018
1019 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1020 if (t.type() == RegType::sgpr)
1021 return RegisterDemand( vgpr, sgpr + t.size() );
1022 else
1023 return RegisterDemand( vgpr + t.size(), sgpr );
1024 }
1025
1026 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1027 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1028 }
1029
1030 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1031 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1032 }
1033
1034 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1035 vgpr += other.vgpr;
1036 sgpr += other.sgpr;
1037 return *this;
1038 }
1039
1040 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1041 vgpr -= other.vgpr;
1042 sgpr -= other.sgpr;
1043 return *this;
1044 }
1045
1046 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1047 if (t.type() == RegType::sgpr)
1048 sgpr += t.size();
1049 else
1050 vgpr += t.size();
1051 return *this;
1052 }
1053
1054 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1055 if (t.type() == RegType::sgpr)
1056 sgpr -= t.size();
1057 else
1058 vgpr -= t.size();
1059 return *this;
1060 }
1061
1062 constexpr void update(const RegisterDemand other) noexcept {
1063 vgpr = std::max(vgpr, other.vgpr);
1064 sgpr = std::max(sgpr, other.sgpr);
1065 }
1066
1067 };
1068
/* CFG: a basic block with both logical and linear edges.
 * NOTE(review): logical appears to be the divergence-aware CFG and linear the
 * as-executed whole-wave CFG — confirm against the CFG construction code. */
struct Block {
   float_mode fp_mode; /* FP mode in effect for this block */
   unsigned index; /* position in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   int logical_idom = -1; /* immediate dominator block index, -1 if unset */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1094
/* A shader stage is the combination of the software stages (the API shader
 * types contained in the program; several may be set when stages are merged)
 * and the hardware stage the program runs on. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
/* Fixed: TES running as the export shader before geometry uses hw_es (like
 * vertex_es above); hw_gs here was a typo contradicting the comment. */
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1136
/* Top-level container for one compiled shader: the CFG, target/limit
 * information, and the SSA id allocator used when creating temporaries. */
class Program final {
public:
   float_mode next_fp_mode;        /* fp_mode assigned to blocks created after this point */
   std::vector<Block> blocks;      /* basic blocks, indexed by Block::index */
   RegisterDemand max_reg_demand = RegisterDemand();  /* peak demand over the whole program */
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size;             /* threads per wave (e.g. 32 or 64) — set by the driver */
   Stage stage; /* Stage */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false; /* write back SMEM L1 cache at the end of the shader */

   std::vector<uint8_t> constant_data;   /* raw constant data appended after the code */
   Temp private_segment_buffer;          /* resource descriptor temp for scratch access */
   Temp scratch_offset;

   /* target limits/granularities — filled in from the chip configuration */
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */

   /* special registers the program ends up needing */
   bool needs_vcc = false;
   bool needs_xnack_mask = false;
   bool needs_flat_scr = false;

   /* Hand out a fresh SSA id. Ids are limited to 24 bits
    * (presumably because they are packed into a bitfield elsewhere — confirm). */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Next id that allocateId() would return, without consuming it. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   /* Reset the id counter, e.g. after renumbering temporaries. */
   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Append a new empty block (index = current block count) and return it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Append an existing block, re-assigning its index and fp_mode. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;   /* next SSA id; 0 is never handed out */
};
1201
/* Result of liveness analysis, both vectors indexed by Block::index. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1208
/* Instruction selection: build the ACO IR from the NIR shader(s). */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);

/* Lower whole-quad-mode / exact pseudo-instructions. */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
/* Lower boolean phis (divergent booleans live in lane masks). */
void lower_bool_phis(Program* program);
/* Recompute num_waves/max_waves after register demand changed. */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
/* Liveness analysis; fills the 'live' result used by later passes. */
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
/* Returns per-temp use counts; zero means dead. */
std::vector<uint16_t> dead_code_analysis(Program *program);
/* Compute logical/linear immediate dominators (Block::*_idom). */
void dominator_tree(Program* program);
/* Insert exec-mask handling for divergent control flow and discards. */
void insert_exec_mask(Program *program);
/* Global value numbering (CSE). */
void value_numbering(Program* program);
/* Main optimizer (combining, propagation, etc.). */
void optimize(Program* program);
/* Allocate temporaries used by reduction operations. */
void setup_reduce_temp(Program* program);
/* Convert out of SSA towards conventional form for register allocation. */
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
/* Assign physical registers to all temporaries. */
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
/* Remove phis after register allocation (insert parallelcopies). */
void ssa_elimination(Program* program);
/* Lower remaining pseudo-instructions to real hardware instructions. */
void lower_to_hw_instr(Program* program);
/* Instruction scheduler (operates on liveness info). */
void schedule_program(Program* program, live& live_vars);
/* Spill registers to scratch/LDS when demand exceeds the limits. */
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
/* Insert s_waitcnt instructions. */
void insert_wait_states(Program* program);
/* Insert NOPs to satisfy hardware hazards. */
void insert_NOPs(Program* program);
/* Encode the program to machine code; returns the code size. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/* Disassemble 'binary' into 'out'. */
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR validation (enabled via DEBUG_VALIDATE / DEBUG_VALIDATE_RA). */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): the release macro takes a leading 'program' argument that the
 * debug declaration above does not — the two arities differ; verify call sites. */
#define perfwarn(program, cond, msg, ...)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of allocated sgprs required to address a number of sgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);

/* return number of addressable SGPRs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1256
/* Per-opcode metadata tables, indexed by aco_opcode. Generated elsewhere
 * and exposed through the single global 'instr_info' instance. */
typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];   /* hw opcode on GFX9, presumably -1 if unsupported — confirm against generator */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];  /* hw opcode on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];            /* human-readable mnemonic */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];    /* encoding format */
} Info;

extern const Info instr_info;
1267
1268 }
1269
1270 #endif /* ACO_IR_H */
1271