aco: Use common argument handling
[mesa.git] / src / amd / compiler / aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <bitset>
31 #include <memory>
32
33 #include "nir.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
36 #include "aco_opcodes.h"
37 #include "aco_util.h"
38
39 struct radv_nir_compiler_options;
40 struct radv_shader_args;
41 struct radv_shader_info;
42
43 namespace aco {
44
/* Global bitmask of the DEBUG_* options below; the definition (and how it
 * gets populated) lives outside this header. */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
};
52
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 *
 * The scalar/memory/pseudo formats are plain enumerators in the low byte,
 * while each VALU format is a single bit in the high byte so they can be
 * OR'd together (see asVOP3() and Instruction::isVALU()).
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format (7 is unassigned) */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   /* VOP3, VOP3A and VOP3B intentionally share one bit; they are
    * distinguished by opcode, not by format */
   VOP3 = 1 << 11,
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   VOP3P = 1 << 12,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 13,
   DPP = 1 << 14,
   SDWA = 1 << 15,
};
104
/* Bitmask describing which class(es) of memory an instruction accesses or
 * orders against; barrier_count is the number of distinct bits above it. */
enum barrier_interaction {
   barrier_none = 0,
   barrier_buffer = 0x1,
   barrier_image = 0x2,
   barrier_atomic = 0x4,
   barrier_shared = 0x8,
   barrier_count = 4,
};
113
/* Float rounding modes; values match the 2-bit round fields of the hardware
 * MODE register. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero (truncate) */
};
120
/* Denormal handling; values match the 2-bit denorm fields of the MODE
 * register (each bit enables input/output denormals respectively). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};
127
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;     /* rounding mode for 32-bit floats */
         fp_round round16_64:2;  /* rounding mode for 16-/64-bit floats */
         unsigned denorm32:2;    /* fp_denorm bits for 32-bit floats */
         unsigned denorm16_64:2; /* fp_denorm bits for 16-/64-bit floats */
      };
      uint8_t val = 0;           /* the four fields viewed as a single byte */
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE byte must match exactly and every
    * strictness flag set in "other" must also be set here. */
   bool canReplace(float_mode other) const noexcept {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
160
161 constexpr Format asVOP3(Format format) {
162 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
163 };
164
/* The register file a value lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar GPR */
   vgpr,        /* vector GPR */
   linear_vgpr, /* vgpr with linear liveness (see RegClass::v*_linear) */
};
171
struct RegClass {

   /* Encoding: bits [4:0] hold the size in dwords, bit 5 marks a vgpr
    * class and bit 6 marks a linear vgpr class. */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* sgpr classes are exactly the values <= s16 (no bit 5/6 set) */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   /* size in dwords */
   constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
   /* sgprs always have linear liveness; vgprs only when bit 6 is set */
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }

private:
   RC rc;
};
212
213 /* transitional helper expressions */
214 static constexpr RegClass s1{RegClass::s1};
215 static constexpr RegClass s2{RegClass::s2};
216 static constexpr RegClass s3{RegClass::s3};
217 static constexpr RegClass s4{RegClass::s4};
218 static constexpr RegClass s8{RegClass::s8};
219 static constexpr RegClass s16{RegClass::s16};
220 static constexpr RegClass v1{RegClass::v1};
221 static constexpr RegClass v2{RegClass::v2};
222 static constexpr RegClass v3{RegClass::v3};
223 static constexpr RegClass v4{RegClass::v4};
224 static constexpr RegClass v5{RegClass::v5};
225 static constexpr RegClass v6{RegClass::v6};
226 static constexpr RegClass v7{RegClass::v7};
227 static constexpr RegClass v8{RegClass::v8};
228
/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() = default;
   constexpr Temp(uint32_t id, RegClass cls) noexcept
      : id_(id), reg_class(cls) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return reg_class; }

   constexpr unsigned size() const noexcept { return reg_class.size(); }
   constexpr RegType type() const noexcept { return reg_class.type(); }
   constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }

   /* comparisons look at the SSA id only; the register class is ignored */
   constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   uint32_t id_:24;    /* SSA id, packed to 24 bits so Temp stays compact */
   RegClass reg_class;
};
255
/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 * The implicit conversion to unsigned allows direct comparison with
 * register indices.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg(r) {}
   constexpr operator unsigned() const { return reg; }

   uint16_t reg = 0;
};
268
/* helper expressions for special registers (values are the hardware SGPR
 * encodings) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};       /* vcc_lo */
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};      /* alias of exec_lo */
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg scc{253};
277
278 /**
279 * Operand Class
280 * Initially, each Operand refers to either
281 * a temporary virtual register
282 * or to a constant value
283 * Temporary registers get mapped to physical register during RA
284 * Constant values are inlined into the instruction sequence.
285 */
286 class Operand final
287 {
288 public:
289 constexpr Operand()
290 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
291 isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
292
293 explicit Operand(Temp r) noexcept
294 {
295 data_.temp = r;
296 if (r.id()) {
297 isTemp_ = true;
298 } else {
299 isUndef_ = true;
300 setFixed(PhysReg{128});
301 }
302 };
303 explicit Operand(uint32_t v) noexcept
304 {
305 data_.i = v;
306 isConstant_ = true;
307 if (v <= 64)
308 setFixed(PhysReg{128 + v});
309 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
310 setFixed(PhysReg{192 - v});
311 else if (v == 0x3f000000) /* 0.5 */
312 setFixed(PhysReg{240});
313 else if (v == 0xbf000000) /* -0.5 */
314 setFixed(PhysReg{241});
315 else if (v == 0x3f800000) /* 1.0 */
316 setFixed(PhysReg{242});
317 else if (v == 0xbf800000) /* -1.0 */
318 setFixed(PhysReg{243});
319 else if (v == 0x40000000) /* 2.0 */
320 setFixed(PhysReg{244});
321 else if (v == 0xc0000000) /* -2.0 */
322 setFixed(PhysReg{245});
323 else if (v == 0x40800000) /* 4.0 */
324 setFixed(PhysReg{246});
325 else if (v == 0xc0800000) /* -4.0 */
326 setFixed(PhysReg{247});
327 else if (v == 0x3e22f983) /* 1/(2*PI) */
328 setFixed(PhysReg{248});
329 else /* Literal Constant */
330 setFixed(PhysReg{255});
331 };
332 explicit Operand(uint64_t v) noexcept
333 {
334 isConstant_ = true;
335 is64BitConst_ = true;
336 if (v <= 64)
337 setFixed(PhysReg{128 + (uint32_t) v});
338 else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
339 setFixed(PhysReg{192 - (uint32_t) v});
340 else if (v == 0x3FE0000000000000) /* 0.5 */
341 setFixed(PhysReg{240});
342 else if (v == 0xBFE0000000000000) /* -0.5 */
343 setFixed(PhysReg{241});
344 else if (v == 0x3FF0000000000000) /* 1.0 */
345 setFixed(PhysReg{242});
346 else if (v == 0xBFF0000000000000) /* -1.0 */
347 setFixed(PhysReg{243});
348 else if (v == 0x4000000000000000) /* 2.0 */
349 setFixed(PhysReg{244});
350 else if (v == 0xC000000000000000) /* -2.0 */
351 setFixed(PhysReg{245});
352 else if (v == 0x4010000000000000) /* 4.0 */
353 setFixed(PhysReg{246});
354 else if (v == 0xC010000000000000) /* -4.0 */
355 setFixed(PhysReg{247});
356 else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */
357 setFixed(PhysReg{248});
358 else { /* Literal Constant: we don't know if it is a long or double.*/
359 isConstant_ = 0;
360 assert(false && "attempt to create a 64-bit literal constant");
361 }
362 };
363 explicit Operand(RegClass type) noexcept
364 {
365 isUndef_ = true;
366 data_.temp = Temp(0, type);
367 setFixed(PhysReg{128});
368 };
369 explicit Operand(PhysReg reg, RegClass type) noexcept
370 {
371 data_.temp = Temp(0, type);
372 setFixed(reg);
373 }
374
375 constexpr bool isTemp() const noexcept
376 {
377 return isTemp_;
378 }
379
380 constexpr void setTemp(Temp t) noexcept {
381 assert(!isConstant_);
382 isTemp_ = true;
383 data_.temp = t;
384 }
385
386 constexpr Temp getTemp() const noexcept
387 {
388 return data_.temp;
389 }
390
391 constexpr uint32_t tempId() const noexcept
392 {
393 return data_.temp.id();
394 }
395
396 constexpr bool hasRegClass() const noexcept
397 {
398 return isTemp() || isUndefined();
399 }
400
401 constexpr RegClass regClass() const noexcept
402 {
403 return data_.temp.regClass();
404 }
405
406 constexpr unsigned size() const noexcept
407 {
408 if (isConstant())
409 return is64BitConst_ ? 2 : 1;
410 else
411 return data_.temp.size();
412 }
413
414 constexpr bool isFixed() const noexcept
415 {
416 return isFixed_;
417 }
418
419 constexpr PhysReg physReg() const noexcept
420 {
421 return reg_;
422 }
423
424 constexpr void setFixed(PhysReg reg) noexcept
425 {
426 isFixed_ = reg != unsigned(-1);
427 reg_ = reg;
428 }
429
430 constexpr bool isConstant() const noexcept
431 {
432 return isConstant_;
433 }
434
435 constexpr bool isLiteral() const noexcept
436 {
437 return isConstant() && reg_ == 255;
438 }
439
440 constexpr bool isUndefined() const noexcept
441 {
442 return isUndef_;
443 }
444
445 constexpr uint32_t constantValue() const noexcept
446 {
447 return data_.i;
448 }
449
450 constexpr bool constantEquals(uint32_t cmp) const noexcept
451 {
452 return isConstant() && constantValue() == cmp;
453 }
454
455 constexpr void setKill(bool flag) noexcept
456 {
457 isKill_ = flag;
458 if (!flag)
459 setFirstKill(false);
460 }
461
462 constexpr bool isKill() const noexcept
463 {
464 return isKill_ || isFirstKill();
465 }
466
467 constexpr void setFirstKill(bool flag) noexcept
468 {
469 isFirstKill_ = flag;
470 if (flag)
471 setKill(flag);
472 }
473
474 /* When there are multiple operands killing the same temporary,
475 * isFirstKill() is only returns true for the first one. */
476 constexpr bool isFirstKill() const noexcept
477 {
478 return isFirstKill_;
479 }
480
481 private:
482 union {
483 uint32_t i;
484 float f;
485 Temp temp = Temp(0, s1);
486 } data_;
487 PhysReg reg_;
488 union {
489 struct {
490 uint8_t isTemp_:1;
491 uint8_t isFixed_:1;
492 uint8_t isConstant_:1;
493 uint8_t isKill_:1;
494 uint8_t isUndef_:1;
495 uint8_t isFirstKill_:1;
496 uint8_t is64BitConst_:1;
497 };
498 /* can't initialize bit-fields in c++11, so work around using a union */
499 uint8_t control_ = 0;
500 };
501 };
502
/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final
{
public:
   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
   Definition(uint32_t index, RegClass type) noexcept
      : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept
      : temp(tmp) {}
   /* fixed to a physical register without a temporary (id 0) */
   Definition(PhysReg reg, RegClass type) noexcept
      : temp(Temp(0, type))
   {
      setFixed(reg);
   }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
      : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* id 0 is reserved for "no temporary" */
   constexpr bool isTemp() const noexcept
   {
      return tempId() > 0;
   }

   constexpr Temp getTemp() const noexcept
   {
      return temp;
   }

   constexpr uint32_t tempId() const noexcept
   {
      return temp.id();
   }

   constexpr void setTemp(Temp t) noexcept {
      temp = t;
   }

   constexpr RegClass regClass() const noexcept
   {
      return temp.regClass();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      return temp.size();
   }

   constexpr bool isFixed() const noexcept
   {
      return isFixed_;
   }

   constexpr PhysReg physReg() const noexcept
   {
      return reg_;
   }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* NOTE: the hint shares reg_ with the fixed register, so a later
    * setFixed() overwrites the hint (and vice versa) */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept
   {
      return hasHint_;
   }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
   }

   constexpr bool isKill() const noexcept
   {
      return isKill_;
   }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_:1;
         uint8_t hasHint_:1;
         uint8_t isKill_:1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
607
class Block;

/* Base of all instructions: the opcode, the microcode encoding format and
 * spans of operands (sources) and definitions (results). The spans point
 * into trailing storage of the same allocation (see create_instruction()). */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch bits, meaning is per-pass */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* VALU formats are single bits which may be OR'd together, so these
    * predicates test bits rather than comparing for equality. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image memory instructions (FLAT/GLOBAL/SCRATCH not included) */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B) ||
             format == Format::VOP3P;
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
677
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit immediate encoded in the instruction word */
};

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* branch-target block index; presumably negative when the
               * instruction is not a branch -- TODO confirm at use sites */
};

struct SOPC_instruction : public Instruction {
};

struct SOP1_instruction : public Instruction {
};

struct SOP2_instruction : public Instruction {
};
695
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   bool glc; /* VI+: globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool nv; /* VEGA only: Non-volatile */
   bool can_reorder;  /* false forbids reordering against other memory ops */
   bool disable_wqm;  /* requires an exec mask without helper invocations */
   barrier_interaction barrier; /* memory classes this access orders against */
};
715
struct VOP1_instruction : public Instruction {
};

struct VOP2_instruction : public Instruction {
};

struct VOPC_instruction : public Instruction {
};

struct VOP3A_instruction : public Instruction {
   bool abs[3];   /* take the absolute value of source i */
   bool opsel[4]; /* operand-select bits: sources 0..2 plus the destination */
   bool clamp;    /* clamp the result to [0, 1] */
   unsigned omod; /* output modifier: 0 = none, 1 = *2, 2 = *4, 3 = /2 */
   bool neg[3];   /* negate source i */
};
732
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   uint16_t dpp_ctrl;  /* hardware DPP_CTRL swizzle pattern */
   uint8_t row_mask;   /* 4-bit row write-enable mask */
   uint8_t bank_mask;  /* 4-bit bank write-enable mask */
   bool abs[2];        /* absolute value of source i */
   bool neg[2];        /* negate source i */
   bool bound_ctrl;    /* controls the result for invalid/inactive lanes */
};

struct Interp_instruction : public Instruction {
   unsigned attribute; /* attribute index to interpolate */
   unsigned component; /* component (x/y/z/w) within the attribute */
};
752
/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   int16_t offset0; /* NOTE(review): the HW offset fields are unsigned
                     * (16-bit / 8-bit); signed types here could mis-encode
                     * large offsets -- confirm against the assembler */
   int8_t offset1;
   bool gds;        /* operate on GDS instead of LDS */
};
767
/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool lds; /* Return read-data to LDS instead of VGPRs */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* false forbids reordering against other memory ops */
   barrier_interaction barrier; /* memory classes this access orders against */
};
789
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): VADDR - Address source. Can carry an index and/or offset
 * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   uint8_t dfmt : 4; /* Data Format of data in memory buffer */
   uint8_t nfmt : 3; /* Numeric format of data in memory */
   unsigned offset; /* Unsigned byte offset - 12 bit */
   bool offen; /* Supply an offset from VGPR (VADDR) */
   bool idxen; /* Supply an index from VGPR (VADDR) */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* false forbids reordering against other memory ops */
   barrier_interaction barrier; /* memory classes this access orders against */
};
812
/**
 * Vector Memory Image Instructions
 * Operand(0): VADDR - Address source. Can carry an offset or an index.
 * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 *
 */
struct MIMG_instruction : public Instruction {
   unsigned dmask; /* Data VGPR enable mask */
   unsigned dim; /* NAVI: dimensionality */
   bool unrm; /* Force address to be un-normalized */
   bool dlc; /* NAVI: device level coherent */
   bool glc; /* globally coherent */
   bool slc; /* system level coherent */
   bool tfe; /* texture fail enable */
   bool da; /* declare an array */
   bool lwe; /* LOD warning enable (previous comment was a copy-paste of unrm) */
   bool r128; /* NAVI: Texture resource size */
   bool a16; /* VEGA, NAVI: Address components are 16-bits */
   bool d16; /* Convert 32-bit data to 16-bit data */
   bool disable_wqm; /* Require an exec mask without helper invocations */
   bool can_reorder; /* false forbids reordering against other memory ops */
   barrier_interaction barrier; /* memory classes this access orders against */
};
838
/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   uint16_t offset; /* Vega only */
   bool slc; /* system level coherent */
   bool glc; /* globally coherent */
   bool dlc; /* NAVI: device level coherent */
   bool lds; /* return read-data to LDS instead of VGPRs */
   bool nv;  /* non-volatile */
};
854
struct Export_instruction : public Instruction {
   unsigned enabled_mask; /* which of the four components are written */
   unsigned dest;         /* export target (MRT, position, parameter, ...) */
   bool compressed;       /* data is 16-bit, packed two components per VGPR */
   bool done;             /* this is the shader's final export of its kind */
   bool valid_mask;
};

struct Pseudo_instruction : public Instruction {
   bool tmp_in_scc;      /* a temporary lives in scc across this instruction */
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
};
867
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};

struct Pseudo_barrier_instruction : public Instruction {
};
878
/* Reduction operators for Pseudo_reduction_instruction; each operator comes
 * in a 32-bit and a 64-bit flavor. */
enum ReduceOp {
   iadd32, iadd64,
   imul32, imul64,
   fadd32, fadd64,
   fmul32, fmul64,
   imin32, imin64,
   imax32, imax64,
   umin32, umin64,
   umax32, umax64,
   fmin32, fmin64,
   fmax32, fmax64,
   iand32, iand64,
   ior32, ior64,
   ixor32, ixor64,
   gfx10_wave64_bpermute /* not a reduction: GFX10 wave64 bpermute lowering */
};
895
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   unsigned cluster_size; // must be 0 for scans
};
913
/* Instructions are allocated with calloc() (see create_instruction()), so
 * they are released with free(). Note that no destructor runs: instruction
 * structs must not rely on destructors for cleanup. */
struct instr_deleter_functor {
   void operator()(void* p) {
      free(p);
   }
};

/* owning pointer to a (calloc-allocated) instruction */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
922
/* Allocates an instruction of type T with its operand and definition arrays
 * placed in the same allocation, directly behind the struct. The memory is
 * zero-initialized by calloc, which doubles as field initialization.
 * NOTE(review): the allocation is not checked for NULL. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size);
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* operands start right after T; definitions right after the operands */
   inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
   inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);

   return inst;
}
938
/* Returns true if the instruction uses any VALU modifier: DPP, SDWA, or any
 * VOP3 abs/neg/opsel/clamp/omod bit. */
constexpr bool Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;
   if (!isVOP3())
      return false;
   const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
   for (unsigned i = 0; i < operands.size(); i++) {
      if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
         return true;
   }
   /* opsel[3] is the destination's opsel bit, not covered by the loop */
   return vop3->opsel[3] || vop3->clamp || vop3->omod;
}
952
953 constexpr bool is_phi(Instruction* instr)
954 {
955 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
956 }
957
958 static inline bool is_phi(aco_ptr<Instruction>& instr)
959 {
960 return is_phi(instr.get());
961 }
962
963 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
964 {
965 switch (instr->format) {
966 case Format::SMEM:
967 return static_cast<SMEM_instruction*>(instr)->barrier;
968 case Format::MUBUF:
969 return static_cast<MUBUF_instruction*>(instr)->barrier;
970 case Format::MIMG:
971 return static_cast<MIMG_instruction*>(instr)->barrier;
972 case Format::FLAT:
973 case Format::GLOBAL:
974 return barrier_buffer;
975 case Format::DS:
976 return barrier_shared;
977 default:
978 return barrier_none;
979 }
980 }
981
/* Bitmask values for Block::kind, describing a block's role in the CFG. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1001
1002
1003 struct RegisterDemand {
1004 constexpr RegisterDemand() = default;
1005 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1006 : vgpr{v}, sgpr{s} {}
1007 int16_t vgpr = 0;
1008 int16_t sgpr = 0;
1009
1010 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1011 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1012 }
1013
1014 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1015 return vgpr > other.vgpr || sgpr > other.sgpr;
1016 }
1017
1018 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1019 if (t.type() == RegType::sgpr)
1020 return RegisterDemand( vgpr, sgpr + t.size() );
1021 else
1022 return RegisterDemand( vgpr + t.size(), sgpr );
1023 }
1024
1025 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1026 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1027 }
1028
1029 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1030 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1031 }
1032
1033 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1034 vgpr += other.vgpr;
1035 sgpr += other.sgpr;
1036 return *this;
1037 }
1038
1039 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1040 vgpr -= other.vgpr;
1041 sgpr -= other.sgpr;
1042 return *this;
1043 }
1044
1045 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1046 if (t.type() == RegType::sgpr)
1047 sgpr += t.size();
1048 else
1049 vgpr += t.size();
1050 return *this;
1051 }
1052
1053 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1054 if (t.type() == RegType::sgpr)
1055 sgpr -= t.size();
1056 else
1057 vgpr -= t.size();
1058 return *this;
1059 }
1060
1061 constexpr void update(const RegisterDemand other) noexcept {
1062 vgpr = std::max(vgpr, other.vgpr);
1063 sgpr = std::max(sgpr, other.sgpr);
1064 }
1065
1066 };
1067
/* CFG */
struct Block {
   float_mode fp_mode; /* float behavior assumed by this block's code */
   unsigned index;     /* index of this block within Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* "logical" edges presumably follow the source program's control flow and
    * "linear" edges the lowered, exec-masked control flow -- inferred from
    * naming; confirm against the CFG construction passes */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t kind = 0;     /* mask of block_kind bits */
   int logical_idom = -1; /* immediate dominator block index, -1 if unset */
   int linear_idom = -1;
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
1093
/* A Stage value is a bitmask combining one or more software (API) stages
 * with the single hardware stage they run on; see the named combinations
 * below. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1135
1136 class Program final {
1137 public:
1138 float_mode next_fp_mode;
1139 std::vector<Block> blocks;
1140 RegisterDemand max_reg_demand = RegisterDemand();
1141 uint16_t num_waves = 0;
1142 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1143 ac_shader_config* config;
1144 struct radv_shader_info *info;
1145 enum chip_class chip_class;
1146 enum radeon_family family;
1147 unsigned wave_size;
1148 Stage stage; /* Stage */
1149 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1150 bool needs_wqm = false; /* there exists a p_wqm instruction */
1151 bool wb_smem_l1_on_end = false;
1152
1153 std::vector<uint8_t> constant_data;
1154 Temp private_segment_buffer;
1155 Temp scratch_offset;
1156
1157 uint16_t lds_alloc_granule;
1158 uint32_t lds_limit; /* in bytes */
1159 uint16_t vgpr_limit;
1160 uint16_t sgpr_limit;
1161 uint16_t physical_sgprs;
1162 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1163
1164 bool needs_vcc = false;
1165 bool needs_xnack_mask = false;
1166 bool needs_flat_scr = false;
1167
1168 uint32_t allocateId()
1169 {
1170 assert(allocationID <= 16777215);
1171 return allocationID++;
1172 }
1173
1174 uint32_t peekAllocationId()
1175 {
1176 return allocationID;
1177 }
1178
1179 void setAllocationId(uint32_t id)
1180 {
1181 allocationID = id;
1182 }
1183
1184 Block* create_and_insert_block() {
1185 blocks.emplace_back(blocks.size());
1186 blocks.back().fp_mode = next_fp_mode;
1187 return &blocks.back();
1188 }
1189
1190 Block* insert_block(Block&& block) {
1191 block.index = blocks.size();
1192 block.fp_mode = next_fp_mode;
1193 blocks.emplace_back(std::move(block));
1194 return &blocks.back();
1195 }
1196
1197 private:
1198 uint32_t allocationID = 1;
1199 };
1200
/* Result of live_var_analysis(); outer vectors are indexed by Block::index. */
struct live {
   /* live temps out per block */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1207
/* Instruction selection: builds the ACO IR for the given NIR shaders into
 * `program`. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);

/* Backend passes; each transforms or analyzes a whole Program. Passes taking
 * a `live` consume/update the results of live_var_analysis(). */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
/* Encodes the program into `code`; presumably returns the emitted size — TODO
 * confirm the unit (dwords vs bytes). */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
/* IR consistency checks (see the DEBUG_VALIDATE* flags above) */
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
#ifndef NDEBUG
void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
#else
/* NOTE(review): this no-op macro takes a leading `program` argument that the
 * debug declaration above does not — a call site can only match one of the
 * two; verify and unify the signatures. */
#define perfwarn(program, cond, msg, ...)
#endif

void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get number of sgprs allocated required to address a number of sgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);

/* return number of addressable SGPRs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1255
/* Per-opcode property tables, each indexed by aco_opcode — presumably
 * generated from the opcode definitions at build time; TODO confirm. */
typedef struct {
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];  /* opcode number on GFX9; presumably a sentinel when unsupported — TODO confirm */
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; /* opcode number on GFX10 */
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];           /* mnemonic, e.g. for printing */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];   /* microcode encoding format */
} Info;

/* the single global instance of the tables above (defined in a .cpp) */
extern const Info instr_info;
1266
1267 }
1268
1269 #endif /* ACO_IR_H */
1270