src/amd/compiler/aco_ir.h

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #ifndef ACO_IR_H
  26 #define ACO_IR_H
  27
  28 #include <vector>
  29 #include <set>
  30 #include <bitset>
  31 #include <memory>
  32
  33 #include "nir.h"
  34 #include "ac_binary.h"
  35 #include "amd_family.h"
  36 #include "aco_opcodes.h"
  37 #include "aco_util.h"
  38
  39 struct radv_nir_compiler_options;
  40 struct radv_shader_args;
  41 struct radv_shader_info;
  42
  43 namespace aco {
  44
  45 extern uint64_t debug_flags;
  46
  47 enum {
  48    DEBUG_VALIDATE = 0x1,
  49    DEBUG_VALIDATE_RA = 0x2,
  50    DEBUG_PERFWARN = 0x4,
  51 };
  52
  53 /**
  54  * Representation of the instruction's microcode encoding format
  55  * Note: Some Vector ALU Formats can be combined, such that:
  56  * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
  57  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
  58  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
  59  *
  60  * (*) The same is applicable for VOP1 and VOPC instructions.
  61  */
  62 enum class Format : std::uint16_t {
  63    /* Pseudo Instruction Format */
  64    PSEUDO = 0,
  65    /* Scalar ALU & Control Formats */
  66    SOP1 = 1,
  67    SOP2 = 2,
  68    SOPK = 3,
  69    SOPP = 4,
  70    SOPC = 5,
  71    /* Scalar Memory Format */
  72    SMEM = 6,
  73    /* LDS/GDS Format */
  74    DS = 8,
  75    /* Vector Memory Buffer Formats */
  76    MTBUF = 9,
  77    MUBUF = 10,
  78    /* Vector Memory Image Format */
  79    MIMG = 11,
  80    /* Export Format */
  81    EXP = 12,
  82    /* Flat Formats */
  83    FLAT = 13,
  84    GLOBAL = 14,
  85    SCRATCH = 15,
  86
  87    PSEUDO_BRANCH = 16,
  88    PSEUDO_BARRIER = 17,
  89    PSEUDO_REDUCTION = 18,
  90
  91    /* Vector ALU Formats */
  92    VOP1 = 1 << 8,
  93    VOP2 = 1 << 9,
  94    VOPC = 1 << 10,
  95    VOP3 = 1 << 11,
  96    VOP3A = 1 << 11,
  97    VOP3B = 1 << 11,
  98    VOP3P = 1 << 12,
  99    /* Vector Parameter Interpolation Format */
 100    VINTRP = 1 << 13,
 101    DPP = 1 << 14,
 102    SDWA = 1 << 15,
 103 };
 104
 105 enum barrier_interaction {
 106    barrier_none = 0,
 107    barrier_buffer = 0x1,
 108    barrier_image = 0x2,
 109    barrier_atomic = 0x4,
 110    barrier_shared = 0x8,
 111    barrier_count = 4,
 112 };
 113
 114 enum fp_round {
 115    fp_round_ne = 0,
 116    fp_round_pi = 1,
 117    fp_round_ni = 2,
 118    fp_round_tz = 3,
 119 };
 120
 121 enum fp_denorm {
 122    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
 123     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
 124    fp_denorm_flush = 0x0,
 125    fp_denorm_keep = 0x3,
 126 };
 127
 128 struct float_mode {
 129    /* matches encoding of the MODE register */
 130    union {
 131       struct {
 132           fp_round round32:2;
 133           fp_round round16_64:2;
 134           unsigned denorm32:2;
 135           unsigned denorm16_64:2;
 136       };
 137       uint8_t val = 0;
 138    };
 139    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
 140    bool preserve_signed_zero_inf_nan32:1;
 141    bool preserve_signed_zero_inf_nan16_64:1;
 142    /* if false, optimizations which may remove denormal flushing can be done */
 143    bool must_flush_denorms32:1;
 144    bool must_flush_denorms16_64:1;
 145    bool care_about_round32:1;
 146    bool care_about_round16_64:1;
 147
 148    /* Returns true if instructions using the mode "other" can safely use the
 149     * current one instead. */
 150    bool canReplace(float_mode other) const noexcept {
 151       return val == other.val &&
 152              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
 153              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
 154              (must_flush_denorms32  || !other.must_flush_denorms32) &&
 155              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
 156              (care_about_round32 || !other.care_about_round32) &&
 157              (care_about_round16_64 || !other.care_about_round16_64);
 158    }
 159 };
 160
 161 constexpr Format asVOP3(Format format) {
 162    return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
 163 };
 164
 165 enum class RegType {
 166    none = 0,
 167    sgpr,
 168    vgpr,
 169    linear_vgpr,
 170 };
 171
 172 struct RegClass {
 173
 174    enum RC : uint8_t {
 175       s1 = 1,
 176       s2 = 2,
 177       s3 = 3,
 178       s4 = 4,
 179       s6 = 6,
 180       s8 = 8,
 181       s16 = 16,
 182       v1 = s1 | (1 << 5),
 183       v2 = s2 | (1 << 5),
 184       v3 = s3 | (1 << 5),
 185       v4 = s4 | (1 << 5),
 186       v5 = 5  | (1 << 5),
 187       v6 = 6  | (1 << 5),
 188       v7 = 7  | (1 << 5),
 189       v8 = 8  | (1 << 5),
 190       /* these are used for WWM and spills to vgpr */
 191       v1_linear = v1 | (1 << 6),
 192       v2_linear = v2 | (1 << 6),
 193    };
 194
 195    RegClass() = default;
 196    constexpr RegClass(RC rc)
 197       : rc(rc) {}
 198    constexpr RegClass(RegType type, unsigned size)
 199       : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
 200
 201    constexpr operator RC() const { return rc; }
 202    explicit operator bool() = delete;
 203
 204    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
 205    constexpr unsigned size() const { return (unsigned) rc & 0x1F; }
 206    constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
 207    constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
 208
 209 private:
 210    RC rc;
 211 };
 212
 213 /* transitional helper expressions */
 214 static constexpr RegClass s1{RegClass::s1};
 215 static constexpr RegClass s2{RegClass::s2};
 216 static constexpr RegClass s3{RegClass::s3};
 217 static constexpr RegClass s4{RegClass::s4};
 218 static constexpr RegClass s8{RegClass::s8};
 219 static constexpr RegClass s16{RegClass::s16};
 220 static constexpr RegClass v1{RegClass::v1};
 221 static constexpr RegClass v2{RegClass::v2};
 222 static constexpr RegClass v3{RegClass::v3};
 223 static constexpr RegClass v4{RegClass::v4};
 224 static constexpr RegClass v5{RegClass::v5};
 225 static constexpr RegClass v6{RegClass::v6};
 226 static constexpr RegClass v7{RegClass::v7};
 227 static constexpr RegClass v8{RegClass::v8};
 228
 229 /**
 230  * Temp Class
 231  * Each temporary virtual register has a
 232  * register class (i.e. size and type)
 233  * and SSA id.
 234  */
 235 struct Temp {
 236    Temp() = default;
 237    constexpr Temp(uint32_t id, RegClass cls) noexcept
 238       : id_(id), reg_class(cls) {}
 239
 240    constexpr uint32_t id() const noexcept { return id_; }
 241    constexpr RegClass regClass() const noexcept { return reg_class; }
 242
 243    constexpr unsigned size() const noexcept { return reg_class.size(); }
 244    constexpr RegType type() const noexcept { return reg_class.type(); }
 245    constexpr bool is_linear() const noexcept { return reg_class.is_linear(); }
 246
 247    constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
 248    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
 249    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
 250
 251 private:
 252    uint32_t id_:24;
 253    RegClass reg_class;
 254 };
 255
 256 /**
 257  * PhysReg
 258  * Represents the physical register for each
 259  * Operand and Definition.
 260  */
 261 struct PhysReg {
 262    constexpr PhysReg() = default;
 263    explicit constexpr PhysReg(unsigned r) : reg(r) {}
 264    constexpr operator unsigned() const { return reg; }
 265
 266    uint16_t reg = 0;
 267 };
 268
 269 /* helper expressions for special registers */
 270 static constexpr PhysReg m0{124};
 271 static constexpr PhysReg vcc{106};
 272 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
 273 static constexpr PhysReg exec{126};
 274 static constexpr PhysReg exec_lo{126};
 275 static constexpr PhysReg exec_hi{127};
 276 static constexpr PhysReg scc{253};
 277
 278 /**
 279  * Operand Class
 280  * Initially, each Operand refers to either
 281  * a temporary virtual register
 282  * or to a constant value
 283  * Temporary registers get mapped to physical register during RA
 284  * Constant values are inlined into the instruction sequence.
 285  */
 286 class Operand final
 287 {
 288 public:
 289    constexpr Operand()
 290       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
 291         isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {}
 292
 293    explicit Operand(Temp r) noexcept
 294    {
 295       data_.temp = r;
 296       if (r.id()) {
 297          isTemp_ = true;
 298       } else {
 299          isUndef_ = true;
 300          setFixed(PhysReg{128});
 301       }
 302    };
 303    explicit Operand(uint32_t v) noexcept
 304    {
 305       data_.i = v;
 306       isConstant_ = true;
 307       if (v <= 64)
 308          setFixed(PhysReg{128 + v});
 309       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
 310          setFixed(PhysReg{192 - v});
 311       else if (v == 0x3f000000) /* 0.5 */
 312          setFixed(PhysReg{240});
 313       else if (v == 0xbf000000) /* -0.5 */
 314          setFixed(PhysReg{241});
 315       else if (v == 0x3f800000) /* 1.0 */
 316          setFixed(PhysReg{242});
 317       else if (v == 0xbf800000) /* -1.0 */
 318          setFixed(PhysReg{243});
 319       else if (v == 0x40000000) /* 2.0 */
 320          setFixed(PhysReg{244});
 321       else if (v == 0xc0000000) /* -2.0 */
 322          setFixed(PhysReg{245});
 323       else if (v == 0x40800000) /* 4.0 */
 324          setFixed(PhysReg{246});
 325       else if (v == 0xc0800000) /* -4.0 */
 326          setFixed(PhysReg{247});
 327       else /* Literal Constant */
 328          setFixed(PhysReg{255});
 329    };
 330    explicit Operand(uint64_t v) noexcept
 331    {
 332       isConstant_ = true;
 333       is64BitConst_ = true;
 334       if (v <= 64)
 335          setFixed(PhysReg{128 + (uint32_t) v});
 336       else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */
 337          setFixed(PhysReg{192 - (uint32_t) v});
 338       else if (v == 0x3FE0000000000000) /* 0.5 */
 339          setFixed(PhysReg{240});
 340       else if (v == 0xBFE0000000000000) /* -0.5 */
 341          setFixed(PhysReg{241});
 342       else if (v == 0x3FF0000000000000) /* 1.0 */
 343          setFixed(PhysReg{242});
 344       else if (v == 0xBFF0000000000000) /* -1.0 */
 345          setFixed(PhysReg{243});
 346       else if (v == 0x4000000000000000) /* 2.0 */
 347          setFixed(PhysReg{244});
 348       else if (v == 0xC000000000000000) /* -2.0 */
 349          setFixed(PhysReg{245});
 350       else if (v == 0x4010000000000000) /* 4.0 */
 351          setFixed(PhysReg{246});
 352       else if (v == 0xC010000000000000) /* -4.0 */
 353          setFixed(PhysReg{247});
 354       else { /* Literal Constant: we don't know if it is a long or double.*/
 355          isConstant_ = 0;
 356          assert(false && "attempt to create a 64-bit literal constant");
 357       }
 358    };
 359    explicit Operand(RegClass type) noexcept
 360    {
 361       isUndef_ = true;
 362       data_.temp = Temp(0, type);
 363       setFixed(PhysReg{128});
 364    };
 365    explicit Operand(PhysReg reg, RegClass type) noexcept
 366    {
 367       data_.temp = Temp(0, type);
 368       setFixed(reg);
 369    }
 370
 371    constexpr bool isTemp() const noexcept
 372    {
 373       return isTemp_;
 374    }
 375
 376    constexpr void setTemp(Temp t) noexcept {
 377       assert(!isConstant_);
 378       isTemp_ = true;
 379       data_.temp = t;
 380    }
 381
 382    constexpr Temp getTemp() const noexcept
 383    {
 384       return data_.temp;
 385    }
 386
 387    constexpr uint32_t tempId() const noexcept
 388    {
 389       return data_.temp.id();
 390    }
 391
 392    constexpr bool hasRegClass() const noexcept
 393    {
 394       return isTemp() || isUndefined();
 395    }
 396
 397    constexpr RegClass regClass() const noexcept
 398    {
 399       return data_.temp.regClass();
 400    }
 401
 402    constexpr unsigned size() const noexcept
 403    {
 404       if (isConstant())
 405          return is64BitConst_ ? 2 : 1;
 406       else
 407          return data_.temp.size();
 408    }
 409
 410    constexpr bool isFixed() const noexcept
 411    {
 412       return isFixed_;
 413    }
 414
 415    constexpr PhysReg physReg() const noexcept
 416    {
 417       return reg_;
 418    }
 419
 420    constexpr void setFixed(PhysReg reg) noexcept
 421    {
 422       isFixed_ = reg != unsigned(-1);
 423       reg_ = reg;
 424    }
 425
 426    constexpr bool isConstant() const noexcept
 427    {
 428       return isConstant_;
 429    }
 430
 431    constexpr bool isLiteral() const noexcept
 432    {
 433       return isConstant() && reg_ == 255;
 434    }
 435
 436    constexpr bool isUndefined() const noexcept
 437    {
 438       return isUndef_;
 439    }
 440
 441    constexpr uint32_t constantValue() const noexcept
 442    {
 443       return data_.i;
 444    }
 445
 446    constexpr bool constantEquals(uint32_t cmp) const noexcept
 447    {
 448       return isConstant() && constantValue() == cmp;
 449    }
 450
 451    constexpr void setKill(bool flag) noexcept
 452    {
 453       isKill_ = flag;
 454       if (!flag)
 455          setFirstKill(false);
 456    }
 457
 458    constexpr bool isKill() const noexcept
 459    {
 460       return isKill_ || isFirstKill();
 461    }
 462
 463    constexpr void setFirstKill(bool flag) noexcept
 464    {
 465       isFirstKill_ = flag;
 466       if (flag)
 467          setKill(flag);
 468    }
 469
 470    /* When there are multiple operands killing the same temporary,
 471     * isFirstKill() is only returns true for the first one. */
 472    constexpr bool isFirstKill() const noexcept
 473    {
 474       return isFirstKill_;
 475    }
 476
 477 private:
 478    union {
 479       uint32_t i;
 480       float f;
 481       Temp temp = Temp(0, s1);
 482    } data_;
 483    PhysReg reg_;
 484    union {
 485       struct {
 486          uint8_t isTemp_:1;
 487          uint8_t isFixed_:1;
 488          uint8_t isConstant_:1;
 489          uint8_t isKill_:1;
 490          uint8_t isUndef_:1;
 491          uint8_t isFirstKill_:1;
 492          uint8_t is64BitConst_:1;
 493       };
 494       /* can't initialize bit-fields in c++11, so work around using a union */
 495       uint8_t control_ = 0;
 496    };
 497 };
 498
 499 /**
 500  * Definition Class
 501  * Definitions are the results of Instructions
 502  * and refer to temporary virtual registers
 503  * which are later mapped to physical registers
 504  */
 505 class Definition final
 506 {
 507 public:
 508    constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {}
 509    Definition(uint32_t index, RegClass type) noexcept
 510       : temp(index, type) {}
 511    explicit Definition(Temp tmp) noexcept
 512       : temp(tmp) {}
 513    Definition(PhysReg reg, RegClass type) noexcept
 514       : temp(Temp(0, type))
 515    {
 516       setFixed(reg);
 517    }
 518    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
 519       : temp(Temp(tmpId, type))
 520    {
 521       setFixed(reg);
 522    }
 523
 524    constexpr bool isTemp() const noexcept
 525    {
 526       return tempId() > 0;
 527    }
 528
 529    constexpr Temp getTemp() const noexcept
 530    {
 531       return temp;
 532    }
 533
 534    constexpr uint32_t tempId() const noexcept
 535    {
 536       return temp.id();
 537    }
 538
 539    constexpr void setTemp(Temp t) noexcept {
 540       temp = t;
 541    }
 542
 543    constexpr RegClass regClass() const noexcept
 544    {
 545       return temp.regClass();
 546    }
 547
 548    constexpr unsigned size() const noexcept
 549    {
 550       return temp.size();
 551    }
 552
 553    constexpr bool isFixed() const noexcept
 554    {
 555       return isFixed_;
 556    }
 557
 558    constexpr PhysReg physReg() const noexcept
 559    {
 560       return reg_;
 561    }
 562
 563    constexpr void setFixed(PhysReg reg) noexcept
 564    {
 565       isFixed_ = 1;
 566       reg_ = reg;
 567    }
 568
 569    constexpr void setHint(PhysReg reg) noexcept
 570    {
 571       hasHint_ = 1;
 572       reg_ = reg;
 573    }
 574
 575    constexpr bool hasHint() const noexcept
 576    {
 577       return hasHint_;
 578    }
 579
 580    constexpr void setKill(bool flag) noexcept
 581    {
 582       isKill_ = flag;
 583    }
 584
 585    constexpr bool isKill() const noexcept
 586    {
 587       return isKill_;
 588    }
 589
 590 private:
 591    Temp temp = Temp(0, s1);
 592    PhysReg reg_;
 593    union {
 594       struct {
 595          uint8_t isFixed_:1;
 596          uint8_t hasHint_:1;
 597          uint8_t isKill_:1;
 598       };
 599       /* can't initialize bit-fields in c++11, so work around using a union */
 600       uint8_t control_ = 0;
 601    };
 602 };
 603
 604 class Block;
 605
 606 struct Instruction {
 607    aco_opcode opcode;
 608    Format format;
 609    uint32_t pass_flags;
 610
 611    aco::span<Operand> operands;
 612    aco::span<Definition> definitions;
 613
 614    constexpr bool isVALU() const noexcept
 615    {
 616       return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
 617           || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
 618           || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
 619           || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
 620           || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
 621           || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P;
 622    }
 623
 624    constexpr bool isSALU() const noexcept
 625    {
 626       return format == Format::SOP1 ||
 627              format == Format::SOP2 ||
 628              format == Format::SOPC ||
 629              format == Format::SOPK ||
 630              format == Format::SOPP;
 631    }
 632
 633    constexpr bool isVMEM() const noexcept
 634    {
 635       return format == Format::MTBUF ||
 636              format == Format::MUBUF ||
 637              format == Format::MIMG;
 638    }
 639
 640    constexpr bool isDPP() const noexcept
 641    {
 642       return (uint16_t) format & (uint16_t) Format::DPP;
 643    }
 644
 645    constexpr bool isVOP3() const noexcept
 646    {
 647       return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
 648              ((uint16_t) format & (uint16_t) Format::VOP3B) ||
 649              format == Format::VOP3P;
 650    }
 651
 652    constexpr bool isSDWA() const noexcept
 653    {
 654       return (uint16_t) format & (uint16_t) Format::SDWA;
 655    }
 656
 657    constexpr bool isFlatOrGlobal() const noexcept
 658    {
 659       return format == Format::FLAT || format == Format::GLOBAL;
 660    }
 661
 662    constexpr bool usesModifiers() const noexcept;
 663
 664    constexpr bool reads_exec() const noexcept
 665    {
 666       for (const Operand& op : operands) {
 667          if (op.isFixed() && op.physReg() == exec)
 668             return true;
 669       }
 670       return false;
 671    }
 672 };
 673
 674 struct SOPK_instruction : public Instruction {
 675    uint16_t imm;
 676 };
 677
 678 struct SOPP_instruction : public Instruction {
 679    uint32_t imm;
 680    int block;
 681 };
 682
 683 struct SOPC_instruction : public Instruction {
 684 };
 685
 686 struct SOP1_instruction : public Instruction {
 687 };
 688
 689 struct SOP2_instruction : public Instruction {
 690 };
 691
 692 /**
 693  * Scalar Memory Format:
 694  * For s_(buffer_)load_dword*:
 695  * Operand(0): SBASE - SGPR-pair which provides base address
 696  * Operand(1): Offset - immediate (un)signed offset or SGPR
 697  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 698  * Operand(n-1): SOffset - SGPR offset (Vega only)
 699  *
 700  * Having no operands is also valid for instructions such as s_dcache_inv.
 701  *
 702  */
 703 struct SMEM_instruction : public Instruction {
 704    bool glc; /* VI+: globally coherent */
 705    bool dlc; /* NAVI: device level coherent */
 706    bool nv; /* VEGA only: Non-volatile */
 707    bool can_reorder;
 708    bool disable_wqm;
 709    barrier_interaction barrier;
 710 };
 711
 712 struct VOP1_instruction : public Instruction {
 713 };
 714
 715 struct VOP2_instruction : public Instruction {
 716 };
 717
 718 struct VOPC_instruction : public Instruction {
 719 };
 720
 721 struct VOP3A_instruction : public Instruction {
 722    bool abs[3];
 723    bool opsel[4];
 724    bool clamp;
 725    unsigned omod;
 726    bool neg[3];
 727 };
 728
 729 /**
 730  * Data Parallel Primitives Format:
 731  * This format can be used for VOP1, VOP2 or VOPC instructions.
 732  * The swizzle applies to the src0 operand.
 733  *
 734  */
 735 struct DPP_instruction : public Instruction {
 736    uint16_t dpp_ctrl;
 737    uint8_t row_mask;
 738    uint8_t bank_mask;
 739    bool abs[2];
 740    bool neg[2];
 741    bool bound_ctrl;
 742 };
 743
 744 struct Interp_instruction : public Instruction {
 745    unsigned attribute;
 746    unsigned component;
 747 };
 748
 749 /**
 750  * Local and Global Data Sharing instructions
 751  * Operand(0): ADDR - VGPR which supplies the address.
 752  * Operand(1): DATA0 - First data VGPR.
 753  * Operand(2): DATA1 - Second data VGPR.
 754  * Operand(n-1): M0 - LDS size.
 755  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 756  *
 757  */
 758 struct DS_instruction : public Instruction {
 759    int16_t offset0;
 760    int8_t offset1;
 761    bool gds;
 762 };
 763
 764 /**
 765  * Vector Memory Untyped-buffer Instructions
 766  * Operand(0): VADDR - Address source. Can carry an index and/or offset
 767  * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 768  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 769  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 770  *
 771  */
 772 struct MUBUF_instruction : public Instruction {
 773    unsigned offset; /* Unsigned byte offset - 12 bit */
 774    bool offen; /* Supply an offset from VGPR (VADDR) */
 775    bool idxen; /* Supply an index from VGPR (VADDR) */
 776    bool glc; /* globally coherent */
 777    bool dlc; /* NAVI: device level coherent */
 778    bool slc; /* system level coherent */
 779    bool tfe; /* texture fail enable */
 780    bool lds; /* Return read-data to LDS instead of VGPRs */
 781    bool disable_wqm; /* Require an exec mask without helper invocations */
 782    bool can_reorder;
 783    barrier_interaction barrier;
 784 };
 785
 786 /**
 787  * Vector Memory Typed-buffer Instructions
 788  * Operand(0): VADDR - Address source. Can carry an index and/or offset
 789  * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant)
 790  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 791  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 792  *
 793  */
 794 struct MTBUF_instruction : public Instruction {
 795    uint8_t dfmt : 4; /* Data Format of data in memory buffer */
 796    uint8_t nfmt : 3; /* Numeric format of data in memory */
 797    unsigned offset; /* Unsigned byte offset - 12 bit */
 798    bool offen; /* Supply an offset from VGPR (VADDR) */
 799    bool idxen; /* Supply an index from VGPR (VADDR) */
 800    bool glc; /* globally coherent */
 801    bool dlc; /* NAVI: device level coherent */
 802    bool slc; /* system level coherent */
 803    bool tfe; /* texture fail enable */
 804    bool disable_wqm; /* Require an exec mask without helper invocations */
 805    bool can_reorder;
 806    barrier_interaction barrier;
 807 };
 808
 809 /**
 810  * Vector Memory Image Instructions
 811  * Operand(0): VADDR - Address source. Can carry an offset or an index.
 812  * Operand(1): SRSRC - Scalar GPR that specifies the resource constant.
 813  * Operand(2): SSAMP - Scalar GPR that specifies sampler constant.
 814  * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result.
 815  *
 816  */
 817 struct MIMG_instruction : public Instruction {
 818    unsigned dmask; /* Data VGPR enable mask */
 819    unsigned dim; /* NAVI: dimensionality */
 820    bool unrm; /* Force address to be un-normalized */
 821    bool dlc; /* NAVI: device level coherent */
 822    bool glc; /* globally coherent */
 823    bool slc; /* system level coherent */
 824    bool tfe; /* texture fail enable */
 825    bool da; /* declare an array */
 826    bool lwe; /* Force data to be un-normalized */
 827    bool r128; /* NAVI: Texture resource size */
 828    bool a16; /* VEGA, NAVI: Address components are 16-bits */
 829    bool d16; /* Convert 32-bit data to 16-bit data */
 830    bool disable_wqm; /* Require an exec mask without helper invocations */
 831    bool can_reorder;
 832    barrier_interaction barrier;
 833 };
 834
 835 /**
 836  * Flat/Scratch/Global Instructions
 837  * Operand(0): ADDR
 838  * Operand(1): SADDR
 839  * Operand(2) / Definition(0): DATA/VDST
 840  *
 841  */
 842 struct FLAT_instruction : public Instruction {
 843    uint16_t offset; /* Vega/Navi only */
 844    bool slc; /* system level coherent */
 845    bool glc; /* globally coherent */
 846    bool dlc; /* NAVI: device level coherent */
 847    bool lds;
 848    bool nv;
 849    bool disable_wqm; /* Require an exec mask without helper invocations */
 850    bool can_reorder;
 851    barrier_interaction barrier;
 852 };
 853
 854 struct Export_instruction : public Instruction {
 855    unsigned enabled_mask;
 856    unsigned dest;
 857    bool compressed;
 858    bool done;
 859    bool valid_mask;
 860 };
 861
 862 struct Pseudo_instruction : public Instruction {
 863    bool tmp_in_scc;
 864    PhysReg scratch_sgpr; /* might not be valid if it's not needed */
 865 };
 866
 867 struct Pseudo_branch_instruction : public Instruction {
 868    /* target[0] is the block index of the branch target.
 869     * For conditional branches, target[1] contains the fall-through alternative.
 870     * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
 871     */
 872    uint32_t target[2];
 873 };
 874
 875 struct Pseudo_barrier_instruction : public Instruction {
 876 };
 877
 878 enum ReduceOp {
 879    iadd32, iadd64,
 880    imul32, imul64,
 881    fadd32, fadd64,
 882    fmul32, fmul64,
 883    imin32, imin64,
 884    imax32, imax64,
 885    umin32, umin64,
 886    umax32, umax64,
 887    fmin32, fmin64,
 888    fmax32, fmax64,
 889    iand32, iand64,
 890    ior32, ior64,
 891    ixor32, ixor64,
 892    gfx10_wave64_bpermute
 893 };
 894
 895 /**
 896  * Subgroup Reduction Instructions, everything except for the data to be
 897  * reduced and the result as inserted by setup_reduce_temp().
 898  * Operand(0): data to be reduced
 899  * Operand(1): reduce temporary
 900  * Operand(2): vector temporary
 901  * Definition(0): result
 902  * Definition(1): scalar temporary
 903  * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 904  * Definition(3): scc clobber
 905  * Definition(4): vcc clobber
 906  *
 907  */
 908 struct Pseudo_reduction_instruction : public Instruction {
 909    ReduceOp reduce_op;
 910    unsigned cluster_size; // must be 0 for scans
 911 };
 912
 913 struct instr_deleter_functor {
 914    void operator()(void* p) {
 915       free(p);
 916    }
 917 };
 918
 919 template<typename T>
 920 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
 921
 922 template<typename T>
 923 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
 924 {
 925    std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
 926    char *data = (char*) calloc(1, size);
 927    T* inst = (T*) data;
 928
 929    inst->opcode = opcode;
 930    inst->format = format;
 931
 932    inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands);
 933    inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions);
 934
 935    return inst;
 936 }
 937
 938 constexpr bool Instruction::usesModifiers() const noexcept
 939 {
 940    if (isDPP() || isSDWA())
 941       return true;
 942    if (!isVOP3())
 943       return false;
 944    const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
 945    for (unsigned i = 0; i < operands.size(); i++) {
 946       if (vop3->abs[i] || vop3->opsel[i] || vop3->neg[i])
 947          return true;
 948    }
 949    return vop3->opsel[3] || vop3->clamp || vop3->omod;
 950 }
 951
 952 constexpr bool is_phi(Instruction* instr)
 953 {
 954    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
 955 }
 956
 957 static inline bool is_phi(aco_ptr<Instruction>& instr)
 958 {
 959    return is_phi(instr.get());
 960 }
 961
 962 constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
 963 {
 964    switch (instr->format) {
 965    case Format::SMEM:
 966       return static_cast<SMEM_instruction*>(instr)->barrier;
 967    case Format::MUBUF:
 968       return static_cast<MUBUF_instruction*>(instr)->barrier;
 969    case Format::MIMG:
 970       return static_cast<MIMG_instruction*>(instr)->barrier;
 971    case Format::FLAT:
 972    case Format::GLOBAL:
 973    case Format::SCRATCH:
 974       return static_cast<FLAT_instruction*>(instr)->barrier;
 975    case Format::DS:
 976       return barrier_shared;
 977    default:
 978       return barrier_none;
 979    }
 980 }
 981
/* Flags classifying a block. Every enumerator is a distinct single bit, so
 * several of them can be OR'd together; the highest bit in use is bit 14,
 * which means any combination fits into 16 bits (presumably this is what
 * Block::kind holds -- confirm against the users of that field). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
};
1001
1002
1003 struct RegisterDemand {
1004    constexpr RegisterDemand() = default;
1005    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1006       : vgpr{v}, sgpr{s} {}
1007    int16_t vgpr = 0;
1008    int16_t sgpr = 0;
1009
1010    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1011       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1012    }
1013
1014    constexpr bool exceeds(const RegisterDemand other) const noexcept {
1015       return vgpr > other.vgpr || sgpr > other.sgpr;
1016    }
1017
1018    constexpr RegisterDemand operator+(const Temp t) const noexcept {
1019       if (t.type() == RegType::sgpr)
1020          return RegisterDemand( vgpr, sgpr + t.size() );
1021       else
1022          return RegisterDemand( vgpr + t.size(), sgpr );
1023    }
1024
1025    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1026       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1027    }
1028
1029    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1030       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1031    }
1032
1033    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1034       vgpr += other.vgpr;
1035       sgpr += other.sgpr;
1036       return *this;
1037    }
1038
1039    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1040       vgpr -= other.vgpr;
1041       sgpr -= other.sgpr;
1042       return *this;
1043    }
1044
1045    constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1046       if (t.type() == RegType::sgpr)
1047          sgpr += t.size();
1048       else
1049          vgpr += t.size();
1050       return *this;
1051    }
1052
1053    constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1054       if (t.type() == RegType::sgpr)
1055          sgpr -= t.size();
1056       else
1057          vgpr -= t.size();
1058       return *this;
1059    }
1060
1061    constexpr void update(const RegisterDemand other) noexcept {
1062       vgpr = std::max(vgpr, other.vgpr);
1063       sgpr = std::max(sgpr, other.sgpr);
1064    }
1065
1066 };
1067
1068 /* CFG */
1069 struct Block {
1070    float_mode fp_mode;
1071    unsigned index;
1072    unsigned offset = 0;
1073    std::vector<aco_ptr<Instruction>> instructions;
1074    std::vector<unsigned> logical_preds;
1075    std::vector<unsigned> linear_preds;
1076    std::vector<unsigned> logical_succs;
1077    std::vector<unsigned> linear_succs;
1078    RegisterDemand register_demand = RegisterDemand();
1079    uint16_t loop_nest_depth = 0;
1080    uint16_t kind = 0;
1081    int logical_idom = -1;
1082    int linear_idom = -1;
1083    Temp live_out_exec = Temp();
1084
1085    /* this information is needed for predecessors to blocks with phis when
1086     * moving out of ssa */
1087    bool scc_live_out = false;
1088    PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1089
1090    Block(unsigned idx) : index(idx) {}
1091    Block() : index(0) {}
1092 };
1093
/* A Stage packs two pieces of information into 16 bits:
 *  - bits 0..5  (sw_mask): the set of API ("software") shader stages that
 *    were merged into the program; several of these bits may be set,
 *  - bits 6..12 (hw_mask): the hardware stage the program runs as; exactly
 *    one of these bits is set in a valid Stage.
 */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
/* Runs on the ES hardware stage, exactly like vertex_es: a stage that comes
 * "before geometry" feeds the GS, it is not the GS itself. */
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
1135
1136 class Program final {
1137 public:
1138    float_mode next_fp_mode;
1139    std::vector<Block> blocks;
1140    RegisterDemand max_reg_demand = RegisterDemand();
1141    uint16_t num_waves = 0;
1142    uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1143    ac_shader_config* config;
1144    struct radv_shader_info *info;
1145    enum chip_class chip_class;
1146    enum radeon_family family;
1147    unsigned wave_size;
1148    RegClass lane_mask;
1149    Stage stage; /* Stage */
1150    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1151    bool needs_wqm = false; /* there exists a p_wqm instruction */
1152    bool wb_smem_l1_on_end = false;
1153
1154    std::vector<uint8_t> constant_data;
1155    Temp private_segment_buffer;
1156    Temp scratch_offset;
1157
1158    uint16_t lds_alloc_granule;
1159    uint32_t lds_limit; /* in bytes */
1160    uint16_t vgpr_limit;
1161    uint16_t sgpr_limit;
1162    uint16_t physical_sgprs;
1163    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1164    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1165
1166    bool needs_vcc = false;
1167    bool needs_xnack_mask = false;
1168    bool needs_flat_scr = false;
1169
1170    uint32_t allocateId()
1171    {
1172       assert(allocationID <= 16777215);
1173       return allocationID++;
1174    }
1175
1176    uint32_t peekAllocationId()
1177    {
1178       return allocationID;
1179    }
1180
1181    void setAllocationId(uint32_t id)
1182    {
1183       allocationID = id;
1184    }
1185
1186    Block* create_and_insert_block() {
1187       blocks.emplace_back(blocks.size());
1188       blocks.back().fp_mode = next_fp_mode;
1189       return &blocks.back();
1190    }
1191
1192    Block* insert_block(Block&& block) {
1193       block.index = blocks.size();
1194       block.fp_mode = next_fp_mode;
1195       blocks.emplace_back(std::move(block));
1196       return &blocks.back();
1197    }
1198
1199 private:
1200    uint32_t allocationID = 1;
1201 };
1202
/* Bundle of per-block results; this is the type returned by
 * live_var_analysis() and passed to several passes declared below. */
struct live {
   /* live temps out per block (one set per block; presumably indexed by
    * Block::index -- confirm against the analysis) */
   std::vector<std::set<Temp>> live_out;
   /* register demand (sgpr/vgpr) per instruction per block:
    * outer vector = blocks, inner vector = instructions of that block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
1209
/* Functions that work on a whole Program. They are only declared in this
 * header; the definitions live in other source files.
 *
 * NOTE(review): register_allocation() takes live_out_per_block by value, so
 * every call copies the vector of sets unless the caller moves it in. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);

void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
void validate(Program* program, FILE *output);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output);
1240 #ifndef NDEBUG
1241 void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
1242 #else
1243 #define perfwarn(program, cond, msg, ...) do {} while(0)
1244 #endif
1245
/* write a single instruction / a whole program to output
 * (by name these are the text printers; the definitions are elsewhere) */
void aco_print_instr(Instruction *instr, FILE *output);
void aco_print_program(Program *program, FILE *output);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program *program);

/* get the number of sgprs/vgprs that must be allocated in order to address
 * the given number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1259
1260 typedef struct {
1261    const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1262    const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1263    const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1264    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1265    const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1266    const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1267    const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1268 } Info;
1269
1270 extern const Info instr_info;
1271
1272 }
1273
1274 #endif /* ACO_IR_H */
1275