radv,aco: report ACO errors/warnings back via VK_EXT_debug_report
[mesa.git] / src/amd/compiler/aco_ir.h
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include <vector>
29 #include <set>
30 #include <unordered_set>
31 #include <bitset>
32 #include <memory>
33
34 #include "nir.h"
35 #include "ac_binary.h"
36 #include "amd_family.h"
37 #include "aco_opcodes.h"
38 #include "aco_util.h"
39
40 #include "vulkan/radv_shader.h"
41
42 struct radv_nir_compiler_options;
43 struct radv_shader_args;
44 struct radv_shader_info;
45
46 namespace aco {
47
48 extern uint64_t debug_flags;
49
50 enum {
51 DEBUG_VALIDATE_IR = 0x1,
52 DEBUG_VALIDATE_RA = 0x2,
53 DEBUG_PERFWARN = 0x4,
54 };
55
56 /**
57 * Representation of the instruction's microcode encoding format
58 * Note: Some Vector ALU Formats can be combined, such that:
59 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
60 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
61 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
62 *
63 * (*) The same is applicable for VOP1 and VOPC instructions.
64 */
65 enum class Format : std::uint16_t {
66 /* Pseudo Instruction Format */
67 PSEUDO = 0,
68 /* Scalar ALU & Control Formats */
69 SOP1 = 1,
70 SOP2 = 2,
71 SOPK = 3,
72 SOPP = 4,
73 SOPC = 5,
74 /* Scalar Memory Format */
75 SMEM = 6,
76 /* LDS/GDS Format */
77 DS = 8,
78 /* Vector Memory Buffer Formats */
79 MTBUF = 9,
80 MUBUF = 10,
81 /* Vector Memory Image Format */
82 MIMG = 11,
83 /* Export Format */
84 EXP = 12,
85 /* Flat Formats */
86 FLAT = 13,
87 GLOBAL = 14,
88 SCRATCH = 15,
89
90 PSEUDO_BRANCH = 16,
91 PSEUDO_BARRIER = 17,
92 PSEUDO_REDUCTION = 18,
93
94 /* Vector ALU Formats */
95 VOP3P = 19,
96 VOP1 = 1 << 8,
97 VOP2 = 1 << 9,
98 VOPC = 1 << 10,
99 VOP3 = 1 << 11,
100 VOP3A = 1 << 11,
101 VOP3B = 1 << 11,
102 /* Vector Parameter Interpolation Format */
103 VINTRP = 1 << 12,
104 DPP = 1 << 13,
105 SDWA = 1 << 14,
106 };
107
108 enum storage_class : uint8_t {
109 storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
110 storage_buffer = 0x1, /* SSBOs and global memory */
111 storage_atomic_counter = 0x2, /* not used for Vulkan */
112 storage_image = 0x4,
113 storage_shared = 0x8, /* or TCS output */
114 storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
115 storage_scratch = 0x20,
116 storage_vgpr_spill = 0x40,
117 storage_count = 8,
118 };
119
120 enum memory_semantics : uint8_t {
121 semantic_none = 0x0,
122 /* for loads: don't move any access after this load to before this load (even other loads)
123 * for barriers: don't move any access after the barrier to before any
124 * atomics/control_barriers/sendmsg_gs_done before the barrier */
125 semantic_acquire = 0x1,
126 /* for stores: don't move any access before this store to after this store
127 * for barriers: don't move any access before the barrier to after any
128 * atomics/control_barriers/sendmsg_gs_done after the barrier */
129 semantic_release = 0x2,
130
131 /* the rest are for load/stores/atomics only */
132 /* cannot be DCE'd or CSE'd */
133 semantic_volatile = 0x4,
134 /* does not interact with barriers and assumes this lane is the only lane
135 * accessing this memory */
136 semantic_private = 0x8,
137 /* this operation can be reordered around operations of the same storage. says nothing about barriers */
138 semantic_can_reorder = 0x10,
139 /* this is an atomic instruction (may only read or write memory) */
140 semantic_atomic = 0x20,
141 /* this instruction both reads and writes memory */
142 semantic_rmw = 0x40,
143
144 semantic_acqrel = semantic_acquire | semantic_release,
145 semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
146 };
147
148 enum sync_scope : uint8_t {
149 scope_invocation = 0,
150 scope_subgroup = 1,
151 scope_workgroup = 2,
152 scope_queuefamily = 3,
153 scope_device = 4,
154 };
155
156 struct memory_sync_info {
157 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
158 memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
159 : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
160
161 storage_class storage:8;
162 memory_semantics semantics:8;
163 sync_scope scope:8;
164
165 bool operator == (const memory_sync_info& rhs) const {
166 return storage == rhs.storage &&
167 semantics == rhs.semantics &&
168 scope == rhs.scope;
169 }
170
171 bool can_reorder() const {
172 if (semantics & semantic_acqrel)
173 return false;
174 /* Also check storage so that zero-initialized memory_sync_info can be
175 * reordered. */
176 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
177 }
178 };
179 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
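/* Illustrative sketch (added for this listing, not part of the original
* header): how a pass might build sync info for a device-scope SSBO atomic
* versus a plain reorderable buffer load. The acquire/release semantics make
* can_reorder() return false for the former. */
static inline memory_sync_info example_ssbo_atomic_sync()
{
return memory_sync_info(storage_buffer, semantic_atomicrmw | semantic_acqrel, scope_device);
}
static inline memory_sync_info example_reorderable_load_sync()
{
return memory_sync_info(storage_buffer, semantic_can_reorder);
}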
180
181 enum fp_round {
182 fp_round_ne = 0,
183 fp_round_pi = 1,
184 fp_round_ni = 2,
185 fp_round_tz = 3,
186 };
187
188 enum fp_denorm {
189 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
190 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
191 fp_denorm_flush = 0x0,
192 fp_denorm_keep = 0x3,
193 };
194
195 struct float_mode {
196 /* matches encoding of the MODE register */
197 union {
198 struct {
199 fp_round round32:2;
200 fp_round round16_64:2;
201 unsigned denorm32:2;
202 unsigned denorm16_64:2;
203 };
204 struct {
205 uint8_t round:4;
206 uint8_t denorm:4;
207 };
208 uint8_t val = 0;
209 };
210 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
211 bool preserve_signed_zero_inf_nan32:1;
212 bool preserve_signed_zero_inf_nan16_64:1;
213 /* if false, optimizations which may remove denormal flushing can be done */
214 bool must_flush_denorms32:1;
215 bool must_flush_denorms16_64:1;
216 bool care_about_round32:1;
217 bool care_about_round16_64:1;
218
219 /* Returns true if instructions using the mode "other" can safely use the
220 * current one instead. */
221 bool canReplace(float_mode other) const noexcept {
222 return val == other.val &&
223 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
224 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
225 (must_flush_denorms32 || !other.must_flush_denorms32) &&
226 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
227 (care_about_round32 || !other.care_about_round32) &&
228 (care_about_round16_64 || !other.care_about_round16_64);
229 }
230 };
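/* Illustrative sketch (added for this listing) of the canReplace() rule: a
* stricter mode can stand in for a more relaxed one, but not the reverse. */
static inline bool example_fp_mode_replace()
{
float_mode strict{}, relaxed{};
strict.care_about_round32 = true;
return strict.canReplace(relaxed) && !relaxed.canReplace(strict);
}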
231
232 constexpr Format asVOP3(Format format) {
233 return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format);
234 }
235
236 constexpr Format asSDWA(Format format) {
237 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
238 return (Format) ((uint32_t) Format::SDWA | (uint32_t) format);
239 }
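/* Compile-time illustrations (added for this listing, not part of the
* original header): the Format bits combine as described above, so a VOP2
* instruction promoted to the VOP3A encoding keeps its VOP2 bit, and the
* VOP3 bit can be tested independently. */
static_assert(((uint16_t)asVOP3(Format::VOP2) & (uint16_t)Format::VOP2) == (uint16_t)Format::VOP2,
"asVOP3 keeps the base VOP2 bit");
static_assert(((uint16_t)asVOP3(Format::VOP2) & (uint16_t)Format::VOP3) == (uint16_t)Format::VOP3,
"asVOP3 sets the VOP3 bit");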
240
241 enum class RegType {
242 none = 0,
243 sgpr,
244 vgpr,
245 linear_vgpr,
246 };
247
248 struct RegClass {
249
250 enum RC : uint8_t {
251 s1 = 1,
252 s2 = 2,
253 s3 = 3,
254 s4 = 4,
255 s6 = 6,
256 s8 = 8,
257 s16 = 16,
258 v1 = s1 | (1 << 5),
259 v2 = s2 | (1 << 5),
260 v3 = s3 | (1 << 5),
261 v4 = s4 | (1 << 5),
262 v5 = 5 | (1 << 5),
263 v6 = 6 | (1 << 5),
264 v7 = 7 | (1 << 5),
265 v8 = 8 | (1 << 5),
266 /* byte-sized register class */
267 v1b = v1 | (1 << 7),
268 v2b = v2 | (1 << 7),
269 v3b = v3 | (1 << 7),
270 v4b = v4 | (1 << 7),
271 v6b = v6 | (1 << 7),
272 v8b = v8 | (1 << 7),
273 /* these are used for WWM and spills to vgpr */
274 v1_linear = v1 | (1 << 6),
275 v2_linear = v2 | (1 << 6),
276 };
277
278 RegClass() = default;
279 constexpr RegClass(RC rc)
280 : rc(rc) {}
281 constexpr RegClass(RegType type, unsigned size)
282 : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}
283
284 constexpr operator RC() const { return rc; }
285 explicit operator bool() = delete;
286
287 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
288 constexpr bool is_subdword() const { return rc & (1 << 7); }
289 constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
290 //TODO: use size() less in favor of bytes()
291 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
292 constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
293 constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
294 constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }
295
296 static constexpr RegClass get(RegType type, unsigned bytes) {
297 if (type == RegType::sgpr) {
298 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
299 } else {
300 return bytes % 4u ? RegClass(type, bytes).as_subdword() :
301 RegClass(type, bytes / 4u);
302 }
303 }
304
305 private:
306 RC rc;
307 };
308
309 /* transitional helper expressions */
310 static constexpr RegClass s1{RegClass::s1};
311 static constexpr RegClass s2{RegClass::s2};
312 static constexpr RegClass s3{RegClass::s3};
313 static constexpr RegClass s4{RegClass::s4};
314 static constexpr RegClass s8{RegClass::s8};
315 static constexpr RegClass s16{RegClass::s16};
316 static constexpr RegClass v1{RegClass::v1};
317 static constexpr RegClass v2{RegClass::v2};
318 static constexpr RegClass v3{RegClass::v3};
319 static constexpr RegClass v4{RegClass::v4};
320 static constexpr RegClass v5{RegClass::v5};
321 static constexpr RegClass v6{RegClass::v6};
322 static constexpr RegClass v7{RegClass::v7};
323 static constexpr RegClass v8{RegClass::v8};
324 static constexpr RegClass v1b{RegClass::v1b};
325 static constexpr RegClass v2b{RegClass::v2b};
326 static constexpr RegClass v3b{RegClass::v3b};
327 static constexpr RegClass v4b{RegClass::v4b};
328 static constexpr RegClass v6b{RegClass::v6b};
329 static constexpr RegClass v8b{RegClass::v8b};
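/* Compile-time illustrations (added for this listing) of the encoding
* described above: bit 5 marks VGPRs, bit 6 linear VGPRs and bit 7
* sub-dword classes. */
static_assert(v1.type() == RegType::vgpr && v1.bytes() == 4, "v1 is one 32-bit VGPR");
static_assert(s2.type() == RegType::sgpr && s2.size() == 2, "s2 is an SGPR pair");
static_assert(v2b.is_subdword() && v2b.bytes() == 2, "v2b covers two bytes of a VGPR");
static_assert(v1.as_linear() == RegClass::v1_linear, "bit 6 selects the linear variant");
static_assert(RegClass::get(RegType::vgpr, 6) == v6b, "byte counts not divisible by four yield sub-dword classes");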
330
331 /**
332 * Temp Class
333 * Each temporary virtual register has a
334 * register class (i.e. size and type)
335 * and SSA id.
336 */
337 struct Temp {
338 Temp() noexcept : id_(0), reg_class(0) {}
339 constexpr Temp(uint32_t id, RegClass cls) noexcept
340 : id_(id), reg_class(uint8_t(cls)) {}
341
342 constexpr uint32_t id() const noexcept { return id_; }
343 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
344
345 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
346 constexpr unsigned size() const noexcept { return regClass().size(); }
347 constexpr RegType type() const noexcept { return regClass().type(); }
348 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
349
350 constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); }
351 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
352 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
353
354 private:
355 uint32_t id_: 24;
356 uint32_t reg_class : 8;
357 };
358
359 /**
360 * PhysReg
361 * Represents the physical register for each
362 * Operand and Definition.
363 */
364 struct PhysReg {
365 constexpr PhysReg() = default;
366 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
367 constexpr unsigned reg() const { return reg_b >> 2; }
368 constexpr unsigned byte() const { return reg_b & 0x3; }
369 constexpr operator unsigned() const { return reg(); }
370 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
371 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
372 constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
373 constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; }
374
375 uint16_t reg_b = 0;
376 };
377
378 /* helper expressions for special registers */
379 static constexpr PhysReg m0{124};
380 static constexpr PhysReg vcc{106};
381 static constexpr PhysReg vcc_hi{107};
382 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
383 static constexpr PhysReg exec{126};
384 static constexpr PhysReg exec_lo{126};
385 static constexpr PhysReg exec_hi{127};
386 static constexpr PhysReg vccz{251};
387 static constexpr PhysReg execz{252};
388 static constexpr PhysReg scc{253};
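/* Compile-time illustration (added for this listing): PhysReg stores a byte
* address, so v0 (register 256) advanced by two bytes stays in register 256
* with byte() == 2, as used for sub-dword operands. */
static_assert(PhysReg(256).advance(2).reg() == 256, "advancing within a dword keeps the register");
static_assert(PhysReg(256).advance(2).byte() == 2, "the low two bits hold the byte offset");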
389
390 /**
391 * Operand Class
392 * Initially, each Operand refers to either
393 * a temporary virtual register
394 * or to a constant value
395 * Temporary registers get mapped to physical register during RA
396 * Constant values are inlined into the instruction sequence.
397 */
398 class Operand final
399 {
400 public:
401 constexpr Operand()
402 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false),
403 isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0),
404 isLateKill_(false) {}
405
406 explicit Operand(Temp r) noexcept
407 {
408 data_.temp = r;
409 if (r.id()) {
410 isTemp_ = true;
411 } else {
412 isUndef_ = true;
413 setFixed(PhysReg{128});
414 }
415 };
416 explicit Operand(uint8_t v) noexcept
417 {
418 /* 8-bit constants are only used for copies, and a copy from any 8-bit
419 * constant can be implemented with an SDWA v_mul_u32_u24. So consider all
420 * of them to be inline constants. */
421 data_.i = v;
422 isConstant_ = true;
423 constSize = 0;
424 setFixed(PhysReg{0u});
425 };
426 explicit Operand(uint16_t v) noexcept
427 {
428 data_.i = v;
429 isConstant_ = true;
430 constSize = 1;
431 if (v <= 64)
432 setFixed(PhysReg{128u + v});
433 else if (v >= 0xFFF0) /* [-16 .. -1] */
434 setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
435 else if (v == 0x3800) /* 0.5 */
436 setFixed(PhysReg{240});
437 else if (v == 0xB800) /* -0.5 */
438 setFixed(PhysReg{241});
439 else if (v == 0x3C00) /* 1.0 */
440 setFixed(PhysReg{242});
441 else if (v == 0xBC00) /* -1.0 */
442 setFixed(PhysReg{243});
443 else if (v == 0x4000) /* 2.0 */
444 setFixed(PhysReg{244});
445 else if (v == 0xC000) /* -2.0 */
446 setFixed(PhysReg{245});
447 else if (v == 0x4400) /* 4.0 */
448 setFixed(PhysReg{246});
449 else if (v == 0xC400) /* -4.0 */
450 setFixed(PhysReg{247});
451 else if (v == 0x3118) /* 1/2 PI */
452 setFixed(PhysReg{248});
453 else /* Literal Constant */
454 setFixed(PhysReg{255});
455 };
456 explicit Operand(uint32_t v, bool is64bit = false) noexcept
457 {
458 data_.i = v;
459 isConstant_ = true;
460 constSize = is64bit ? 3 : 2;
461 if (v <= 64)
462 setFixed(PhysReg{128 + v});
463 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
464 setFixed(PhysReg{192 - v});
465 else if (v == 0x3f000000) /* 0.5 */
466 setFixed(PhysReg{240});
467 else if (v == 0xbf000000) /* -0.5 */
468 setFixed(PhysReg{241});
469 else if (v == 0x3f800000) /* 1.0 */
470 setFixed(PhysReg{242});
471 else if (v == 0xbf800000) /* -1.0 */
472 setFixed(PhysReg{243});
473 else if (v == 0x40000000) /* 2.0 */
474 setFixed(PhysReg{244});
475 else if (v == 0xc0000000) /* -2.0 */
476 setFixed(PhysReg{245});
477 else if (v == 0x40800000) /* 4.0 */
478 setFixed(PhysReg{246});
479 else if (v == 0xc0800000) /* -4.0 */
480 setFixed(PhysReg{247});
481 else { /* Literal Constant */
482 assert(!is64bit && "attempt to create a 64-bit literal constant");
483 setFixed(PhysReg{255});
484 }
485 };
486 explicit Operand(uint64_t v) noexcept
487 {
488 isConstant_ = true;
489 constSize = 3;
490 if (v <= 64) {
491 data_.i = (uint32_t) v;
492 setFixed(PhysReg{128 + (uint32_t) v});
493 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
494 data_.i = (uint32_t) v;
495 setFixed(PhysReg{192 - (uint32_t) v});
496 } else if (v == 0x3FE0000000000000) { /* 0.5 */
497 data_.i = 0x3f000000;
498 setFixed(PhysReg{240});
499 } else if (v == 0xBFE0000000000000) { /* -0.5 */
500 data_.i = 0xbf000000;
501 setFixed(PhysReg{241});
502 } else if (v == 0x3FF0000000000000) { /* 1.0 */
503 data_.i = 0x3f800000;
504 setFixed(PhysReg{242});
505 } else if (v == 0xBFF0000000000000) { /* -1.0 */
506 data_.i = 0xbf800000;
507 setFixed(PhysReg{243});
508 } else if (v == 0x4000000000000000) { /* 2.0 */
509 data_.i = 0x40000000;
510 setFixed(PhysReg{244});
511 } else if (v == 0xC000000000000000) { /* -2.0 */
512 data_.i = 0xc0000000;
513 setFixed(PhysReg{245});
514 } else if (v == 0x4010000000000000) { /* 4.0 */
515 data_.i = 0x40800000;
516 setFixed(PhysReg{246});
517 } else if (v == 0xC010000000000000) { /* -4.0 */
518 data_.i = 0xc0800000;
519 setFixed(PhysReg{247});
520 } else { /* Literal Constant: we don't know if it is a long or double.*/
521 isConstant_ = 0;
522 assert(false && "attempt to create a 64-bit literal constant");
523 }
524 };
525 explicit Operand(RegClass type) noexcept
526 {
527 isUndef_ = true;
528 data_.temp = Temp(0, type);
529 setFixed(PhysReg{128});
530 };
531 explicit Operand(PhysReg reg, RegClass type) noexcept
532 {
533 data_.temp = Temp(0, type);
534 setFixed(reg);
535 }
536
537 constexpr bool isTemp() const noexcept
538 {
539 return isTemp_;
540 }
541
542 constexpr void setTemp(Temp t) noexcept {
543 assert(!isConstant_);
544 isTemp_ = true;
545 data_.temp = t;
546 }
547
548 constexpr Temp getTemp() const noexcept
549 {
550 return data_.temp;
551 }
552
553 constexpr uint32_t tempId() const noexcept
554 {
555 return data_.temp.id();
556 }
557
558 constexpr bool hasRegClass() const noexcept
559 {
560 return isTemp() || isUndefined();
561 }
562
563 constexpr RegClass regClass() const noexcept
564 {
565 return data_.temp.regClass();
566 }
567
568 constexpr unsigned bytes() const noexcept
569 {
570 if (isConstant())
571 return 1 << constSize;
572 else
573 return data_.temp.bytes();
574 }
575
576 constexpr unsigned size() const noexcept
577 {
578 if (isConstant())
579 return constSize > 2 ? 2 : 1;
580 else
581 return data_.temp.size();
582 }
583
584 constexpr bool isFixed() const noexcept
585 {
586 return isFixed_;
587 }
588
589 constexpr PhysReg physReg() const noexcept
590 {
591 return reg_;
592 }
593
594 constexpr void setFixed(PhysReg reg) noexcept
595 {
596 isFixed_ = reg != unsigned(-1);
597 reg_ = reg;
598 }
599
600 constexpr bool isConstant() const noexcept
601 {
602 return isConstant_;
603 }
604
605 constexpr bool isLiteral() const noexcept
606 {
607 return isConstant() && reg_ == 255;
608 }
609
610 constexpr bool isUndefined() const noexcept
611 {
612 return isUndef_;
613 }
614
615 constexpr uint32_t constantValue() const noexcept
616 {
617 return data_.i;
618 }
619
620 constexpr bool constantEquals(uint32_t cmp) const noexcept
621 {
622 return isConstant() && constantValue() == cmp;
623 }
624
625 constexpr uint64_t constantValue64(bool signext=false) const noexcept
626 {
627 if (constSize == 3) {
628 if (reg_ <= 192)
629 return reg_ - 128;
630 else if (reg_ <= 208)
631 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
632
633 switch (reg_) {
634 case 240:
635 return 0x3FE0000000000000;
636 case 241:
637 return 0xBFE0000000000000;
638 case 242:
639 return 0x3FF0000000000000;
640 case 243:
641 return 0xBFF0000000000000;
642 case 244:
643 return 0x4000000000000000;
644 case 245:
645 return 0xC000000000000000;
646 case 246:
647 return 0x4010000000000000;
648 case 247:
649 return 0xC010000000000000;
650 }
651 } else if (constSize == 1) {
652 return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i;
653 } else if (constSize == 0) {
654 return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i;
655 }
656 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
657 }
658
659 constexpr bool isOfType(RegType type) const noexcept
660 {
661 return hasRegClass() && regClass().type() == type;
662 }
663
664 /* Indicates that the killed operand's live range intersects with the
665 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
666 * not set by liveness analysis. */
667 constexpr void setLateKill(bool flag) noexcept
668 {
669 isLateKill_ = flag;
670 }
671
672 constexpr bool isLateKill() const noexcept
673 {
674 return isLateKill_;
675 }
676
677 constexpr void setKill(bool flag) noexcept
678 {
679 isKill_ = flag;
680 if (!flag)
681 setFirstKill(false);
682 }
683
684 constexpr bool isKill() const noexcept
685 {
686 return isKill_ || isFirstKill();
687 }
688
689 constexpr void setFirstKill(bool flag) noexcept
690 {
691 isFirstKill_ = flag;
692 if (flag)
693 setKill(flag);
694 }
695
696 /* When there are multiple operands killing the same temporary,
697 * isFirstKill() only returns true for the first one. */
698 constexpr bool isFirstKill() const noexcept
699 {
700 return isFirstKill_;
701 }
702
703 constexpr bool isKillBeforeDef() const noexcept
704 {
705 return isKill() && !isLateKill();
706 }
707
708 constexpr bool isFirstKillBeforeDef() const noexcept
709 {
710 return isFirstKill() && !isLateKill();
711 }
712
713 constexpr bool operator == (Operand other) const noexcept
714 {
715 if (other.size() != size())
716 return false;
717 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
718 return false;
719 if (isFixed() && other.isFixed() && physReg() != other.physReg())
720 return false;
721 if (isLiteral())
722 return other.isLiteral() && other.constantValue() == constantValue();
723 else if (isConstant())
724 return other.isConstant() && other.physReg() == physReg();
725 else if (isUndefined())
726 return other.isUndefined() && other.regClass() == regClass();
727 else
728 return other.isTemp() && other.getTemp() == getTemp();
729 }
730 private:
731 union {
732 uint32_t i;
733 float f;
734 Temp temp = Temp(0, s1);
735 } data_;
736 PhysReg reg_;
737 union {
738 struct {
739 uint8_t isTemp_:1;
740 uint8_t isFixed_:1;
741 uint8_t isConstant_:1;
742 uint8_t isKill_:1;
743 uint8_t isUndef_:1;
744 uint8_t isFirstKill_:1;
745 uint8_t constSize:2;
746 uint8_t isLateKill_:1;
747 };
748 /* can't initialize bit-fields in c++11, so work around using a union */
749 uint16_t control_ = 0;
750 };
751 };
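/* Illustrative sketch (added for this listing): the constant constructors
* above map small integers and common float bit-patterns to inline-constant
* registers and fall back to the literal register 255 otherwise. */
static inline void example_operand_constants()
{
Operand a(4u); /* inline constant 4: fixed to PhysReg{132} */
Operand b(0x3f800000u); /* 1.0f: fixed to PhysReg{242} */
Operand c(123456u); /* no inline encoding: literal, PhysReg{255} */
assert(!a.isLiteral() && !b.isLiteral() && c.isLiteral());
(void)a; (void)b; (void)c;
}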
752
753 /**
754 * Definition Class
755 * Definitions are the results of Instructions
756 * and refer to temporary virtual registers
757 * which are later mapped to physical registers
758 */
759 class Definition final
760 {
761 public:
762 constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
763 isKill_(0), isPrecise_(0), isNUW_(0) {}
764 Definition(uint32_t index, RegClass type) noexcept
765 : temp(index, type) {}
766 explicit Definition(Temp tmp) noexcept
767 : temp(tmp) {}
768 Definition(PhysReg reg, RegClass type) noexcept
769 : temp(Temp(0, type))
770 {
771 setFixed(reg);
772 }
773 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept
774 : temp(Temp(tmpId, type))
775 {
776 setFixed(reg);
777 }
778
779 constexpr bool isTemp() const noexcept
780 {
781 return tempId() > 0;
782 }
783
784 constexpr Temp getTemp() const noexcept
785 {
786 return temp;
787 }
788
789 constexpr uint32_t tempId() const noexcept
790 {
791 return temp.id();
792 }
793
794 constexpr void setTemp(Temp t) noexcept {
795 temp = t;
796 }
797
798 constexpr RegClass regClass() const noexcept
799 {
800 return temp.regClass();
801 }
802
803 constexpr unsigned bytes() const noexcept
804 {
805 return temp.bytes();
806 }
807
808 constexpr unsigned size() const noexcept
809 {
810 return temp.size();
811 }
812
813 constexpr bool isFixed() const noexcept
814 {
815 return isFixed_;
816 }
817
818 constexpr PhysReg physReg() const noexcept
819 {
820 return reg_;
821 }
822
823 constexpr void setFixed(PhysReg reg) noexcept
824 {
825 isFixed_ = 1;
826 reg_ = reg;
827 }
828
829 constexpr void setHint(PhysReg reg) noexcept
830 {
831 hasHint_ = 1;
832 reg_ = reg;
833 }
834
835 constexpr bool hasHint() const noexcept
836 {
837 return hasHint_;
838 }
839
840 constexpr void setKill(bool flag) noexcept
841 {
842 isKill_ = flag;
843 }
844
845 constexpr bool isKill() const noexcept
846 {
847 return isKill_;
848 }
849
850 constexpr void setPrecise(bool precise) noexcept
851 {
852 isPrecise_ = precise;
853 }
854
855 constexpr bool isPrecise() const noexcept
856 {
857 return isPrecise_;
858 }
859
860 /* No Unsigned Wrap */
861 constexpr void setNUW(bool nuw) noexcept
862 {
863 isNUW_ = nuw;
864 }
865
866 constexpr bool isNUW() const noexcept
867 {
868 return isNUW_;
869 }
870
871 private:
872 Temp temp = Temp(0, s1);
873 PhysReg reg_;
874 union {
875 struct {
876 uint8_t isFixed_:1;
877 uint8_t hasHint_:1;
878 uint8_t isKill_:1;
879 uint8_t isPrecise_:1;
880 uint8_t isNUW_:1;
881 };
882 /* can't initialize bit-fields in c++11, so work around using a union */
883 uint8_t control_ = 0;
884 };
885 };
886
887 struct Block;
888
889 struct Instruction {
890 aco_opcode opcode;
891 Format format;
892 uint32_t pass_flags;
893
894 aco::span<Operand> operands;
895 aco::span<Definition> definitions;
896
897 constexpr bool isVALU() const noexcept
898 {
899 return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
900 || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
901 || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
902 || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
903 || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
904 || format == Format::VOP3P;
905 }
906
907 constexpr bool isSALU() const noexcept
908 {
909 return format == Format::SOP1 ||
910 format == Format::SOP2 ||
911 format == Format::SOPC ||
912 format == Format::SOPK ||
913 format == Format::SOPP;
914 }
915
916 constexpr bool isVMEM() const noexcept
917 {
918 return format == Format::MTBUF ||
919 format == Format::MUBUF ||
920 format == Format::MIMG;
921 }
922
923 constexpr bool isDPP() const noexcept
924 {
925 return (uint16_t) format & (uint16_t) Format::DPP;
926 }
927
928 constexpr bool isVOP3() const noexcept
929 {
930 return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
931 ((uint16_t) format & (uint16_t) Format::VOP3B);
932 }
933
934 constexpr bool isSDWA() const noexcept
935 {
936 return (uint16_t) format & (uint16_t) Format::SDWA;
937 }
938
939 constexpr bool isFlatOrGlobal() const noexcept
940 {
941 return format == Format::FLAT || format == Format::GLOBAL;
942 }
943
944 constexpr bool usesModifiers() const noexcept;
945
946 constexpr bool reads_exec() const noexcept
947 {
948 for (const Operand& op : operands) {
949 if (op.isFixed() && op.physReg() == exec)
950 return true;
951 }
952 return false;
953 }
954 };
955 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
956
957 struct SOPK_instruction : public Instruction {
958 uint16_t imm;
959 uint16_t padding;
960 };
961 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
962
963 struct SOPP_instruction : public Instruction {
964 uint32_t imm;
965 int block;
966 };
967 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
968
969 struct SOPC_instruction : public Instruction {
970 };
971 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
972
973 struct SOP1_instruction : public Instruction {
974 };
975 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
976
977 struct SOP2_instruction : public Instruction {
978 };
979 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
980
981 /**
982 * Scalar Memory Format:
983 * For s_(buffer_)load_dword*:
984 * Operand(0): SBASE - SGPR-pair which provides base address
985 * Operand(1): Offset - immediate (un)signed offset or SGPR
986 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
987 * Operand(n-1): SOffset - SGPR offset (Vega only)
988 *
989 * Having no operands is also valid for instructions such as s_dcache_inv.
990 *
991 */
992 struct SMEM_instruction : public Instruction {
993 memory_sync_info sync;
994 bool glc : 1; /* VI+: globally coherent */
995 bool dlc : 1; /* NAVI: device level coherent */
996 bool nv : 1; /* VEGA only: Non-volatile */
997 bool disable_wqm : 1;
998 bool prevent_overflow : 1; /* avoid overflow when combining additions */
999 uint32_t padding: 3;
1000 };
1001 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1002
1003 struct VOP1_instruction : public Instruction {
1004 };
1005 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1006
1007 struct VOP2_instruction : public Instruction {
1008 };
1009 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1010
1011 struct VOPC_instruction : public Instruction {
1012 };
1013 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1014
1015 struct VOP3A_instruction : public Instruction {
1016 bool abs[3];
1017 bool neg[3];
1018 uint8_t opsel : 4;
1019 uint8_t omod : 2;
1020 bool clamp : 1;
1021 uint32_t padding : 9;
1022 };
1023 static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1024
1025 struct VOP3P_instruction : public Instruction {
1026 bool neg_lo[3];
1027 bool neg_hi[3];
1028 uint8_t opsel_lo : 3;
1029 uint8_t opsel_hi : 3;
1030 bool clamp : 1;
1031 uint32_t padding : 9;
1032 };
1033 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1034
1035 /**
1036 * Data Parallel Primitives Format:
1037 * This format can be used for VOP1, VOP2 or VOPC instructions.
1038 * The swizzle applies to the src0 operand.
1039 *
1040 */
1041 struct DPP_instruction : public Instruction {
1042 bool abs[2];
1043 bool neg[2];
1044 uint16_t dpp_ctrl;
1045 uint8_t row_mask : 4;
1046 uint8_t bank_mask : 4;
1047 bool bound_ctrl : 1;
1048 uint32_t padding : 7;
1049 };
1050 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1051
1052 enum sdwa_sel : uint8_t {
1053 /* masks */
1054 sdwa_wordnum = 0x1,
1055 sdwa_bytenum = 0x3,
1056 sdwa_asuint = 0x7 | 0x10,
1057 sdwa_rasize = 0x3,
1058
1059 /* flags */
1060 sdwa_isword = 0x4,
1061 sdwa_sext = 0x8,
1062 sdwa_isra = 0x10,
1063
1064 /* specific values */
1065 sdwa_ubyte0 = 0,
1066 sdwa_ubyte1 = 1,
1067 sdwa_ubyte2 = 2,
1068 sdwa_ubyte3 = 3,
1069 sdwa_uword0 = sdwa_isword | 0,
1070 sdwa_uword1 = sdwa_isword | 1,
1071 sdwa_udword = 6,
1072
1073 sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
1074 sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
1075 sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
1076 sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
1077 sdwa_sword0 = sdwa_uword0 | sdwa_sext,
1078 sdwa_sword1 = sdwa_uword1 | sdwa_sext,
1079 sdwa_sdword = sdwa_udword | sdwa_sext,
1080
1081 /* register-allocated */
1082 sdwa_ubyte = 1 | sdwa_isra,
1083 sdwa_uword = 2 | sdwa_isra,
1084 sdwa_sbyte = sdwa_ubyte | sdwa_sext,
1085 sdwa_sword = sdwa_uword | sdwa_sext,
1086 };
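/* Compile-time illustrations (added for this listing): the selectors compose
* from the masks and flags above, e.g. a signed word selector is the unsigned
* one plus the sign-extension flag. */
static_assert(sdwa_sword1 == (sdwa_isword | sdwa_wordnum | sdwa_sext), "sword1 = word #1 + sext");
static_assert((sdwa_sbyte2 & sdwa_asuint) == sdwa_ubyte2, "masking with sdwa_asuint drops the sext flag");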
1087
1088 /**
1089 * Sub-Dword Addressing Format:
1090 * This format can be used for VOP1, VOP2 or VOPC instructions.
1091 *
1092 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1093 * the definition doesn't have to be VCC on GFX9+.
1094 *
1095 */
1096 struct SDWA_instruction : public Instruction {
1097 /* these destination modifiers aren't available with VOPC except for
1098 * clamp on GFX8 */
1099 uint8_t sel[2];
1100 uint8_t dst_sel;
1101 bool neg[2];
1102 bool abs[2];
1103 bool dst_preserve : 1;
1104 bool clamp : 1;
1105 uint8_t omod : 2; /* GFX9+ */
1106 uint32_t padding : 4;
1107 };
1108 static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1109
1110 struct Interp_instruction : public Instruction {
1111 uint8_t attribute;
1112 uint8_t component;
1113 uint16_t padding;
1114 };
1115 static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1116
1117 /**
1118 * Local and Global Data Sharing instructions
1119 * Operand(0): ADDR - VGPR which supplies the address.
1120 * Operand(1): DATA0 - First data VGPR.
1121 * Operand(2): DATA1 - Second data VGPR.
1122 * Operand(n-1): M0 - LDS size.
1123 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1124 *
1125 */
1126 struct DS_instruction : public Instruction {
1127 memory_sync_info sync;
1128 bool gds;
1129 int16_t offset0;
1130 int8_t offset1;
1131 uint8_t padding;
1132 };
1133 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1134
1135 /**
1136 * Vector Memory Untyped-buffer Instructions
1137 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1138 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1139 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1140 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1141 *
1142 */
1143 struct MUBUF_instruction : public Instruction {
1144 memory_sync_info sync;
1145 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1146 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1147 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1148 bool glc : 1; /* globally coherent */
1149 bool dlc : 1; /* NAVI: device level coherent */
1150 bool slc : 1; /* system level coherent */
1151 bool tfe : 1; /* texture fail enable */
1152 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1153 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1154 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1155 bool swizzled : 1;
1156 uint32_t padding1 : 18;
1157 };
1158 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1159
1160 /**
1161 * Vector Memory Typed-buffer Instructions
1162 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1163 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1164 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1165 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1166 *
1167 */
1168 struct MTBUF_instruction : public Instruction {
1169 memory_sync_info sync;
1170 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1171 uint8_t nfmt : 3; /* Numeric format of data in memory */
1172 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1173 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1174 bool glc : 1; /* globally coherent */
1175 bool dlc : 1; /* NAVI: device level coherent */
1176 bool slc : 1; /* system level coherent */
1177 bool tfe : 1; /* texture fail enable */
1178 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1179 uint32_t padding : 10;
1180 uint16_t offset; /* Unsigned byte offset - 12 bit */
1181 };
1182 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1183
1184 /**
1185 * Vector Memory Image Instructions
1186 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1187 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1188 * or VDATA - Vector GPR for write data.
1189 * Operand(2): VADDR - Address source. Can carry an offset or an index.
1190 * Definition(0): VDATA - Vector GPR for read result.
1191 *
1192 */
1193 struct MIMG_instruction : public Instruction {
1194 memory_sync_info sync;
1195 uint8_t dmask; /* Data VGPR enable mask */
1196 uint8_t dim : 3; /* NAVI: dimensionality */
1197 bool unrm : 1; /* Force address to be un-normalized */
1198 bool dlc : 1; /* NAVI: device level coherent */
1199 bool glc : 1; /* globally coherent */
1200 bool slc : 1; /* system level coherent */
1201 bool tfe : 1; /* texture fail enable */
1202 bool da : 1; /* declare an array */
1203 bool lwe : 1; /* LOD warning enable */
1204 bool r128 : 1; /* NAVI: Texture resource size */
1205 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1206 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1207 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1208 uint32_t padding : 18;
1209 };
1210 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1211
1212 /**
1213 * Flat/Scratch/Global Instructions
1214 * Operand(0): ADDR
1215 * Operand(1): SADDR
1216 * Operand(2) / Definition(0): DATA/VDST
1217 *
1218 */
1219 struct FLAT_instruction : public Instruction {
1220 memory_sync_info sync;
1221 bool slc : 1; /* system level coherent */
1222 bool glc : 1; /* globally coherent */
1223 bool dlc : 1; /* NAVI: device level coherent */
1224 bool lds : 1;
1225 bool nv : 1;
1226 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1227 uint32_t padding0 : 2;
1228 uint16_t offset; /* Vega/Navi only */
1229 uint16_t padding1;
1230 };
1231 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1232
1233 struct Export_instruction : public Instruction {
1234 uint8_t enabled_mask;
1235 uint8_t dest;
1236 bool compressed : 1;
1237 bool done : 1;
1238 bool valid_mask : 1;
1239 uint32_t padding : 13;
1240 };
1241 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1242
1243 struct Pseudo_instruction : public Instruction {
1244 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1245 bool tmp_in_scc;
1246 uint8_t padding;
1247 };
1248 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1249
1250 struct Pseudo_branch_instruction : public Instruction {
1251 /* target[0] is the block index of the branch target.
1252 * For conditional branches, target[1] contains the fall-through alternative.
1253 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1254 */
1255 uint32_t target[2];
1256 };
1257 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1258
1259 struct Pseudo_barrier_instruction : public Instruction {
1260 memory_sync_info sync;
1261 sync_scope exec_scope;
1262 };
1263 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1264
1265 enum ReduceOp : uint16_t {
1266 iadd8, iadd16, iadd32, iadd64,
1267 imul8, imul16, imul32, imul64,
1268 fadd16, fadd32, fadd64,
1269 fmul16, fmul32, fmul64,
1270 imin8, imin16, imin32, imin64,
1271 imax8, imax16, imax32, imax64,
1272 umin8, umin16, umin32, umin64,
1273 umax8, umax16, umax32, umax64,
1274 fmin16, fmin32, fmin64,
1275 fmax16, fmax32, fmax64,
1276 iand8, iand16, iand32, iand64,
1277 ior8, ior16, ior32, ior64,
1278 ixor8, ixor16, ixor32, ixor64,
1279 };
1280
1281 /**
1282 * Subgroup Reduction Instructions: every operand except the data to be
1283 * reduced and the result is inserted by setup_reduce_temp().
1284 * Operand(0): data to be reduced
1285 * Operand(1): reduce temporary
1286 * Operand(2): vector temporary
1287 * Definition(0): result
1288 * Definition(1): scalar temporary
1289 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1290 * Definition(3): scc clobber
1291 * Definition(4): vcc clobber
1292 *
1293 */
1294 struct Pseudo_reduction_instruction : public Instruction {
1295 ReduceOp reduce_op;
1296 uint16_t cluster_size; // must be 0 for scans
1297 };
1298 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1299
1300 struct instr_deleter_functor {
1301 void operator()(void* p) {
1302 free(p);
1303 }
1304 };
1305
1306 template<typename T>
1307 using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1308
1309 template<typename T>
1310 T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
1311 {
1312 std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1313 char *data = (char*) calloc(1, size);
1314 T* inst = (T*) data;
1315
1316 inst->opcode = opcode;
1317 inst->format = format;
1318
1319 uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1320 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1321 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1322 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1323
1324 return inst;
1325 }
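/* Illustrative usage (added for this listing, not part of the original
* header): building a v_add_f32 with two operands and one definition. The
* destination and source temporaries are assumed to come from the caller. */
static inline aco_ptr<Instruction> example_create_v_add(Temp dst, Temp src0, Temp src1)
{
VOP2_instruction *add = create_instruction<VOP2_instruction>(aco_opcode::v_add_f32, Format::VOP2, 2, 1);
add->operands[0] = Operand(src0);
add->operands[1] = Operand(src1);
add->definitions[0] = Definition(dst);
return aco_ptr<Instruction>(add);
}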
1326
1327 constexpr bool Instruction::usesModifiers() const noexcept
1328 {
1329 if (isDPP() || isSDWA())
1330 return true;
1331
1332 if (format == Format::VOP3P) {
1333 const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
1334 for (unsigned i = 0; i < operands.size(); i++) {
1335 if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
1336 return true;
1337 }
1338 return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
1339 } else if (isVOP3()) {
1340 const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
1341 for (unsigned i = 0; i < operands.size(); i++) {
1342 if (vop3->abs[i] || vop3->neg[i])
1343 return true;
1344 }
1345 return vop3->opsel || vop3->clamp || vop3->omod;
1346 }
1347 return false;
1348 }
1349
1350 constexpr bool is_phi(Instruction* instr)
1351 {
1352 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1353 }
1354
1355 static inline bool is_phi(aco_ptr<Instruction>& instr)
1356 {
1357 return is_phi(instr.get());
1358 }
1359
1360 memory_sync_info get_sync_info(const Instruction* instr);
1361
1362 bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
1363
1364 bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1365 bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
1366 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1367 aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1368
1369 enum block_kind {
1370 /* uniform indicates that leaving this block,
1371 * all active lanes stay active */
1372 block_kind_uniform = 1 << 0,
1373 block_kind_top_level = 1 << 1,
1374 block_kind_loop_preheader = 1 << 2,
1375 block_kind_loop_header = 1 << 3,
1376 block_kind_loop_exit = 1 << 4,
1377 block_kind_continue = 1 << 5,
1378 block_kind_break = 1 << 6,
1379 block_kind_continue_or_break = 1 << 7,
1380 block_kind_discard = 1 << 8,
1381 block_kind_branch = 1 << 9,
1382 block_kind_merge = 1 << 10,
1383 block_kind_invert = 1 << 11,
1384 block_kind_uses_discard_if = 1 << 12,
1385 block_kind_needs_lowering = 1 << 13,
1386 block_kind_uses_demote = 1 << 14,
1387 block_kind_export_end = 1 << 15,
1388 };
1389
1390
1391 struct RegisterDemand {
1392 constexpr RegisterDemand() = default;
1393 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept
1394 : vgpr{v}, sgpr{s} {}
1395 int16_t vgpr = 0;
1396 int16_t sgpr = 0;
1397
1398 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept {
1399 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1400 }
1401
1402 constexpr bool exceeds(const RegisterDemand other) const noexcept {
1403 return vgpr > other.vgpr || sgpr > other.sgpr;
1404 }
1405
1406 constexpr RegisterDemand operator+(const Temp t) const noexcept {
1407 if (t.type() == RegType::sgpr)
1408 return RegisterDemand( vgpr, sgpr + t.size() );
1409 else
1410 return RegisterDemand( vgpr + t.size(), sgpr );
1411 }
1412
1413 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept {
1414 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1415 }
1416
1417 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept {
1418 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1419 }
1420
1421 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept {
1422 vgpr += other.vgpr;
1423 sgpr += other.sgpr;
1424 return *this;
1425 }
1426
1427 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept {
1428 vgpr -= other.vgpr;
1429 sgpr -= other.sgpr;
1430 return *this;
1431 }
1432
1433 constexpr RegisterDemand& operator+=(const Temp t) noexcept {
1434 if (t.type() == RegType::sgpr)
1435 sgpr += t.size();
1436 else
1437 vgpr += t.size();
1438 return *this;
1439 }
1440
1441 constexpr RegisterDemand& operator-=(const Temp t) noexcept {
1442 if (t.type() == RegType::sgpr)
1443 sgpr -= t.size();
1444 else
1445 vgpr -= t.size();
1446 return *this;
1447 }
1448
1449 constexpr void update(const RegisterDemand other) noexcept {
1450 vgpr = std::max(vgpr, other.vgpr);
1451 sgpr = std::max(sgpr, other.sgpr);
1452 }
1453
1454 };
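/* Compile-time illustrations (added for this listing): demand arithmetic
* accounts for the register file a temporary lives in, so adding a v2
* temporary raises only the VGPR component. */
static_assert((RegisterDemand(4, 2) + Temp(1, v2)).vgpr == 6, "a v2 temp costs two VGPRs");
static_assert((RegisterDemand(4, 2) + Temp(1, v2)).sgpr == 2, "and no SGPRs");
static_assert(RegisterDemand(4, 2).exceeds(RegisterDemand(4, 1)), "higher SGPR demand exceeds");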
1455
1456 /* CFG */
1457 struct Block {
1458 float_mode fp_mode;
1459 unsigned index;
1460 unsigned offset = 0;
1461 std::vector<aco_ptr<Instruction>> instructions;
1462 std::vector<unsigned> logical_preds;
1463 std::vector<unsigned> linear_preds;
1464 std::vector<unsigned> logical_succs;
1465 std::vector<unsigned> linear_succs;
1466 RegisterDemand register_demand = RegisterDemand();
1467 uint16_t loop_nest_depth = 0;
1468 uint16_t kind = 0;
1469 int logical_idom = -1;
1470 int linear_idom = -1;
1471 Temp live_out_exec = Temp();
1472
1473 /* this information is needed for predecessors to blocks with phis when
1474 * moving out of ssa */
1475 bool scc_live_out = false;
1476 PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */
1477
1478 Block(unsigned idx) : index(idx) {}
1479 Block() : index(0) {}
1480 };
1481
1482 using Stage = uint16_t;
1483
1484 /* software stages */
1485 static constexpr Stage sw_vs = 1 << 0;
1486 static constexpr Stage sw_gs = 1 << 1;
1487 static constexpr Stage sw_tcs = 1 << 2;
1488 static constexpr Stage sw_tes = 1 << 3;
1489 static constexpr Stage sw_fs = 1 << 4;
1490 static constexpr Stage sw_cs = 1 << 5;
1491 static constexpr Stage sw_gs_copy = 1 << 6;
1492 static constexpr Stage sw_mask = 0x7f;
1493
1494 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
1495 static constexpr Stage hw_vs = 1 << 7;
1496 static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1497 static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
1498 static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
1499 static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1500 static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1501 static constexpr Stage hw_fs = 1 << 13;
1502 static constexpr Stage hw_cs = 1 << 14;
1503 static constexpr Stage hw_mask = 0xff << 7;
1504
1505 /* possible settings of Program::stage */
1506 static constexpr Stage vertex_vs = sw_vs | hw_vs;
1507 static constexpr Stage fragment_fs = sw_fs | hw_fs;
1508 static constexpr Stage compute_cs = sw_cs | hw_cs;
1509 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
1510 static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
1511 /* GFX10/NGG */
1512 static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
1513 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
1514 static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
1515 static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
1516 /* GFX9 (and GFX10 if NGG isn't used) */
1517 static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
1518 static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
1519 static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
1520 /* pre-GFX9 */
1521 static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
1522 static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
1523 static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
1524 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before geometry */
1525 static constexpr Stage geometry_gs = sw_gs | hw_gs;
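/* Compile-time illustration (added for this listing): a merged GFX9 stage
* combines both software stages with a single hardware stage bit. */
static_assert((vertex_geometry_gs & sw_mask) == (sw_vs | sw_gs), "merged VS+GS software stages");
static_assert((vertex_geometry_gs & hw_mask) == hw_gs, "runs on the hardware GS stage");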
1526
1527 enum statistic {
1528 statistic_hash,
1529 statistic_instructions,
1530 statistic_copies,
1531 statistic_branches,
1532 statistic_cycles,
1533 statistic_vmem_clauses,
1534 statistic_smem_clauses,
1535 statistic_vmem_score,
1536 statistic_smem_score,
1537 statistic_sgpr_presched,
1538 statistic_vgpr_presched,
1539 num_statistics
1540 };
1541
1542 class Program final {
1543 public:
1544 float_mode next_fp_mode;
1545 std::vector<Block> blocks;
1546 RegisterDemand max_reg_demand = RegisterDemand();
1547 uint16_t num_waves = 0;
1548 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
1549 ac_shader_config* config;
1550 struct radv_shader_info *info;
1551 enum chip_class chip_class;
1552 enum radeon_family family;
1553 unsigned wave_size;
1554 RegClass lane_mask;
1555 Stage stage; /* Stage */
1556 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
1557 bool needs_wqm = false; /* there exists a p_wqm instruction */
1558 bool wb_smem_l1_on_end = false;
1559
1560 std::vector<uint8_t> constant_data;
1561 Temp private_segment_buffer;
1562 Temp scratch_offset;
1563
1564 uint16_t min_waves = 0;
1565 uint16_t lds_alloc_granule;
1566 uint32_t lds_limit; /* in bytes */
1567 bool has_16bank_lds;
1568 uint16_t vgpr_limit;
1569 uint16_t sgpr_limit;
1570 uint16_t physical_sgprs;
1571 uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
1572 uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
1573 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
1574
1575 bool xnack_enabled = false;
1576 bool sram_ecc_enabled = false;
1577 bool has_fast_fma32 = false;
1578
1579 bool needs_vcc = false;
1580 bool needs_flat_scr = false;
1581
1582 bool collect_statistics = false;
1583 uint32_t statistics[num_statistics];
1584
1585 struct {
1586 void (*func)(void *private_data,
1587 enum radv_compiler_debug_level level,
1588 const char *message);
1589 void *private_data;
1590 } debug;
1591
1592 uint32_t allocateId()
1593 {
1594 assert(allocationID <= 16777215);
1595 return allocationID++;
1596 }
1597
1598 uint32_t peekAllocationId()
1599 {
1600 return allocationID;
1601 }
1602
1603 void setAllocationId(uint32_t id)
1604 {
1605 allocationID = id;
1606 }
1607
1608 Block* create_and_insert_block() {
1609 blocks.emplace_back(blocks.size());
1610 blocks.back().fp_mode = next_fp_mode;
1611 return &blocks.back();
1612 }
1613
1614 Block* insert_block(Block&& block) {
1615 block.index = blocks.size();
1616 block.fp_mode = next_fp_mode;
1617 blocks.emplace_back(std::move(block));
1618 return &blocks.back();
1619 }
1620
1621 private:
1622 uint32_t allocationID = 1;
1623 };
1624
1625 struct TempHash {
1626 std::size_t operator()(Temp t) const {
1627 return t.id();
1628 }
1629 };
1630 using TempSet = std::unordered_set<Temp, TempHash>;
1631
1632 struct live {
1633 /* live temps out per block */
1634 std::vector<TempSet> live_out;
1635 /* register demand (sgpr/vgpr) per instruction per block */
1636 std::vector<std::vector<RegisterDemand>> register_demand;
1637 };
1638
1639 void init();
1640
1641 void init_program(Program *program, Stage stage, struct radv_shader_info *info,
1642 enum chip_class chip_class, enum radeon_family family,
1643 ac_shader_config *config);
1644
1645 void select_program(Program *program,
1646 unsigned shader_count,
1647 struct nir_shader *const *shaders,
1648 ac_shader_config* config,
1649 struct radv_shader_args *args);
1650 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
1651 ac_shader_config* config,
1652 struct radv_shader_args *args);
1653
1654 void lower_wqm(Program* program, live& live_vars,
1655 const struct radv_nir_compiler_options *options);
1656 void lower_phis(Program* program);
1657 void calc_min_waves(Program* program);
1658 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
1659 live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
1660 std::vector<uint16_t> dead_code_analysis(Program *program);
1661 void dominator_tree(Program* program);
1662 void insert_exec_mask(Program *program);
1663 void value_numbering(Program* program);
1664 void optimize(Program* program);
1665 void setup_reduce_temp(Program* program);
1666 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1667 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
1668 void ssa_elimination(Program* program);
1669 void lower_to_hw_instr(Program* program);
1670 void schedule_program(Program* program, live& live_vars);
1671 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
1672 void insert_wait_states(Program* program);
1673 void insert_NOPs(Program* program);
1674 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
1675 void print_asm(Program *program, std::vector<uint32_t>& binary,
1676 unsigned exec_size, std::ostream& out);
1677 bool validate_ir(Program* program);
1678 bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
1679 #ifndef NDEBUG
1680 void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
1681 #else
1682 #define perfwarn(program, cond, msg, ...) do {} while(0)
1683 #endif
1684
1685 void collect_presched_stats(Program *program);
1686 void collect_preasm_stats(Program *program);
1687 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
1688
1689 void aco_print_instr(const Instruction *instr, FILE *output);
1690 void aco_print_program(const Program *program, FILE *output);
1691
1692 void _aco_perfwarn(Program *program, const char *file, unsigned line,
1693 const char *fmt, ...);
1694 void _aco_err(Program *program, const char *file, unsigned line,
1695 const char *fmt, ...);
1696
1697 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
1698 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
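/* Illustrative usage (added for this listing): passes report problems through
* these macros, which forward file/line information to Program::debug so that
* RADV can surface the message, e.g. via VK_EXT_debug_report. The condition
* below is a made-up example. */
static inline void example_report_error(Program *program, unsigned num_temps)
{
if (num_temps > 16777215)
aco_err(program, "too many temporaries: %u", num_temps);
}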
1699
1700 /* utilities for dealing with register demand */
1701 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
1702 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
1703 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before);
1704
1705 /* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
1706 uint16_t get_extra_sgprs(Program *program);
1707
1708 /* get the number of sgprs/vgprs that must be allocated to address a number of sgprs/vgprs */
1709 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
1710 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs);
1711
1712 /* return number of addressable sgprs/vgprs for max_waves */
1713 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
1714 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves);
1715
1716 typedef struct {
1717 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
1718 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
1719 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
1720 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
1721 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
1722 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
1723 const char *name[static_cast<int>(aco_opcode::num_opcodes)];
1724 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
1725 /* sizes used for input/output modifiers and constants */
1726 const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
1727 const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
1728 } Info;
1729
1730 extern const Info instr_info;
1731
1732 }
1733
1734 #endif /* ACO_IR_H */
1735