/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;
    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        // swap in the new value only if the compare value matches
        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }
    void execute(T *b) { if (a > *b) *b = a; }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}
    void execute(T *b) { if (a < *b) *b = a; }
};
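
// Illustrative use of the functor pattern above (hypothetical values): the
// memory system applies the functor to the word it holds, e.g.
//
//     uint32_t mem_val = 0xF0;
//     AtomicOpAnd<uint32_t> op(0x3C);
//     op.execute(&mem_val);   // mem_val is now 0x30
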
class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);

    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    // The data to get written
    // Additional data (for atomics)
    // Additional data (for atomics)
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    // The return VGPR type (VT_32 or VT_64)
    // Number of VGPRs accessed (1, 2, or 4)
    // The return VGPR index
    // There can be at most 4 destination registers
    // SIMD unit to which the WF of the memory instruction has been mapped
    // unique id of the WF to which the memory instruction belongs
    // The kernel id of the requesting WF
    // The CU id of the requesting WF
    // HW slot id to which the WF is mapped inside a SIMD unit
    // execution pipeline id where the memory instruction has been scheduled
    // The execution time of this operation
    // The latency of this operation
    // A list of bank conflicts for the 4 cycles

    // The size of the READONLY segment

    // Initiate the specified memory operation by creating a memory request
    // and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back to
    // the register file in the case of a load or atomic return; in the case
    // of a store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);
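
    // Illustrative flow (sketch): after execute() has computed the per-lane
    // addresses, the issuing pipeline calls initiateAcc() to build and send
    // the request(s); once the memory system responds, completeAcc() is
    // called on the same dynamic instruction, e.g. (hypothetical call site):
    //
    //     gpuDynInst->initiateAcc(gpuDynInst);
    //     ...   // responses arrive from the memory system
    //     gpuDynInst->completeAcc(gpuDynInst);
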
    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isBranch() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when a response arrives
    bool useContinuation;
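
    // Illustrative sketch of arming the continuation for a store-release;
    // the lambda body and the issueRelease() helper are hypothetical:
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *si, GPUDynInstPtr gdi) {
    //             issueRelease(si, gdi); // send the separate release request
    //         };
    //
    // ComputeUnit::DataPort::recvTimingResponse would then invoke
    // execContinuation(staticInstruction(), gpuDynInst) once the plain store
    // completes.
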
    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
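
    // Illustrative sketch: for a 32-bit atomic the caller hands the operands
    // read from the register file to makeAtomicOpFunctor and attaches the
    // resulting functor to the memory request (variable names hypothetical):
    //
    //     uint32_t src0 = ...;   // operand / compare value
    //     uint32_t src1 = ...;   // swap value, only used by CAS
    //     AtomicOpFunctor *amo =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&src0, &src1);
    //
    // For CAS, reg0 supplies the compare value and reg1 the swap value; all
    // other operations only read reg0.
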
    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set the atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
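
    // Illustrative call (sketch): a memory pipeline stage building the
    // request for this instruction would do something like
    //
    //     gpuDynInst->setRequestFlags(req);
    //
    // Passing setMemOrder=false is intended to skip the acquire/release
    // flags, e.g. when the ordering is handled by a separate request (see
    // execContinuation above).
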
    // Map returned packets and the addresses they satisfy to the lanes that
    // requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;
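
    // Illustrative sketch (hypothetical values): a coalesced access to
    // address A that serves lanes 3, 4, and 7 would be recorded as
    //
    //     memStatusVector[A] = {3, 4, 7};
    //
    // so that the returned packet for A can be matched back to those lanes.
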
  private:
    GPUStaticInst *_staticInst;
};

#endif // __GPU_DYN_INST_HH__