/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */
#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;
template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
    AtomicOpFunctor* clone () { return new AtomicOpAnd(a); }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
    AtomicOpFunctor* clone () { return new AtomicOpOr(a); }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
    AtomicOpFunctor* clone () { return new AtomicOpXor(a); }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;
    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
    AtomicOpFunctor* clone () { return new AtomicOpExch(a); }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
    AtomicOpFunctor* clone () { return new AtomicOpAdd(a); }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
    AtomicOpFunctor* clone () { return new AtomicOpSub(a); }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
    AtomicOpFunctor* clone () { return new AtomicOpInc(); }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() { }
    void execute(T *b) { *b -= 1; }
    AtomicOpFunctor* clone () { return new AtomicOpDec(); }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMax(a); }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMin(a); }
};
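
// Usage sketch (illustration only, not part of the original interface): an
// atomic functor captures its operand at construction time and is later
// applied in place to the target memory value; clone() lets the memory
// system keep its own copy of the functor.
//
//     AtomicOpAdd<uint32_t> add(5);
//     uint32_t mem = 10;
//     add.execute(&mem);                    // mem is now 15
//     AtomicOpFunctor *copy = add.clone();  // independent copy of the op
//     delete copy;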

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;

    // The data to get written
    // Additional data (for atomics)
    // Additional data (for atomics)
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    // The return VGPR type (VT_32 or VT_64)
    // Number of VGPRs accessed (1, 2, or 4)
    // The return VGPR index
    // There can be at most 4 dest regs
    // SIMD unit to which the WF of the memory instruction has been mapped
    // unique id of the WF to which the memory instruction belongs
    // The kernel id of the requesting WF
    // The CU id of the requesting WF
    // HW slot id to which the WF is mapped inside a SIMD unit
    // execution pipeline id where the memory instruction has been scheduled
    // The execution time of this operation
    // The latency of this operation
    // A list of bank conflicts for the 4 cycles.

    // The size of the READONLY segment

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the register file in the case of a load or atomic return; in the
    // case of a store, nothing needs to be written back.
    void completeAcc(GPUDynInstPtr gpuDynInst);
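
    // Illustration only (not the actual call sites): the compute unit's
    // memory pipelines drive these two phases roughly as
    //
    //     inst->initiateAcc(inst);   // build and send the lane requests
    //     // ... responses return from the memory system ...
    //     inst->completeAcc(inst);   // write load/atomic results to the RF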

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isBranch() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;
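
    // A minimal usage sketch (illustration only; not part of the original
    // interface). The issuing code sends the plain store first and defers
    // the release to the continuation, e.g.:
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *staticInst, GPUDynInstPtr dynInst) {
    //             // second phase: build and send the release request here
    //         };
    //
    // and the response path then does, conceptually:
    //
    //     if (dynInst->useContinuation)
    //         dynInst->execContinuation(dynInst->staticInstruction(), dynInst);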

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // currently, the instruction generator only produces atomic-return
        // operations, but a magic instruction can produce atomic no-return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map the addresses satisfied by returned packets to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    // per-lane TLB hit level
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__