src/gpu-compute/gpu_dyn_inst.hh
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/amo.hh"
#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

// Compare-and-swap functor: if the value in memory equals c, replace it
// with s; otherwise leave memory unchanged. Also updates the compute
// unit's CAS statistics.
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};
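
// Illustrative sketch (not gem5 code): how the CAS functor behaves when
// applied to a memory word. 'cu' stands for a ComputeUnit pointer and is
// only needed here for the statistics updates in execute().
//
//     uint32_t mem = 5;
//     AtomicOpCAS<uint32_t> cas(5, 9, cu);
//     cas.execute(&mem);   // compare hits: mem becomes 9
//     cas.execute(&mem);   // compare misses: mem stays 9, failure counted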

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to be written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction is mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting WF
    int kern_id;
    // The CU id of the requesting WF
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a memory
    // request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of
    // a store, do nothing.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when response arrives
    bool useContinuation;
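
    // Illustrative sketch (not gem5 code): how a store with release
    // semantics might arm the continuation, and how a response handler
    // would fire it. The lambda body is a placeholder.
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *si, GPUDynInstPtr inst) {
    //             // build and send the standalone release request here
    //         };
    //
    //     // later, when the store's response arrives:
    //     if (gpuDynInst->useContinuation) {
    //         gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
    //                                      gpuDynInst);
    //     }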

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return m5::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return m5::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return m5::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return m5::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return m5::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return m5::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return m5::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return m5::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return m5::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return m5::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return m5::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
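
    // Illustrative sketch (not gem5 code): building the functor for the
    // current instruction from two operand values. 'val' and 'cmp' are
    // hypothetical stand-ins for the instruction's atomic operand data.
    //
    //     uint32_t val = 1, cmp = 0;
    //     AtomicOpFunctorPtr amo =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&val, &cmp);
    //     // 'amo' can then be attached to the memory request so the
    //     // memory system applies the operation atomically.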

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy segments to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
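
    // Illustrative sketch (not gem5 code): how an issuing memory pipeline
    // might use setRequestFlags on a request it has already created for
    // this instruction.
    //
    //     // 'req' is a RequestPtr built by the issuing memory pipeline
    //     gpuDynInst->setRequestFlags(req);          // segment, scope, order
    //     gpuDynInst->setRequestFlags(req, false);   // skip acquire/release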

    // Map returned packets and the addresses they satisfy to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__