src/gpu-compute/wavefront.hh

   1 /*
   2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its contributors
  18  * may be used to endorse or promote products derived from this software
  19  * without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Author: Lisa Hsu
  34  */
  35
  36 #ifndef __WAVEFRONT_HH__
  37 #define __WAVEFRONT_HH__
  38
  39 #include <cassert>
  40 #include <deque>
  41 #include <memory>
  42 #include <stack>
  43 #include <vector>
  44
  45 #include "arch/gpu_isa.hh"
  46 #include "base/logging.hh"
  47 #include "base/types.hh"
  48 #include "config/the_gpu_isa.hh"
  49 #include "gpu-compute/condition_register_state.hh"
  50 #include "gpu-compute/lds_state.hh"
  51 #include "gpu-compute/misc.hh"
  52 #include "gpu-compute/ndrange.hh"
  53 #include "params/Wavefront.hh"
  54 #include "sim/sim_object.hh"
  55
   // NOTE(review): presumably the upper bound on the number of instructions
   // buffered per wavefront; it is not referenced anywhere in this header, so
   // confirm the exact meaning against its users.
   static const int MAX_NUM_INSTS_PER_WF = 12;
  57
  58 /**
  59  * A reconvergence stack entry conveys the necessary state to implement
  60  * control flow divergence.
  61  */
  62 struct ReconvergenceStackEntry {
  63     /**
  64      * PC of current instruction.
  65      */
  66     uint32_t pc;
  67     /**
  68      * PC of the immediate post-dominator instruction, i.e., the value of
  69      * @a pc for the first instruction that will be executed by the wavefront
  70      * when a reconvergence point is reached.
  71      */
  72     uint32_t rpc;
  73     /**
  74      * Execution mask.
  75      */
  76     VectorMask execMask;
  77 };
  78
  79 /*
  80  * Arguments for the hsail opcode call, are user defined and variable length.
  81  * The hardware/finalizer can support arguments in hardware or use memory to
  82  * pass arguments. For now, let's assume that an unlimited number of arguments
  83  * are supported in hardware (the compiler inlines functions whenver it can
  84  * anyways, so unless someone is interested in the implications of linking/
  85  * library functions, I think this is a reasonable assumption given the typical
  86  * size of an OpenCL kernel).
  87  *
  88  * Note that call args are different than kernel arguments:
  89  *   * All work-items in a kernel refer the same set of kernel arguments
  90  *   * Each work-item has it's on set of call args. So a call argument at
  91  *     address 0x4 is different for work-item 0 and work-item 1.
  92  *
  93  * Ok, the table below shows an example of how we organize the call arguments in
  94  * the CallArgMem class.
  95  *
  96  * int foo(int arg1, double arg2)
  97  *  ___________________________________________________
  98  * | 0: return.0 | 4: return.1 | ... | 252: return.63  |
  99  * |---------------------------------------------------|
 100  * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63    |
 101  * |---------------------------------------------------|
 102  * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63   |
 103  *  ___________________________________________________
 104  */
 105 class CallArgMem
 106 {
 107   public:
 108     // pointer to buffer for storing function arguments
 109     uint8_t *mem;
 110     int wfSize;
 111     // size of function args
 112     int funcArgsSizePerItem;
 113
 114     template<typename CType>
 115     int
 116     getLaneOffset(int lane, int addr)
 117     {
 118         return addr * wfSize + sizeof(CType) * lane;
 119     }
 120
 121     CallArgMem(int func_args_size_per_item, int wf_size)
 122         : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
 123     {
 124         mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
 125     }
 126
 127     ~CallArgMem()
 128     {
 129         free(mem);
 130     }
 131
 132     template<typename CType>
 133     uint8_t*
 134     getLaneAddr(int lane, int addr)
 135     {
 136         return mem + getLaneOffset<CType>(lane, addr);
 137     }
 138
 139     template<typename CType>
 140     void
 141     setLaneAddr(int lane, int addr, CType val)
 142     {
 143         *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
 144     }
 145 };
 146
  /**
   * SimObject that holds the state of one wavefront as declared below:
   * scheduling ids, barrier counters, the instruction buffer, register
   * bookkeeping, memory request counters, statistics, the call argument
   * memory and the reconvergence stack.
   */
  class Wavefront : public SimObject
  {
    public:
      enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
      enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

      // Base pointer for array of instruction pointers
      uint64_t basePtr;

      uint32_t oldBarrierCnt;
      uint32_t barrierCnt;
      uint32_t barrierId;
      uint32_t barrierSlots;
      status_e status;
      // HW slot id where the WF is mapped to inside a SIMD unit
      int wfSlotId;
      int kernId;
      // SIMD unit where the WF has been scheduled
      int simdId;
      // pointer to parent CU
      ComputeUnit *computeUnit;

      std::deque<GPUDynInstPtr> instructionBuffer;

      bool pendingFetch;
      bool dropFetch;

      // Condition Register State (for HSAIL simulations only)
      class ConditionRegisterState *condRegState;
      // number of single precision VGPRs required by WF
      uint32_t maxSpVgprs;
      // number of double precision VGPRs required by WF
      uint32_t maxDpVgprs;
      // map virtual to physical vector register
      uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
      void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
      bool isGmInstruction(GPUDynInstPtr ii);
      bool isLmInstruction(GPUDynInstPtr ii);
      bool isOldestInstGMem();
      bool isOldestInstLMem();
      bool isOldestInstPrivMem();
      bool isOldestInstFlatMem();
      bool isOldestInstALU();
      bool isOldestInstBarrier();
      // used for passing spill address to DDInstGPU
      std::vector<Addr> lastAddr;
      std::vector<uint32_t> workItemId[3];
      std::vector<uint32_t> workItemFlatId;
      /* kernel launch parameters */
      uint32_t workGroupId[3];
      uint32_t workGroupSz[3];
      uint32_t gridSz[3];
      uint32_t wgId;
      uint32_t wgSz;
      /* the actual WG size can differ from the maximum size */
      uint32_t actualWgSz[3];
      uint32_t actualWgSzTotal;
      void computeActualWgSz(NDRange *ndr);
      // wavefront id within a workgroup
      uint32_t wfId;
      uint32_t maxDynWaveId;
      uint32_t dispatchId;
      // outstanding global+local memory requests
      uint32_t outstandingReqs;
      // memory requests between scoreboard
      // and execute stage not yet executed
      uint32_t memReqsInPipe;
      // outstanding global memory write requests
      uint32_t outstandingReqsWrGm;
      // outstanding local memory write requests
      uint32_t outstandingReqsWrLm;
      // outstanding global memory read requests
      uint32_t outstandingReqsRdGm;
      // outstanding local memory read requests
      uint32_t outstandingReqsRdLm;
      uint32_t rdLmReqsInPipe;
      uint32_t rdGmReqsInPipe;
      uint32_t wrLmReqsInPipe;
      uint32_t wrGmReqsInPipe;

      int memTraceBusy;
      uint64_t lastTrace;
      // number of vector registers reserved by WF
      int reservedVectorRegs;
      // Index into the Vector Register File's namespace where the WF's registers
      // will live while the WF is executed
      uint32_t startVgprIndex;

      // Old value of destination gpr (for trace)
      std::vector<uint32_t> oldVgpr;
      // Id of destination gpr (for trace)
      uint32_t oldVgprId;
      // Tick count of last old_vgpr copy
      uint64_t oldVgprTcnt;

      // Old value of destination gpr (for trace)
      std::vector<uint64_t> oldDgpr;
      // Id of destination gpr (for trace)
      uint32_t oldDgprId;
      // Tick count of last old_vgpr copy
      uint64_t oldDgprTcnt;

      // Execution mask at wavefront start
      VectorMask initMask;

      // number of barriers this WF has joined
      std::vector<int> barCnt;
      int maxBarCnt;
      // Flag to stall a wave on barrier
      bool stalledAtBarrier;

      // a pointer to the fraction of the LDS allocated
      // to this workgroup (thus this wavefront)
      LdsChunk *ldsChunk;

      // A pointer to the spill area
      Addr spillBase;
      // The size of the spill area
      uint32_t spillSizePerItem;
      // The vector width of the spill area
      uint32_t spillWidth;

      // A pointer to the private memory area
      Addr privBase;
      // The size of the private memory area
      uint32_t privSizePerItem;

      // A pointer to the read-only memory area
      Addr roBase;
      // size of the read-only memory area
      uint32_t roSize;

      // pointer to buffer for storing kernel arguments
      uint8_t *kernelArgs;
      // unique WF id over all WFs executed across all CUs
      uint64_t wfDynId;

      // number of times instruction issue for this wavefront is blocked
      // due to VRF port availability
      Stats::Scalar numTimesBlockedDueVrfPortAvail;
      // number of times an instruction of a WF is blocked from being issued
      // due to WAR and WAW dependencies
      Stats::Scalar numTimesBlockedDueWAXDependencies;
      // number of times an instruction of a WF is blocked from being issued
      // due to RAW dependencies
      Stats::Scalar numTimesBlockedDueRAWDependencies;
      // distribution of executed instructions based on their register
      // operands; this is used to highlight the load on the VRF
      Stats::Distribution srcRegOpDist;
      Stats::Distribution dstRegOpDist;

      // Functions to operate on call argument memory
      // argument memory for hsail call instruction
      CallArgMem *callArgMem;

      // Allocates a new CallArgMem with the given sizes and stores it in
      // callArgMem. Whatever callArgMem pointed to before is overwritten
      // without being released here.
      void
      initCallArgMem(int func_args_size_per_item, int wf_size)
      {
          callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
      }

      // Reads the value of type CType stored for the given lane at the given
      // call argument address. Dereferences callArgMem, so initCallArgMem()
      // must have been called first.
      template<typename CType>
      CType
      readCallArgMem(int lane, int addr)
      {
          return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
      }

      // Stores val for the given lane at the given call argument address.
      // Dereferences callArgMem, so initCallArgMem() must have been called
      // first.
      template<typename CType>
      void
      writeCallArgMem(int lane, int addr, CType val)
      {
          callArgMem->setLaneAddr<CType>(lane, addr, val);
      }

      typedef WavefrontParams Params;
      Wavefront(const Params *p);
      ~Wavefront();
      virtual void init();

      // Records the compute unit this wavefront belongs to.
      void
      setParent(ComputeUnit *cu)
      {
          computeUnit = cu;
      }

      void start(uint64_t _wfDynId, uint64_t _base_ptr);
      void exec();
      void updateResources();
      int ready(itype_e type);
      bool instructionBufferHasBranch();
      void regStats();
      // Current execution mask restricted to the lanes set in initMask.
      VectorMask getPred() { return execMask() & initMask; }

      bool waitingAtBarrier(int lane);

      void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& exec_mask);

      void popFromReconvergenceStack();

      uint32_t pc() const;

      uint32_t rpc() const;

      VectorMask execMask() const;

      bool execMask(int lane) const;

      void pc(uint32_t new_pc);

      void discardFetch();

      /**
       * Returns the size of the static hardware context of a particular wavefront
       * This should be updated every time the context is changed
       */
      uint32_t getStaticContextSize() const;

      /**
       * Returns the hardware context as a stream of bytes
       * This method is designed for HSAIL execution
       */
      void getContext(const void *out);

      /**
       * Sets the hardware context from a stream of bytes
       * This method is designed for HSAIL execution
       */
      void setContext(const void *in);

      // Accessor for the ISA state object embedded in this wavefront.
      TheGpuISA::GPUISA&
      gpuISA()
      {
          return _gpuISA;
      }

    private:
      TheGpuISA::GPUISA _gpuISA;
      /**
       * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
       * to be visited by the wavefront, and the associated execution masks. The
       * reconvergence stack grows every time the wavefront reaches a divergence
       * point (branch instruction), and shrinks every time the wavefront
       * reaches a reconvergence point (immediate post-dominator instruction).
       */
      std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
  };
 394
 395 #endif // __WAVEFRONT_HH__