gpu-compute: remove brig_object.hh from hsa_object.cc
[gem5.git] / src / gpu-compute / wavefront.hh
1 /*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36 #ifndef __WAVEFRONT_HH__
37 #define __WAVEFRONT_HH__
38
39 #include <cassert>
40 #include <deque>
41 #include <memory>
42 #include <stack>
43 #include <vector>
44
45 #include "base/misc.hh"
46 #include "base/types.hh"
47 #include "gpu-compute/condition_register_state.hh"
48 #include "gpu-compute/lds_state.hh"
49 #include "gpu-compute/misc.hh"
50 #include "params/Wavefront.hh"
51 #include "sim/sim_object.hh"
52
// Upper bound on buffered instructions per wavefront (presumably sizes the
// per-WF instruction buffer -- confirm against the fetch/decode logic).
// constexpr (C++11, already used in this file) keeps it a compile-time
// constant with internal linkage, same as the old static const int.
constexpr int MAX_NUM_INSTS_PER_WF = 12;
54
55 /*
56 * Arguments for the hsail opcode call, are user defined and variable length.
57 * The hardware/finalizer can support arguments in hardware or use memory to
58 * pass arguments. For now, let's assume that an unlimited number of arguments
 * are supported in hardware (the compiler inlines functions whenever it can
60 * anyways, so unless someone is interested in the implications of linking/
61 * library functions, I think this is a reasonable assumption given the typical
62 * size of an OpenCL kernel).
63 *
64 * Note that call args are different than kernel arguments:
65 * * All work-items in a kernel refer the same set of kernel arguments
 *   * Each work-item has its own set of call args. So a call argument at
67 * address 0x4 is different for work-item 0 and work-item 1.
68 *
69 * Ok, the table below shows an example of how we organize the call arguments in
70 * the CallArgMem class.
71 *
72 * int foo(int arg1, double arg2)
73 * ___________________________________________________
74 * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
75 * |---------------------------------------------------|
76 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
77 * |---------------------------------------------------|
78 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
79 * ___________________________________________________
80 */
81 class CallArgMem
82 {
83 public:
84 // pointer to buffer for storing function arguments
85 uint8_t *mem;
86 // size of function args
87 int funcArgsSizePerItem;
88
89 template<typename CType>
90 int
91 getLaneOffset(int lane, int addr)
92 {
93 return addr * VSZ + sizeof(CType) * lane;
94 }
95
96 CallArgMem(int func_args_size_per_item)
97 : funcArgsSizePerItem(func_args_size_per_item)
98 {
99 mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
100 }
101
102 ~CallArgMem()
103 {
104 free(mem);
105 }
106
107 template<typename CType>
108 uint8_t*
109 getLaneAddr(int lane, int addr)
110 {
111 return mem + getLaneOffset<CType>(lane, addr);
112 }
113
114 template<typename CType>
115 void
116 setLaneAddr(int lane, int addr, CType val)
117 {
118 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
119 }
120 };
121
122 /**
123 * A reconvergence stack entry conveys the necessary state to implement
124 * control flow divergence.
125 */
126 class ReconvergenceStackEntry {
127
128 public:
129 ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc,
130 VectorMask new_mask) : pc(new_pc), rpc(new_rpc),
131 execMask(new_mask) {
132 }
133
134 /**
135 * PC of current instruction.
136 */
137 uint32_t pc;
138 /**
139 * PC of the immediate post-dominator instruction, i.e., the value of
140 * @a pc for the first instruction that will be executed by the wavefront
141 * when a reconvergence point is reached.
142 */
143 uint32_t rpc;
144 /**
145 * Execution mask.
146 */
147 VectorMask execMask;
148 };
149
/**
 * A wavefront: a group of work-items executing in lockstep on one SIMD
 * unit of a ComputeUnit. Mostly a bag of public architectural/bookkeeping
 * state manipulated by the pipeline stages; behavior is defined in the
 * corresponding .cc file.
 */
class Wavefront : public SimObject
{
  public:
    // instruction categories used by ready() to pick the resource pool
    // an instruction needs
    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
    // wavefront lifecycle state
    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t base_ptr;

    // barrier bookkeeping; exact protocol lives in the .cc -- presumably
    // old/current counts are compared to detect barrier completion
    uint32_t old_barrier_cnt;
    uint32_t barrier_cnt;
    uint32_t barrier_id;
    uint32_t barrier_slots;
    // current lifecycle state of this wavefront
    status_e status;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // id of the kernel this WF belongs to
    int kern_id;
    // SIMD unit where the WF has been scheduled
    int simdId;
    // pointer to parent CU
    ComputeUnit *computeUnit;

    // fetched instructions waiting to be issued, oldest at the front
    std::deque<GPUDynInstPtr> instructionBuffer;

    // a fetch request is outstanding for this WF
    bool pendingFetch;
    // drop the next fetch response (e.g., after a redirect; see
    // discardFetch)
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    // instruction classification helpers (global/local memory)
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    // predicates on the oldest instruction in the instruction buffer
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to GPUDynInst (per-lane)
    uint64_t last_addr[VSZ];
    // per-lane 3-D work-item ids and flattened id
    uint32_t workitemid[3][VSZ];
    uint32_t workitemFlatId[VSZ];
    // 3-D workgroup id, workgroup size, and grid size of this dispatch
    uint32_t workgroupid[3];
    uint32_t workgroupsz[3];
    uint32_t gridsz[3];
    // flattened workgroup id and size
    uint32_t wg_id;
    uint32_t wg_sz;
    // dynamic wave id within the dispatch, and its upper bound
    uint32_t dynwaveid;
    uint32_t maxdynwaveid;
    // id of the kernel dispatch this WF is part of
    uint32_t dispatchid;
    // outstanding global+local memory requests
    uint32_t outstanding_reqs;
    // memory requests between scoreboard
    // and execute stage not yet executed
    uint32_t mem_reqs_in_pipe;
    // outstanding global memory write requests
    uint32_t outstanding_reqs_wr_gm;
    // outstanding local memory write requests
    uint32_t outstanding_reqs_wr_lm;
    // outstanding global memory read requests
    uint32_t outstanding_reqs_rd_gm;
    // outstanding local memory read requests
    uint32_t outstanding_reqs_rd_lm;
    // per-type counts of memory requests still in the pipeline
    // (read/write x local/global)
    uint32_t rd_lm_reqs_in_pipe;
    uint32_t rd_gm_reqs_in_pipe;
    uint32_t wr_lm_reqs_in_pipe;
    uint32_t wr_gm_reqs_in_pipe;

    // memory-trace state (busy flag and tick of last trace record)
    int mem_trace_busy;
    uint64_t last_trace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    uint32_t old_vgpr[VSZ];
    // Id of destination gpr (for trace)
    uint32_t old_vgpr_id;
    // Tick count of last old_vgpr copy
    uint64_t old_vgpr_tcnt;

    // Old value of destination gpr (for trace), double-precision registers
    uint64_t old_dgpr[VSZ];
    // Id of destination gpr (for trace)
    uint32_t old_dgpr_id;
    // Tick count of last old_dgpr copy
    uint64_t old_dgpr_tcnt;

    // Execution mask at wavefront start
    VectorMask init_mask;

    // number of barriers this WF has joined, per lane
    int bar_cnt[VSZ];
    int max_bar_cnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer of the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for hsail call instruction
    CallArgMem *callArgMem;
    // allocate the call-argument buffer for this WF
    void
    initCallArgMem(int func_args_size_per_item)
    {
        callArgMem = new CallArgMem(func_args_size_per_item);
    }

    // read lane's copy of the call argument at logical address addr
    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    // write lane's copy of the call argument at logical address addr
    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    // link this WF back to the ComputeUnit that owns it
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    // begin execution with the given dynamic id and instruction base pointer
    void start(uint64_t _wfDynId, uint64_t _base_ptr);

    void exec();
    void updateResources();
    // returns whether an instruction of the given type can issue
    // (exact return-value semantics defined in the .cc)
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    // lanes that are both enabled by the current exec mask and were
    // active at wavefront start
    VectorMask get_pred() { return execMask() & init_mask; }

    bool waitingAtBarrier(int lane);

    // push/pop divergence state; see reconvergenceStack below
    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    // accessors for the top-of-stack pc / reconvergence pc / exec mask
    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

  private:
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};
367
368 #endif // __WAVEFRONT_HH__