2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
36 #ifndef __WAVEFRONT_HH__
37 #define __WAVEFRONT_HH__
45 #include "arch/gpu_isa.hh"
46 #include "base/misc.hh"
47 #include "base/types.hh"
48 #include "config/the_gpu_isa.hh"
49 #include "gpu-compute/condition_register_state.hh"
50 #include "gpu-compute/lds_state.hh"
51 #include "gpu-compute/misc.hh"
52 #include "gpu-compute/ndrange.hh"
53 #include "params/Wavefront.hh"
54 #include "sim/sim_object.hh"
// Upper bound on instructions buffered for a single wavefront (appears to
// size the per-WF instruction buffer — confirm against instructionBuffer
// usage in the .cc file). constexpr makes this a true compile-time
// constant; as an integral constant it keeps internal linkage in a header.
static constexpr int MAX_NUM_INSTS_PER_WF = 12;
59 * A reconvergence stack entry conveys the necessary state to implement
60 * control flow divergence.
62 struct ReconvergenceStackEntry {
// NOTE(review): the member declarations themselves (the current PC, the
// reconvergence PC documented below, and likely an execution mask) are not
// visible in this chunk — only their doc comments survived extraction.
// Confirm the field list against the full file before editing.
64 * PC of current instruction.
68 * PC of the immediate post-dominator instruction, i.e., the value of
69 * @a pc for the first instruction that will be executed by the wavefront
70 * when a reconvergence point is reached.
80 * Arguments for the hsail opcode call, are user defined and variable length.
81 * The hardware/finalizer can support arguments in hardware or use memory to
82 * pass arguments. For now, let's assume that an unlimited number of arguments
83 * are supported in hardware (the compiler inlines functions whenever it can
84 * anyways, so unless someone is interested in the implications of linking/
85 * library functions, I think this is a reasonable assumption given the typical
86 * size of an OpenCL kernel).
88 * Note that call args are different from kernel arguments:
89 * * All work-items in a kernel refer to the same set of kernel arguments
90 * * Each work-item has its own set of call args. So a call argument at
91 * address 0x4 is different for work-item 0 and work-item 1.
93 * Ok, the table below shows an example of how we organize the call arguments in
94 * the CallArgMem class.
96 * int foo(int arg1, double arg2)
97 * ___________________________________________________
98 * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
99 * |---------------------------------------------------|
100 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
101 * |---------------------------------------------------|
102 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
103 * ___________________________________________________
108 // pointer to buffer for storing function arguments
111 // size of function args
// Per-work-item byte size of the argument area; the total allocation made
// by the constructor is funcArgsSizePerItem * wfSize.
112 int funcArgsSizePerItem;
// Byte offset into the buffer for the CType value of `lane` at call-arg
// address `addr`. Per the return expression, the argument address scales
// by wfSize and the lane scales by sizeof(CType), i.e. each argument slot
// is striped across all lanes (matches the layout table documented above
// this class).
114 template<typename CType>
116 getLaneOffset(int lane, int addr)
118 return addr * wfSize + sizeof(CType) * lane;
// Allocates the backing store for call-argument memory.
// NOTE(review): raw malloc() with no matching free()/destructor visible in
// this chunk — confirm the buffer is released elsewhere, otherwise this
// leaks once per call setup; a std::vector<uint8_t> would manage it.
121 CallArgMem(int func_args_size_per_item, int wf_size)
122 : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
124 mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
// Raw address of (lane, addr) inside the buffer; callers read or write the
// CType value through the returned pointer.
132 template<typename CType>
134 getLaneAddr(int lane, int addr)
136 return mem + getLaneOffset<CType>(lane, addr);
// Stores `val` for `lane` at call-arg address `addr` (inverse of the read
// path that goes through getLaneAddr).
139 template<typename CType>
141 setLaneAddr(int lane, int addr, CType val)
143 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
147 class Wavefront : public SimObject
// Instruction categories used by the ready()/issue logic.
150 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
// Wavefront lifecycle states.
151 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
153 // Base pointer for array of instruction pointers
156 uint32_t oldBarrierCnt;
159 uint32_t barrierSlots;
161 // HW slot id where the WF is mapped to inside a SIMD unit
164 // SIMD unit where the WV has been scheduled
166 // pointer to parent CU
167 ComputeUnit *computeUnit;
// In-order buffer of fetched-but-not-yet-retired instructions.
169 std::deque<GPUDynInstPtr> instructionBuffer;
174 // Condition Register State (for HSAIL simulations only)
175 class ConditionRegisterState *condRegState;
176 // number of single precision VGPRs required by WF
178 // number of double precision VGPRs required by WF
180 // map virtual to physical vector register
181 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
182 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Classification helpers for the oldest instruction in the buffer.
183 bool isGmInstruction(GPUDynInstPtr ii);
184 bool isLmInstruction(GPUDynInstPtr ii);
185 bool isOldestInstGMem();
186 bool isOldestInstLMem();
187 bool isOldestInstPrivMem();
188 bool isOldestInstFlatMem();
189 bool isOldestInstALU();
190 bool isOldestInstBarrier();
191 // used for passing spill address to GPUDynInst
192 std::vector<Addr> lastAddr;
// Per-lane work-item IDs, one vector per dimension (x, y, z).
193 std::vector<uint32_t> workItemId[3];
194 std::vector<uint32_t> workItemFlatId;
195 /* kernel launch parameters */
196 uint32_t workGroupId[3];
197 uint32_t workGroupSz[3];
201 /* the actual WG size can differ from the maximum size */
202 uint32_t actualWgSz[3];
203 uint32_t actualWgSzTotal;
204 void computeActualWgSz(NDRange *ndr);
205 // wavefront id within a workgroup
207 uint32_t maxDynWaveId;
209 // outstanding global+local memory requests
210 uint32_t outstandingReqs;
211 // memory requests between scoreboard
212 // and execute stage not yet executed
213 uint32_t memReqsInPipe;
214 // outstanding global memory write requests
215 uint32_t outstandingReqsWrGm;
216 // outstanding local memory write requests
217 uint32_t outstandingReqsWrLm;
218 // outstanding global memory read requests
219 uint32_t outstandingReqsRdGm;
220 // outstanding local memory read requests
221 uint32_t outstandingReqsRdLm;
// In-pipe request counts broken down by direction and memory space
// (cf. memReqsInPipe above for the aggregate).
222 uint32_t rdLmReqsInPipe;
223 uint32_t rdGmReqsInPipe;
224 uint32_t wrLmReqsInPipe;
225 uint32_t wrGmReqsInPipe;
229 // number of vector registers reserved by WF
230 int reservedVectorRegs;
231 // Index into the Vector Register File's namespace where the WF's registers
232 // will live while the WF is executed
233 uint32_t startVgprIndex;
235 // Old value of destination gpr (for trace)
236 std::vector<uint32_t> oldVgpr;
237 // Id of destination gpr (for trace)
239 // Tick count of last old_vgpr copy
240 uint64_t oldVgprTcnt;
242 // Old value of destination gpr (for trace)
243 std::vector<uint64_t> oldDgpr;
244 // Id of destination gpr (for trace)
246 // Tick count of last old_dgpr copy
247 uint64_t oldDgprTcnt;
249 // Execution mask at wavefront start
252 // number of barriers this WF has joined
253 std::vector<int> barCnt;
255 // Flag to stall a wave on barrier
256 bool stalledAtBarrier;
258 // a pointer to the fraction of the LDS allocated
259 // to this workgroup (thus this wavefront)
262 // A pointer to the spill area
264 // The size of the spill area
265 uint32_t spillSizePerItem;
266 // The vector width of the spill area
269 // A pointer to the private memory area
271 // The size of the private memory area
272 uint32_t privSizePerItem;
274 // A pointer to the read-only memory area
276 // size of the read-only memory area
279 // pointer to buffer for storing kernel arguments
281 // unique WF id over all WFs executed across all CUs
284 // number of times instruction issue for this wavefront is blocked
285 // due to VRF port availability
286 Stats::Scalar numTimesBlockedDueVrfPortAvail;
287 // number of times an instruction of a WF is blocked from being issued
288 // due to WAR and WAW dependencies
289 Stats::Scalar numTimesBlockedDueWAXDependencies;
290 // number of times an instruction of a WF is blocked from being issued
291 // due to RAW dependencies
292 Stats::Scalar numTimesBlockedDueRAWDependencies;
293 // distribution of executed instructions based on their register
294 // operands; this is used to highlight the load on the VRF
295 Stats::Distribution srcRegOpDist;
296 Stats::Distribution dstRegOpDist;
298 // Functions to operate on call argument memory
299 // argument memory for hsail call instruction
300 CallArgMem *callArgMem;
// Allocates the per-wavefront CallArgMem buffer.
// NOTE(review): no matching delete of callArgMem is visible in this chunk —
// confirm ownership/cleanup in the full file.
302 initCallArgMem(int func_args_size_per_item, int wf_size)
304 callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
// Typed read of lane's call argument at `addr` (delegates to CallArgMem).
307 template<typename CType>
309 readCallArgMem(int lane, int addr)
311 return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
// Typed write of lane's call argument at `addr` (delegates to CallArgMem).
314 template<typename CType>
316 writeCallArgMem(int lane, int addr, CType val)
318 callArgMem->setLaneAddr<CType>(lane, addr, val);
321 typedef WavefrontParams Params;
322 Wavefront(const Params *p);
327 setParent(ComputeUnit *cu)
332 void start(uint64_t _wfDynId, uint64_t _base_ptr);
334 void updateResources();
335 int ready(itype_e type);
336 bool instructionBufferHasBranch();
// Predicate = current execution mask restricted to the lanes that were
// active at wavefront start (initMask).
338 VectorMask getPred() { return execMask() & initMask; }
340 bool waitingAtBarrier(int lane);
// Push/pop entries on the reconvergence stack at divergence and
// reconvergence points (see reconvergenceStack below).
342 void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
343 const VectorMask& exec_mask);
345 void popFromReconvergenceStack();
349 uint32_t rpc() const;
351 VectorMask execMask() const;
353 bool execMask(int lane) const;
355 void pc(uint32_t new_pc);
360 * Returns the size of the static hardware context of a particular wavefront
361 * This should be updated every time the context is changed
363 uint32_t getStaticContextSize() const;
366 * Returns the hardware context as a stream of bytes
367 * This method is designed for HSAIL execution
// NOTE(review): `out` is a destination yet is declared const void* —
// presumably cast away inside the implementation; verify and consider
// making it non-const.
369 void getContext(const void *out);
372 * Sets the hardware context from a stream of bytes
373 * This method is designed for HSAIL execution
375 void setContext(const void *in);
384 TheGpuISA::GPUISA _gpuISA;
386 * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
387 * to be visited by the wavefront, and the associated execution masks. The
388 * reconvergence stack grows every time the wavefront reaches a divergence
389 * point (branch instruction), and shrinks every time the wavefront
390 * reaches a reconvergence point (immediate post-dominator instruction).
392 std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
395 #endif // __WAVEFRONT_HH__