2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Author: John Kalamatianos, Anthony Gutierrez
36 #ifndef __COMPUTE_UNIT_HH__
37 #define __COMPUTE_UNIT_HH__
41 #include <unordered_map>
44 #include "base/callback.hh"
45 #include "base/statistics.hh"
46 #include "base/types.hh"
47 #include "enums/PrefetchType.hh"
48 #include "gpu-compute/exec_stage.hh"
49 #include "gpu-compute/fetch_stage.hh"
50 #include "gpu-compute/global_memory_pipeline.hh"
51 #include "gpu-compute/local_memory_pipeline.hh"
52 #include "gpu-compute/qstruct.hh"
53 #include "gpu-compute/schedule_stage.hh"
54 #include "gpu-compute/scoreboard_check_stage.hh"
55 #include "mem/mem_object.hh"
56 #include "mem/port.hh"
// Upper bounds used for per-instruction bookkeeping.
// NOTE(review): meanings inferred from the names — confirm against the users
// of these constants in the .cc files.
// constexpr: these are compile-time constants; avoids a per-TU object that
// plain `static const` would create in a header.
static constexpr int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static constexpr int MAX_WIDTH_FOR_MEM_INST = 32;
63 class VectorRegisterFile;
65 struct ComputeUnitParams;
73 // List of execution units
87 TLB_MISS_CACHE_MISS = 0,
93 class ComputeUnit : public MemObject
96 FetchStage fetchStage;
97 ScoreboardCheckStage scoreboardCheckStage;
98 ScheduleStage scheduleStage;
100 GlobalMemPipeline globalMemoryPipe;
101 LocalMemPipeline localMemoryPipe;
103 // Buffers used to communicate between various pipeline stages
105 // List of waves which are ready to be scheduled.
106 // Each execution resource has a ready list. readyList is
107 // used to communicate between scoreboardCheck stage and
109 // TODO: make enum to index readyList
110 std::vector<std::vector<Wavefront*>> readyList;
112 // Stores the status of waves. A READY implies the
113 // wave is ready to be scheduled this cycle and
114 // is already present in the readyList. waveStatusList is
115 // used to communicate between scoreboardCheck stage and
117 // TODO: convert std::pair to a class to increase readability
118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
120 // List of waves which will be dispatched to
121 // each execution resource. A FILLED implies
122 // dispatch list is non-empty and
123 // execution unit has something to execute
124 // this cycle. Currently, the dispatch list of
125 // an execution resource can hold only one wave because
126 // an execution resource can execute only one wave in a cycle.
127 // dispatchList is used to communicate between schedule
129 // TODO: convert std::pair to a class to increase readability
130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's
134 typedef ComputeUnitParams Params;
135 std::vector<std::vector<Wavefront*>> wfList;
138 // array of vector register files, one per SIMD
139 std::vector<VectorRegisterFile*> vrf;
140 // Number of vector ALU units (SIMDs) in CU
142 // number of pipe stages for bypassing data to next dependent single
143 // precision vector instruction inside the vector ALU pipeline
144 int spBypassPipeLength;
145 // number of pipe stages for bypassing data to next dependent double
146 // precision vector instruction inside the vector ALU pipeline
147 int dpBypassPipeLength;
148 // number of cycles per issue period
151 // Number of global and local memory execution resources in CU
154 // tracks the last cycle a vector instruction was executed on a SIMD
155 std::vector<uint64_t> lastExecCycle;
157 // true if we allow a separate TLB per lane
159 // if 0, TLB prefetching is off.
161 // if fixed-stride prefetching, this is the stride.
164 std::vector<Addr> lastVaddrCU;
165 std::vector<std::vector<Addr>> lastVaddrSimd;
166 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
167 Enums::PrefetchType prefetchType;
168 EXEC_POLICY exec_policy;
173 bool localMemBarrier;
176 * for Counting page accesses
178 * cuExitCallback inherits from Callback. When you register a callback
179 * function as an exit callback, it will get added to an exit callback
180 * queue, such that on simulation exit, all callbacks in the callback
181 * queue will have their process() function called.
187 // vector of Vector ALU (MACC) pipelines
188 std::vector<WaitClass> aluPipe;
189 // minimum issue period per SIMD unit (in cycles)
190 std::vector<WaitClass> wfWait;
192 // Resource control for Vector Register File->Global Memory pipe buses
193 std::vector<WaitClass> vrfToGlobalMemPipeBus;
194 // Resource control for Vector Register File->Local Memory pipe buses
195 std::vector<WaitClass> vrfToLocalMemPipeBus;
198 // Resource control for global memory to VRF data/address bus
199 WaitClass glbMemToVrfBus;
200 // Resource control for local memory to VRF data/address bus
201 WaitClass locMemToVrfBus;
203 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
204 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
205 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
206 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
208 Tick req_tick_latency;
209 Tick resp_tick_latency;
211 // number of vector registers being reserved for each SIMD unit
212 std::vector<int> vectorRegsReserved;
213 // number of vector registers per SIMD unit
214 uint32_t numVecRegsPerSimd;
215 // Support for scheduling VGPR status update events
216 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
217 std::vector<uint64_t> timestampVec;
218 std::vector<uint8_t> statusVec;
221 registerEvent(uint32_t simdId,
223 uint32_t operandSize,
226 regIdxVec.push_back(std::make_pair(simdId, regIdx));
227 timestampVec.push_back(when);
228 statusVec.push_back(newStatus);
229 if (operandSize > 4) {
230 regIdxVec.push_back(std::make_pair(simdId,
232 numVecRegsPerSimd)));
233 timestampVec.push_back(when);
234 statusVec.push_back(newStatus);
240 // this hash map will keep track of page divergence
241 // per memory instruction per wavefront. The hash map
242 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
243 std::map<Addr, int> pagesTouched;
245 ComputeUnit(const Params *p);
247 int spBypassLength() { return spBypassPipeLength; };
248 int dpBypassLength() { return dpBypassPipeLength; };
249 int storeBusLength() { return numCyclesPerStoreTransfer; };
250 int loadBusLength() { return numCyclesPerLoadTransfer; };
251 int wfSize() const { return wavefrontSize; };
253 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
255 void initiateFetch(Wavefront *wavefront);
256 void fetch(PacketPtr pkt, Wavefront *wavefront);
257 void fillKernelState(Wavefront *w, NDRange *ndr);
259 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
262 void StartWorkgroup(NDRange *ndr);
263 int ReadyWorkgroup(NDRange *ndr);
265 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
266 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
267 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
268 int GlbMemUnitId() { return GLBMEM_PIPE; }
269 int ShrMemUnitId() { return LDSMEM_PIPE; }
270 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
271 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
272 /* This function cycles through all the wavefronts in all the phases to see
273 * if all of the wavefronts which should be associated with one barrier
274 * (denoted with _barrier_id), are all at the same barrier in the program
275 * (denoted by bcnt). When the number at the barrier matches bslots, then
278 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
279 bool cedeSIMD(int simdId, int wfSlotId);
281 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
283 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
284 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
285 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
286 bool kernelLaunch=true,
287 RequestPtr req=nullptr);
288 void handleMemPacket(PacketPtr pkt, int memport_index);
289 bool processTimingPacket(PacketPtr pkt);
290 void processFetchReturn(PacketPtr pkt);
291 void updatePageDivergenceDist(Addr addr);
    // Master ID used to tag memory-system requests originating from this CU.
    MasterID masterId() { return _masterId; }
296 bool isSimdDone(uint32_t) const;
304 Stats::Scalar vALUInsts;
305 Stats::Formula vALUInstsPerWF;
306 Stats::Scalar sALUInsts;
307 Stats::Formula sALUInstsPerWF;
308 Stats::Scalar instCyclesVALU;
309 Stats::Scalar instCyclesSALU;
310 Stats::Scalar threadCyclesVALU;
311 Stats::Formula vALUUtilization;
312 Stats::Scalar ldsNoFlatInsts;
313 Stats::Formula ldsNoFlatInstsPerWF;
314 Stats::Scalar flatVMemInsts;
315 Stats::Formula flatVMemInstsPerWF;
316 Stats::Scalar flatLDSInsts;
317 Stats::Formula flatLDSInstsPerWF;
318 Stats::Scalar vectorMemWrites;
319 Stats::Formula vectorMemWritesPerWF;
320 Stats::Scalar vectorMemReads;
321 Stats::Formula vectorMemReadsPerWF;
322 Stats::Scalar scalarMemWrites;
323 Stats::Formula scalarMemWritesPerWF;
324 Stats::Scalar scalarMemReads;
325 Stats::Formula scalarMemReadsPerWF;
327 void updateInstStats(GPUDynInstPtr gpuDynInst);
329 // the following stats compute the avg. TLB accesslatency per
330 // uncoalesced request (only for data)
331 Stats::Scalar tlbRequests;
332 Stats::Scalar tlbCycles;
333 Stats::Formula tlbLatency;
334 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
335 Stats::Vector hitsPerTLBLevel;
337 Stats::Scalar ldsBankAccesses;
338 Stats::Distribution ldsBankConflictDist;
340 // over all memory instructions executed over all wavefronts
341 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
342 Stats::Distribution pageDivergenceDist;
343 Stats::Scalar dynamicGMemInstrCnt;
344 Stats::Scalar dynamicLMemInstrCnt;
346 Stats::Scalar wgBlockedDueLdsAllocation;
347 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
348 // when the instruction is committed, this number is still incremented by 1
349 Stats::Scalar numInstrExecuted;
350 // Number of cycles among successive instruction executions across all
351 // wavefronts of the same CU
352 Stats::Distribution execRateDist;
353 // number of individual vector operations executed
354 Stats::Scalar numVecOpsExecuted;
355 // Total cycles that something is running on the GPU
356 Stats::Scalar totalCycles;
357 Stats::Formula vpc; // vector ops per cycle
358 Stats::Formula ipc; // vector instructions per cycle
359 Stats::Distribution controlFlowDivergenceDist;
360 Stats::Distribution activeLanesPerGMemInstrDist;
361 Stats::Distribution activeLanesPerLMemInstrDist;
362 // number of vector ALU instructions received
363 Stats::Formula numALUInstsExecuted;
364 // number of times a WG can not start due to lack of free VGPRs in SIMDs
365 Stats::Scalar numTimesWgBlockedDueVgprAlloc;
366 Stats::Scalar numCASOps;
367 Stats::Scalar numFailedCASOps;
368 Stats::Scalar completedWfs;
369 // flag per vector SIMD unit that is set when there is at least one
370 // WV that has a vector ALU instruction as the oldest in its
371 // Instruction Buffer: Defined in the Scoreboard stage, consumed
372 // by the Execute stage.
373 std::vector<bool> vectorAluInstAvail;
374 // number of available (oldest) LDS instructions that could have
375 // been issued to the LDS at a specific issue slot
377 // number of available Global memory instructions that could have
378 // been issued to TCP at a specific issue slot
391 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
    // Cache line size in bytes, as configured for this CU.
    int cacheLineSize() const { return _cacheLineSize; }
396 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
398 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
399 pageDataStruct pageAccesses;
401 class CUExitCallback : public Callback
404 ComputeUnit *computeUnit;
        // Virtual so deletion through a Callback* base pointer is safe.
        virtual ~CUExitCallback() { }
409 CUExitCallback(ComputeUnit *_cu)
418 CUExitCallback *cuExitCallback;
420 /** Data access Port **/
421 class DataPort : public MasterPort
424 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
425 : MasterPort(_name, _cu), computeUnit(_cu),
430 struct SenderState : public Packet::SenderState
432 GPUDynInstPtr _gpuDynInst;
434 Packet::SenderState *saved;
            // Attach the issuing dynamic instruction, the lane/port index it
            // was sent from, and any previously attached sender state so they
            // travel with the packet through the memory system.
            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
443 void processMemReqEvent(PacketPtr pkt);
444 EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
446 void processMemRespEvent(PacketPtr pkt);
447 EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
449 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
452 ComputeUnit *computeUnit;
        // MasterPort hooks invoked by the connected slave port.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
462 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
470 // Instruction cache access port
471 class SQCPort : public MasterPort
474 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
475 : MasterPort(_name, _cu), computeUnit(_cu),
480 struct SenderState : public Packet::SenderState
482 Wavefront *wavefront;
483 Packet::SenderState *saved;
            // Attach the requesting wavefront (and any prior sender state)
            // to an instruction-fetch packet.
            SenderState(Wavefront *_wavefront, Packet::SenderState
                    *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
490 std::deque<std::pair<PacketPtr, Wavefront*>> retries;
493 ComputeUnit *computeUnit;
        // MasterPort hooks for the instruction-cache (SQC) connection.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
503 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
510 /** Data TLB port **/
511 class DTLBPort : public MasterPort
514 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
515 : MasterPort(_name, _cu), computeUnit(_cu),
516 index(_index), stalled(false)
519 bool isStalled() { return stalled; }
520 void stallPort() { stalled = true; }
521 void unstallPort() { stalled = false; }
524 * here we queue all the translation requests that were
525 * not successfully sent.
527 std::deque<PacketPtr> retries;
529 /** SenderState is information carried along with the packet
530 * throughout the TLB hierarchy
532 struct SenderState: public Packet::SenderState
534 // the memInst that this is associated with
535 GPUDynInstPtr _gpuDynInst;
537 // the lane in the memInst this is associated with, so we send
538 // the memory request down the right port
541 // constructor used for packets involved in timing accesses
            // Bind the memory instruction and the originating lane's port
            // index so the response can be routed back down the right port.
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
548 ComputeUnit *computeUnit;
        // MasterPort hooks for the data-TLB connection.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
559 class ITLBPort : public MasterPort
        // Construct the instruction-TLB port; the port starts unstalled.
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
566 bool isStalled() { return stalled; }
567 void stallPort() { stalled = true; }
568 void unstallPort() { stalled = false; }
571 * here we queue all the translation requests that were
572 * not successfully sent.
574 std::deque<PacketPtr> retries;
576 /** SenderState is information carried along with the packet
577 * throughout the TLB hierarchy
579 struct SenderState: public Packet::SenderState
581 // The wavefront associated with this request
582 Wavefront *wavefront;
            // Record the wavefront associated with this translation request.
            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
588 ComputeUnit *computeUnit;
        // MasterPort hooks for the instruction-TLB connection.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
599 * the port intended to communicate between the CU and its LDS
601 class LDSPort : public MasterPort
604 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
605 : MasterPort(_name, _cu, _id), computeUnit(_cu)
        // Flow control for the CU->LDS connection: while stalled, packets are
        // queued in 'retries' rather than sent.
        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }
614 * here we queue all the requests that were
615 * not successfully sent.
617 std::queue<PacketPtr> retries;
620 * SenderState is information carried along with the packet, esp. the
623 class SenderState: public Packet::SenderState
626 // The actual read/write/atomic request that goes with this command
627 GPUDynInstPtr _gpuDynInst = nullptr;
630 SenderState(GPUDynInstPtr gpuDynInst):
631 _gpuDynInst(gpuDynInst)
643 sendTimingReq(PacketPtr pkt);
647 bool stalled = false; ///< whether or not it is stalled
649 ComputeUnit *computeUnit;
652 recvTimingResp(PacketPtr pkt);
655 recvAtomic(PacketPtr pkt) { return 0; }
658 recvFunctional(PacketPtr pkt)
671 /** The port to access the Local Data Store
672 * Can be connected to a LDS object
674 LDSPort *ldsPort = nullptr;
682 /** The memory port for SIMD data accesses.
683 * Can be connected to PhysMem for Ruby for timing simulations
685 std::vector<DataPort*> memPort;
686 // port to the TLB hierarchy (i.e., the L1 TLB)
687 std::vector<DTLBPort*> tlbPort;
688 // port to the SQC (i.e. the I-cache)
690 // port to the SQC TLB (there's a separate TLB for each I-cache)
691 ITLBPort *sqcTLBPort;
693 virtual BaseMasterPort&
694 getMasterPort(const std::string &if_name, PortID idx)
696 if (if_name == "memory_port") {
697 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
699 return *memPort[idx];
700 } else if (if_name == "translation_port") {
701 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
703 return *tlbPort[idx];
704 } else if (if_name == "sqc_port") {
705 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
708 } else if (if_name == "sqc_tlb_port") {
709 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
711 } else if (if_name == "ldsPort") {
713 fatal("an LDS port was already allocated");
715 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
718 panic("incorrect port name");
        // Identify a wavefront by its SIMD unit id and wavefront slot id.
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }
737 std::list<waveIdentifier> waveIDQueue;
739 std::map<unsigned, waveQueue> xactCasLoadMap;
    // Post-increment: return the current global sequence number, then bump it.
    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
744 const int _cacheLineSize;
745 uint64_t globalSeqNum;
747 GPUStaticInst *kernelLaunchInst;
750 #endif // __COMPUTE_UNIT_HH__