2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
34 #ifndef __COMPUTE_UNIT_HH__
35 #define __COMPUTE_UNIT_HH__
39 #include <unordered_set>
42 #include "base/callback.hh"
43 #include "base/statistics.hh"
44 #include "base/types.hh"
45 #include "config/the_gpu_isa.hh"
46 #include "enums/PrefetchType.hh"
47 #include "gpu-compute/comm.hh"
48 #include "gpu-compute/exec_stage.hh"
49 #include "gpu-compute/fetch_stage.hh"
50 #include "gpu-compute/global_memory_pipeline.hh"
51 #include "gpu-compute/hsa_queue_entry.hh"
52 #include "gpu-compute/local_memory_pipeline.hh"
53 #include "gpu-compute/register_manager.hh"
54 #include "gpu-compute/scalar_memory_pipeline.hh"
55 #include "gpu-compute/schedule_stage.hh"
56 #include "gpu-compute/scoreboard_check_stage.hh"
57 #include "mem/port.hh"
58 #include "mem/token_port.hh"
59 #include "sim/clocked_object.hh"
63 class ScalarRegisterFile;
65 class VectorRegisterFile;
67 struct ComputeUnitParams;
77 TLB_MISS_CACHE_MISS = 0,
84 * WF barrier slots. This represents the barrier resource for
85 * WF-level barriers (i.e., barriers to sync WFs within a WG).
90 WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
94 static const int InvalidID = -1;
103 * Number of WFs that have not yet reached the barrier.
106 numYetToReachBarrier() const
108 return _maxBarrierCnt - _numAtBarrier;
112 maxBarrierCnt() const
114 return _maxBarrierCnt;
118 * Set the maximum barrier count (i.e., the number of WFs that are
119 * participating in the barrier).
122 setMaxBarrierCnt(int max_barrier_cnt)
124 _maxBarrierCnt = max_barrier_cnt;
128 * Mark that a WF has reached the barrier.
133 assert(_numAtBarrier < _maxBarrierCnt);
138 * Have all WFs participating in this barrier reached the barrier?
139 * If so, then the barrier is satisfied and WFs may proceed past
145 return _numAtBarrier == _maxBarrierCnt;
149 * Decrement the number of WFs that are participating in this barrier.
150 * This should be called when a WF exits.
155 assert(_maxBarrierCnt > 0);
160 * Release this barrier resource so it can be used by other WGs. This
161 * is generally called when a WG has finished.
171 * Reset the barrier, which is usually done once a dynamic
172 * instance of a barrier has been satisfied.
182 * The number of WFs in the WG that have reached the barrier. Once
183 * the number of WFs that reach a barrier matches the number of WFs
184 * in the WG, the barrier is satisfied.
189 * The maximum number of WFs that can reach this barrier. This is
190 * essentially the number of WFs in the WG, and a barrier is satisfied
191 * when the number of WFs that reach the barrier equal this value. If
192 * a WF exits early it must decrement this value so that it is no
193 * longer considered for this barrier.
198 class ComputeUnit : public ClockedObject
203 // Execution resources
205 // The ordering of units is:
212 // Note: the ordering of units is important and the code assumes the
213 // above ordering. However, there may be more than one resource of
214 // each type (e.g., 4 VALUs or 2 SALUs)
216 int numVectorGlobalMemUnits;
217 // Resource control for global memory to VRF data/address bus
218 WaitClass glbMemToVrfBus;
219 // Resource control for Vector Register File->Global Memory pipe buses
220 WaitClass vrfToGlobalMemPipeBus;
221 // Resource control for Vector Global Memory execution unit
222 WaitClass vectorGlobalMemUnit;
224 int numVectorSharedMemUnits;
225 // Resource control for local memory to VRF data/address bus
226 WaitClass locMemToVrfBus;
227 // Resource control for Vector Register File->Local Memory pipe buses
228 WaitClass vrfToLocalMemPipeBus;
229 // Resource control for Vector Shared/Local Memory execution unit
230 WaitClass vectorSharedMemUnit;
232 int numScalarMemUnits;
233 // Resource control for scalar memory to SRF data/address bus
234 WaitClass scalarMemToSrfBus;
235 // Resource control for Scalar Register File->Scalar Memory pipe buses
236 WaitClass srfToScalarMemPipeBus;
237 // Resource control for Scalar Memory execution unit
238 WaitClass scalarMemUnit;
240 // vector ALU execution resources
242 std::vector<WaitClass> vectorALUs;
244 // scalar ALU execution resources
246 std::vector<WaitClass> scalarALUs;
248 // Return total number of execution units on this CU
249 int numExeUnits() const;
250 // index into readyList of the first memory unit
251 int firstMemUnit() const;
252 // index into readyList of the last memory unit
253 int lastMemUnit() const;
254 // index into scalarALUs vector of SALU used by the wavefront
255 int mapWaveToScalarAlu(Wavefront *w) const;
256 // index into readyList of SALU used by wavefront
257 int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
258 // index into readyList of Global Memory unit used by wavefront
259 int mapWaveToGlobalMem(Wavefront *w) const;
260 // index into readyList of Local Memory unit used by wavefront
261 int mapWaveToLocalMem(Wavefront *w) const;
262 // index into readyList of Scalar Memory unit used by wavefront
263 int mapWaveToScalarMem(Wavefront *w) const;
265 int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
266 int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
267 int numCyclesPerStoreTransfer; // number of cycles per vector store
268 int numCyclesPerLoadTransfer; // number of cycles per vector load
270 // track presence of dynamic instructions in the Schedule pipeline
271 // stage. This is used to check the readiness of the oldest,
272 // non-dispatched instruction of every WF in the Scoreboard stage.
273 std::unordered_set<uint64_t> pipeMap;
275 RegisterManager* registerManager;
277 FetchStage fetchStage;
278 ScoreboardCheckStage scoreboardCheckStage;
279 ScheduleStage scheduleStage;
281 GlobalMemPipeline globalMemoryPipe;
282 LocalMemPipeline localMemoryPipe;
283 ScalarMemPipeline scalarMemoryPipe;
285 EventFunctionWrapper tickEvent;
287 typedef ComputeUnitParams Params;
288 std::vector<std::vector<Wavefront*>> wfList;
291 // array of vector register files, one per SIMD
292 std::vector<VectorRegisterFile*> vrf;
293 // array of scalar register files, one per SIMD
294 std::vector<ScalarRegisterFile*> srf;
296 // Width per VALU/SIMD unit: number of work items that can be executed
297 // on the vector ALU simultaneously in a SIMD unit
299 // number of pipe stages for bypassing data to next dependent single
300 // precision vector instruction inside the vector ALU pipeline
301 int spBypassPipeLength;
302 // number of pipe stages for bypassing data to next dependent double
303 // precision vector instruction inside the vector ALU pipeline
304 int dpBypassPipeLength;
305 // number of pipe stages for scalar ALU
306 int scalarPipeStages;
307 // number of pipe stages for operand collection & distribution network
308 int operandNetworkLength;
309 // number of cycles per instruction issue period
312 // VRF to GM Bus latency
313 Cycles vrf_gm_bus_latency;
314 // SRF to Scalar Mem Bus latency
315 Cycles srf_scm_bus_latency;
316 // VRF to LM Bus latency
317 Cycles vrf_lm_bus_latency;
319 // tracks the last cycle a vector instruction was executed on a SIMD
320 std::vector<uint64_t> lastExecCycle;
322 // Track the amount of interleaving between wavefronts on each SIMD.
323 // This stat is sampled using instExecPerSimd to compute the number of
324 // instructions that have been executed on a SIMD between a WF executing
325 // two successive instructions.
326 Stats::VectorDistribution instInterleave;
328 // tracks the number of dyn inst executed per SIMD
329 std::vector<uint64_t> instExecPerSimd;
331 // true if we allow a separate TLB per lane
333 // if 0, TLB prefetching is off.
335 // if fixed-stride prefetching, this is the stride.
338 std::vector<Addr> lastVaddrCU;
339 std::vector<std::vector<Addr>> lastVaddrSimd;
340 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
341 Enums::PrefetchType prefetchType;
342 EXEC_POLICY exec_policy;
345 // Idle CU timeout in ticks
349 bool localMemBarrier;
352 * for Counting page accesses
358 Tick req_tick_latency;
359 Tick resp_tick_latency;
362 * Number of WFs to schedule to each SIMD. This vector is populated
363 * by hasDispResources(), and consumed by the subsequent call to
364 * dispWorkgroup(), to schedule the specified number of WFs to the
365 * SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
367 std::vector<int> numWfsToSched;
369 // number of currently reserved vector registers per SIMD unit
370 std::vector<int> vectorRegsReserved;
371 // number of currently reserved scalar registers per SIMD unit
372 std::vector<int> scalarRegsReserved;
373 // number of vector registers per SIMD unit
374 int numVecRegsPerSimd;
375 // number of available scalar registers per SIMD unit
376 int numScalarRegsPerSimd;
378 // this hash map will keep track of page divergence
379 // per memory instruction per wavefront. The hash map
380 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
381 std::map<Addr, int> pagesTouched;
383 void insertInPipeMap(Wavefront *w);
384 void deleteFromPipeMap(Wavefront *w);
386 ComputeUnit(const Params *p);
// Number of pipe stages in the operand collection & distribution network.
390 int oprNetPipeLength() const { return operandNetworkLength; }
// Number of work items executed simultaneously on one vector ALU (SIMD).
391 int simdUnitWidth() const { return simdWidth; }
// Pipe depth for bypassing to the next dependent single-precision
// vector instruction inside the vector ALU pipeline.
392 int spBypassLength() const { return spBypassPipeLength; }
// Pipe depth for bypassing to the next dependent double-precision
// vector instruction inside the vector ALU pipeline.
393 int dpBypassLength() const { return dpBypassPipeLength; }
// Number of pipe stages in the scalar ALU.
394 int scalarPipeLength() const { return scalarPipeStages; }
// Cycles per vector store transfer on the VRF->coalescer bus.
395 int storeBusLength() const { return numCyclesPerStoreTransfer; }
// Cycles per vector load transfer on the coalescer->VRF bus.
396 int loadBusLength() const { return numCyclesPerLoadTransfer; }
// Wavefront size (work items per wavefront).
397 int wfSize() const { return wavefrontSize; }
400 void initiateFetch(Wavefront *wavefront);
401 void fetch(PacketPtr pkt, Wavefront *wavefront);
402 void fillKernelState(Wavefront *w, HSAQueueEntry *task);
404 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
405 HSAQueueEntry *task, int bar_id,
406 bool fetchContext=false);
408 void doInvalidate(RequestPtr req, int kernId);
409 void doFlush(GPUDynInstPtr gpuDynInst);
411 void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
412 bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
// Cache line size, in bytes, configured for this CU.
414 int cacheLineSize() const { return _cacheLineSize; }
// Cache line bits (presumably log2 of the line size, i.e. the
// block-offset bit count) — TODO confirm where cacheLineBits is set.
415 int getCacheLineBits() const { return cacheLineBits; }
419 barrierSlot(int bar_id)
421 assert(bar_id > WFBarrier::InvalidID);
422 return wfBarrierSlots.at(bar_id);
428 assert(freeBarrierIds.size());
429 auto free_bar_id = freeBarrierIds.begin();
430 int bar_id = *free_bar_id;
431 freeBarrierIds.erase(free_bar_id);
// Number of WFs that have not yet reached barrier bar_id.
436 int numYetToReachBarrier(int bar_id);
// Have all WFs participating in barrier bar_id reached it?
437 bool allAtBarrier(int bar_id);
// Record that one more WF has arrived at barrier bar_id.
438 void incNumAtBarrier(int bar_id);
// Number of WFs currently waiting at barrier bar_id.
439 int numAtBarrier(int bar_id);
// Number of WFs participating in barrier bar_id (the WG's WF count).
440 int maxBarrierCnt(int bar_id);
// Reset barrier bar_id, usually after a dynamic instance is satisfied.
441 void resetBarrier(int bar_id);
// Remove one WF from barrier bar_id's participant count (called on
// early WF exit so the barrier can still be satisfied).
442 void decMaxBarrierCnt(int bar_id);
// Release barrier slot bar_id so another WG may use it.
443 void releaseBarrier(int bar_id);
// Release the WFs waiting at barrier bar_id (presumably once the
// barrier is satisfied) — confirm against the .cc implementation.
444 void releaseWFsFromBarrier(int bar_id);
// Total number of WF barrier slots available in this CU.
445 int numBarrierSlots() const { return _numBarrierSlots; }
447 template<typename c0, typename c1>
448 void doSmReturn(GPUDynInstPtr gpuDynInst);
450 virtual void init() override;
451 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
452 void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
453 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
455 RequestPtr req=nullptr);
456 void handleMemPacket(PacketPtr pkt, int memport_index);
457 bool processTimingPacket(PacketPtr pkt);
458 void processFetchReturn(PacketPtr pkt);
459 void updatePageDivergenceDist(Addr addr);
// MasterID used to tag memory requests originating from this CU.
461 MasterID masterId() { return _masterId; }
// Whether the vector ALU of the given SIMD has no work in flight
// (name-based reading; confirm against the .cc implementation).
464 bool isVectorAluIdle(uint32_t simdId) const;
472 Stats::Scalar vALUInsts;
473 Stats::Formula vALUInstsPerWF;
474 Stats::Scalar sALUInsts;
475 Stats::Formula sALUInstsPerWF;
476 Stats::Scalar instCyclesVALU;
477 Stats::Scalar instCyclesSALU;
478 Stats::Scalar threadCyclesVALU;
479 Stats::Formula vALUUtilization;
480 Stats::Scalar ldsNoFlatInsts;
481 Stats::Formula ldsNoFlatInstsPerWF;
482 Stats::Scalar flatVMemInsts;
483 Stats::Formula flatVMemInstsPerWF;
484 Stats::Scalar flatLDSInsts;
485 Stats::Formula flatLDSInstsPerWF;
486 Stats::Scalar vectorMemWrites;
487 Stats::Formula vectorMemWritesPerWF;
488 Stats::Scalar vectorMemReads;
489 Stats::Formula vectorMemReadsPerWF;
490 Stats::Scalar scalarMemWrites;
491 Stats::Formula scalarMemWritesPerWF;
492 Stats::Scalar scalarMemReads;
493 Stats::Formula scalarMemReadsPerWF;
495 Stats::Formula vectorMemReadsPerKiloInst;
496 Stats::Formula vectorMemWritesPerKiloInst;
497 Stats::Formula vectorMemInstsPerKiloInst;
498 Stats::Formula scalarMemReadsPerKiloInst;
499 Stats::Formula scalarMemWritesPerKiloInst;
500 Stats::Formula scalarMemInstsPerKiloInst;
502 // Cycles required to send register source (addr and data) from
503 // register files to memory pipeline, per SIMD.
504 Stats::Vector instCyclesVMemPerSimd;
505 Stats::Vector instCyclesScMemPerSimd;
506 Stats::Vector instCyclesLdsPerSimd;
508 Stats::Scalar globalReads;
509 Stats::Scalar globalWrites;
510 Stats::Formula globalMemInsts;
511 Stats::Scalar argReads;
512 Stats::Scalar argWrites;
513 Stats::Formula argMemInsts;
514 Stats::Scalar spillReads;
515 Stats::Scalar spillWrites;
516 Stats::Formula spillMemInsts;
517 Stats::Scalar groupReads;
518 Stats::Scalar groupWrites;
519 Stats::Formula groupMemInsts;
520 Stats::Scalar privReads;
521 Stats::Scalar privWrites;
522 Stats::Formula privMemInsts;
523 Stats::Scalar readonlyReads;
524 Stats::Scalar readonlyWrites;
525 Stats::Formula readonlyMemInsts;
526 Stats::Scalar kernargReads;
527 Stats::Scalar kernargWrites;
528 Stats::Formula kernargMemInsts;
531 Stats::Distribution waveLevelParallelism;
533 void updateInstStats(GPUDynInstPtr gpuDynInst);
535 // the following stats compute the avg. TLB access latency per
536 // uncoalesced request (only for data)
537 Stats::Scalar tlbRequests;
538 Stats::Scalar tlbCycles;
539 Stats::Formula tlbLatency;
540 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
541 Stats::Vector hitsPerTLBLevel;
543 Stats::Scalar ldsBankAccesses;
544 Stats::Distribution ldsBankConflictDist;
546 // over all memory instructions executed over all wavefronts
547 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
548 Stats::Distribution pageDivergenceDist;
549 // count of non-flat global memory vector instructions executed
550 Stats::Scalar dynamicGMemInstrCnt;
551 // count of flat global memory vector instructions executed
552 Stats::Scalar dynamicFlatMemInstrCnt;
553 Stats::Scalar dynamicLMemInstrCnt;
555 Stats::Scalar wgBlockedDueBarrierAllocation;
556 Stats::Scalar wgBlockedDueLdsAllocation;
557 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
558 // active when the instruction is committed, this number is still
560 Stats::Scalar numInstrExecuted;
561 // Number of cycles among successive instruction executions across all
562 // wavefronts of the same CU
563 Stats::Distribution execRateDist;
564 // number of individual vector operations executed
565 Stats::Scalar numVecOpsExecuted;
566 // number of individual f16 vector operations executed
567 Stats::Scalar numVecOpsExecutedF16;
568 // number of individual f32 vector operations executed
569 Stats::Scalar numVecOpsExecutedF32;
570 // number of individual f64 vector operations executed
571 Stats::Scalar numVecOpsExecutedF64;
572 // number of individual FMA 16,32,64 vector operations executed
573 Stats::Scalar numVecOpsExecutedFMA16;
574 Stats::Scalar numVecOpsExecutedFMA32;
575 Stats::Scalar numVecOpsExecutedFMA64;
576 // number of individual MAC 16,32,64 vector operations executed
577 Stats::Scalar numVecOpsExecutedMAC16;
578 Stats::Scalar numVecOpsExecutedMAC32;
579 Stats::Scalar numVecOpsExecutedMAC64;
580 // number of individual MAD 16,32,64 vector operations executed
581 Stats::Scalar numVecOpsExecutedMAD16;
582 Stats::Scalar numVecOpsExecutedMAD32;
583 Stats::Scalar numVecOpsExecutedMAD64;
584 // total number of two op FP vector operations executed
585 Stats::Scalar numVecOpsExecutedTwoOpFP;
586 // Total cycles that something is running on the GPU
587 Stats::Scalar totalCycles;
588 Stats::Formula vpc; // vector ops per cycle
589 Stats::Formula vpc_f16; // vector ops per cycle
590 Stats::Formula vpc_f32; // vector ops per cycle
591 Stats::Formula vpc_f64; // vector ops per cycle
592 Stats::Formula ipc; // vector instructions per cycle
593 Stats::Distribution controlFlowDivergenceDist;
594 Stats::Distribution activeLanesPerGMemInstrDist;
595 Stats::Distribution activeLanesPerLMemInstrDist;
596 // number of vector ALU instructions received
597 Stats::Formula numALUInstsExecuted;
598 // number of times a WG can not start due to lack of free VGPRs in SIMDs
599 Stats::Scalar numTimesWgBlockedDueVgprAlloc;
600 // number of times a WG can not start due to lack of free SGPRs in SIMDs
601 Stats::Scalar numTimesWgBlockedDueSgprAlloc;
602 Stats::Scalar numCASOps;
603 Stats::Scalar numFailedCASOps;
604 Stats::Scalar completedWfs;
605 Stats::Scalar completedWGs;
607 // distribution of the latency difference between first and last cache block
609 Stats::Distribution headTailLatency;
621 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
624 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
626 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
627 pageDataStruct pageAccesses;
631 class GMTokenPort : public TokenMasterPort
634 GMTokenPort(const std::string& name, SimObject *owner,
635 PortID id = InvalidPortID)
636 : TokenMasterPort(name, owner, id)
641 bool recvTimingResp(PacketPtr) { return false; }
642 void recvReqRetry() { }
645 // Manager for the number of tokens available to this compute unit to
646 // send global memory request packets to the coalescer this is only used
647 // between global memory pipe and TCP coalescer.
648 TokenManager *memPortTokens;
649 GMTokenPort gmTokenPort;
651 /** Data access Port **/
652 class DataPort : public RequestPort
655 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
656 : RequestPort(_name, _cu), computeUnit(_cu),
661 struct SenderState : public Packet::SenderState
663 GPUDynInstPtr _gpuDynInst;
665 Packet::SenderState *saved;
667 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
668 Packet::SenderState *sender_state=nullptr)
669 : _gpuDynInst(gpuDynInst),
670 port_index(_port_index),
671 saved(sender_state) { }
674 void processMemReqEvent(PacketPtr pkt);
675 EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
677 void processMemRespEvent(PacketPtr pkt);
678 EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
680 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
683 ComputeUnit *computeUnit;
686 virtual bool recvTimingResp(PacketPtr pkt);
687 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
688 virtual void recvFunctional(PacketPtr pkt) { }
689 virtual void recvRangeChange() { }
690 virtual void recvReqRetry();
693 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
701 // Scalar data cache access port
702 class ScalarDataPort : public RequestPort
705 ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
707 : RequestPort(_name, _cu, _index), computeUnit(_cu), index(_index)
712 bool recvTimingResp(PacketPtr pkt) override;
713 void recvReqRetry() override;
715 struct SenderState : public Packet::SenderState
717 SenderState(GPUDynInstPtr gpuDynInst,
718 Packet::SenderState *sender_state=nullptr)
719 : _gpuDynInst(gpuDynInst), saved(sender_state)
723 GPUDynInstPtr _gpuDynInst;
724 Packet::SenderState *saved;
727 class MemReqEvent : public Event
730 ScalarDataPort *scalarDataPort;
734 MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
735 : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
737 setFlags(Event::AutoDelete);
741 const char *description() const;
744 std::deque<PacketPtr> retries;
747 ComputeUnit *computeUnit;
751 // Instruction cache access port
752 class SQCPort : public RequestPort
755 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
756 : RequestPort(_name, _cu), computeUnit(_cu),
761 struct SenderState : public Packet::SenderState
763 Wavefront *wavefront;
764 Packet::SenderState *saved;
765 // kernel id to be used in handling I-Cache invalidate response
768 SenderState(Wavefront *_wavefront, Packet::SenderState
769 *sender_state=nullptr, int _kernId=-1)
770 : wavefront(_wavefront), saved(sender_state),
774 std::deque<std::pair<PacketPtr, Wavefront*>> retries;
777 ComputeUnit *computeUnit;
780 virtual bool recvTimingResp(PacketPtr pkt);
781 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
782 virtual void recvFunctional(PacketPtr pkt) { }
783 virtual void recvRangeChange() { }
784 virtual void recvReqRetry();
787 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
794 /** Data TLB port **/
795 class DTLBPort : public RequestPort
798 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
799 : RequestPort(_name, _cu), computeUnit(_cu),
800 index(_index), stalled(false)
803 bool isStalled() { return stalled; }
804 void stallPort() { stalled = true; }
805 void unstallPort() { stalled = false; }
808 * here we queue all the translation requests that were
809 * not successfully sent.
811 std::deque<PacketPtr> retries;
813 /** SenderState is information carried along with the packet
814 * throughout the TLB hierarchy
816 struct SenderState: public Packet::SenderState
818 // the memInst that this is associated with
819 GPUDynInstPtr _gpuDynInst;
821 // the lane in the memInst this is associated with, so we send
822 // the memory request down the right port
825 // constructor used for packets involved in timing accesses
826 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
827 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
832 ComputeUnit *computeUnit;
836 virtual bool recvTimingResp(PacketPtr pkt);
837 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
838 virtual void recvFunctional(PacketPtr pkt) { }
839 virtual void recvRangeChange() { }
840 virtual void recvReqRetry();
843 class ScalarDTLBPort : public RequestPort
846 ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
847 : RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
851 struct SenderState : public Packet::SenderState
853 SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
854 GPUDynInstPtr _gpuDynInst;
857 bool recvTimingResp(PacketPtr pkt) override;
858 void recvReqRetry() override { assert(false); }
860 bool isStalled() const { return stalled; }
861 void stallPort() { stalled = true; }
862 void unstallPort() { stalled = false; }
864 std::deque<PacketPtr> retries;
867 ComputeUnit *computeUnit;
871 class ITLBPort : public RequestPort
874 ITLBPort(const std::string &_name, ComputeUnit *_cu)
875 : RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }
878 bool isStalled() { return stalled; }
879 void stallPort() { stalled = true; }
880 void unstallPort() { stalled = false; }
883 * here we queue all the translation requests that were
884 * not successfully sent.
886 std::deque<PacketPtr> retries;
888 /** SenderState is information carried along with the packet
889 * throughout the TLB hierarchy
891 struct SenderState: public Packet::SenderState
893 // The wavefront associated with this request
894 Wavefront *wavefront;
896 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
900 ComputeUnit *computeUnit;
903 virtual bool recvTimingResp(PacketPtr pkt);
904 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
905 virtual void recvFunctional(PacketPtr pkt) { }
906 virtual void recvRangeChange() { }
907 virtual void recvReqRetry();
911 * the port intended to communicate between the CU and its LDS
913 class LDSPort : public RequestPort
916 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
917 : RequestPort(_name, _cu, _id), computeUnit(_cu)
921 bool isStalled() const { return stalled; }
922 void stallPort() { stalled = true; }
923 void unstallPort() { stalled = false; }
926 * here we queue all the requests that were
927 * not successfully sent.
929 std::queue<PacketPtr> retries;
932 * SenderState is information carried along with the packet, esp. the
935 class SenderState: public Packet::SenderState
938 // The actual read/write/atomic request that goes with this command
939 GPUDynInstPtr _gpuDynInst = nullptr;
942 SenderState(GPUDynInstPtr gpuDynInst):
943 _gpuDynInst(gpuDynInst)
955 sendTimingReq(PacketPtr pkt);
959 bool stalled = false; ///< whether or not it is stalled
961 ComputeUnit *computeUnit;
964 recvTimingResp(PacketPtr pkt);
967 recvAtomic(PacketPtr pkt) { return 0; }
970 recvFunctional(PacketPtr pkt)
983 /** The port to access the Local Data Store
984 * Can be connected to a LDS object
986 LDSPort *ldsPort = nullptr;
997 return memPortTokens;
1000 /** The memory port for SIMD data accesses.
1001 * Can be connected to PhysMem for Ruby for timing simulations
1003 std::vector<DataPort*> memPort;
1004 // port to the TLB hierarchy (i.e., the L1 TLB)
1005 std::vector<DTLBPort*> tlbPort;
1006 // port to the scalar data cache
1007 ScalarDataPort *scalarDataPort;
1008 // port to the scalar data TLB
1009 ScalarDTLBPort *scalarDTLBPort;
1010 // port to the SQC (i.e. the I-cache)
1012 // port to the SQC TLB (there's a separate TLB for each I-cache)
1013 ITLBPort *sqcTLBPort;
1016 getPort(const std::string &if_name, PortID idx) override
1018 if (if_name == "memory_port") {
1019 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
1021 return *memPort[idx];
1022 } else if (if_name == "translation_port") {
1023 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
1025 return *tlbPort[idx];
1026 } else if (if_name == "scalar_port") {
1027 scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
1029 return *scalarDataPort;
1030 } else if (if_name == "scalar_tlb_port") {
1031 scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
1033 return *scalarDTLBPort;
1034 } else if (if_name == "sqc_port") {
1035 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
1038 } else if (if_name == "sqc_tlb_port") {
1039 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
1041 } else if (if_name == "ldsPort") {
1043 fatal("an LDS port was already allocated");
1045 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
1047 } else if (if_name == "gmTokenPort") {
1050 panic("incorrect port name");
// Return the current global instruction sequence number, then advance it
// (post-increment: caller receives the pre-increment value).
1054 InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
1057 const int _cacheLineSize;
1058 const int _numBarrierSlots;
1060 InstSeqNum globalSeqNum;
1064 * TODO: Update these comments once the pipe stage interface has
1065 * been fully refactored.
1067 * Pipeline stage interfaces.
1069 * Buffers used to communicate between various pipeline stages
1070 * List of waves which will be dispatched to
1071 * each execution resource. An EXREADY implies
1072 * dispatch list is non-empty and
1073 * execution unit has something to execute
1074 * this cycle. Currently, the dispatch list of
1075 * an execution resource can hold only one wave because
1076 * an execution resource can execute only one wave in a cycle.
1077 * dispatchList is used to communicate between schedule
1080 * At a high level, the following intra-/inter-stage communication occurs:
1081 * SCB to SCH: readyList provides per exec resource list of waves that
1082 * passed dependency and readiness checks. If selected by
1083 * scheduler, attempt to add wave to schList conditional on
1085 * SCH: schList holds waves that are gathering operands or waiting
1086 * for execution resource availability. Once ready, waves are
1087 * placed on the dispatchList as candidates for execution. A wave
1088 * may spend multiple cycles in SCH stage, on the schList due to
1089 * RF access conflicts or execution resource contention.
1090 * SCH to EX: dispatchList holds waves that are ready to be executed.
1091 * LM/FLAT arbitration may remove an LM wave and place it
1092 * back on the schList. RF model may also force a wave back
1093 * to the schList if using the detailed model.
1095 ScoreboardCheckToSchedule scoreboardCheckToSchedule;
1096 ScheduleToExecute scheduleToExecute;
1099 * The barrier slots for this CU.
1101 std::vector<WFBarrier> wfBarrierSlots;
1103 * A set used to easily retrieve a free barrier ID.
1105 std::unordered_set<int> freeBarrierIds;
1107 // hold the time of the arrival of the first cache block related to
1108 // a particular GPUDynInst. This is used to calculate the difference
1109 // between the first and last cache block arrival times.
1110 std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
1113 #endif // __COMPUTE_UNIT_HH__