gpu-compute: update port terminology
[gem5.git] / src / gpu-compute / compute_unit.hh
1 /*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifndef __COMPUTE_UNIT_HH__
35 #define __COMPUTE_UNIT_HH__
36
37 #include <deque>
38 #include <map>
39 #include <unordered_set>
40 #include <vector>
41
42 #include "base/callback.hh"
43 #include "base/statistics.hh"
44 #include "base/types.hh"
45 #include "config/the_gpu_isa.hh"
46 #include "enums/PrefetchType.hh"
47 #include "gpu-compute/comm.hh"
48 #include "gpu-compute/exec_stage.hh"
49 #include "gpu-compute/fetch_stage.hh"
50 #include "gpu-compute/global_memory_pipeline.hh"
51 #include "gpu-compute/hsa_queue_entry.hh"
52 #include "gpu-compute/local_memory_pipeline.hh"
53 #include "gpu-compute/register_manager.hh"
54 #include "gpu-compute/scalar_memory_pipeline.hh"
55 #include "gpu-compute/schedule_stage.hh"
56 #include "gpu-compute/scoreboard_check_stage.hh"
57 #include "mem/port.hh"
58 #include "mem/token_port.hh"
59 #include "sim/clocked_object.hh"
60
61 class HSAQueueEntry;
62 class LdsChunk;
63 class ScalarRegisterFile;
64 class Shader;
65 class VectorRegisterFile;
66
67 struct ComputeUnitParams;
68
69 enum EXEC_POLICY
70 {
71 OLDEST = 0,
72 RR
73 };
74
75 enum TLB_CACHE
76 {
77 TLB_MISS_CACHE_MISS = 0,
78 TLB_MISS_CACHE_HIT,
79 TLB_HIT_CACHE_MISS,
80 TLB_HIT_CACHE_HIT
81 };
82
83 /**
84 * WF barrier slots. This represents the barrier resource for
85 * WF-level barriers (i.e., barriers to sync WFs within a WG).
86 */
87 class WFBarrier
88 {
89 public:
90 WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
91 {
92 }
93
94 static const int InvalidID = -1;
95
96 int
97 numAtBarrier() const
98 {
99 return _numAtBarrier;
100 }
101
102 /**
103 * Number of WFs that have not yet reached the barrier.
104 */
105 int
106 numYetToReachBarrier() const
107 {
108 return _maxBarrierCnt - _numAtBarrier;
109 }
110
111 int
112 maxBarrierCnt() const
113 {
114 return _maxBarrierCnt;
115 }
116
117 /**
118 * Set the maximum barrier count (i.e., the number of WFs that are
119 * participating in the barrier).
120 */
121 void
122 setMaxBarrierCnt(int max_barrier_cnt)
123 {
124 _maxBarrierCnt = max_barrier_cnt;
125 }
126
127 /**
128 * Mark that a WF has reached the barrier.
129 */
130 void
131 incNumAtBarrier()
132 {
133 assert(_numAtBarrier < _maxBarrierCnt);
134 ++_numAtBarrier;
135 }
136
137 /**
138 * Have all WFs participating in this barrier reached the barrier?
139 * If so, then the barrier is satisfied and WFs may proceed past
140 * the barrier.
141 */
142 bool
143 allAtBarrier() const
144 {
145 return _numAtBarrier == _maxBarrierCnt;
146 }
147
148 /**
149 * Decrement the number of WFs that are participating in this barrier.
150 * This should be called when a WF exits.
151 */
152 void
153 decMaxBarrierCnt()
154 {
155 assert(_maxBarrierCnt > 0);
156 --_maxBarrierCnt;
157 }
158
159 /**
160 * Release this barrier resource so it can be used by other WGs. This
161 * is generally called when a WG has finished.
162 */
163 void
164 release()
165 {
166 _numAtBarrier = 0;
167 _maxBarrierCnt = 0;
168 }
169
170 /**
171 * Reset the barrier. This is typically done when a dynamic instance
172 * of a barrier has been satisfied.
173 */
174 void
175 reset()
176 {
177 _numAtBarrier = 0;
178 }
179
180 private:
181 /**
182 * The number of WFs in the WG that have reached the barrier. Once
183 * the number of WFs that reach a barrier matches the number of WFs
184 * in the WG, the barrier is satisfied.
185 */
186 int _numAtBarrier;
187
188 /**
189 * The maximum number of WFs that can reach this barrier. This is
190 * essentially the number of WFs in the WG, and a barrier is satisfied
191 * when the number of WFs that reach the barrier equal this value. If
192 * a WF exits early it must decrement this value so that it is no
193 * longer considered for this barrier.
194 */
195 int _maxBarrierCnt;
196 };
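// Illustrative sketch (not part of the original header): the WFBarrier
// lifecycle implied by the comments above. The counts and call sites are
// hypothetical.
//
//     WFBarrier bar;
//     bar.setMaxBarrierCnt(4);      // 4 WFs in the WG participate
//
//     bar.incNumAtBarrier();        // called as each WF reaches the barrier
//     if (bar.allAtBarrier()) {
//         bar.reset();              // dynamic barrier instance satisfied;
//     }                             // WFs may proceed past the barrier
//
//     bar.decMaxBarrierCnt();       // a participating WF exited early
//     bar.release();                // WG finished; slot free for other WGs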
197
198 class ComputeUnit : public ClockedObject
199 {
200 public:
201
202
203 // Execution resources
204 //
205 // The ordering of units is:
206 // Vector ALUs
207 // Scalar ALUs
208 // GM Pipe
209 // LM Pipe
210 // Scalar Mem Pipe
211 //
212 // Note: the ordering of units is important and the code assumes the
213 // above ordering. However, there may be more than one resource of
214 // each type (e.g., 4 VALUs or 2 SALUs)
215
216 int numVectorGlobalMemUnits;
217 // Resource control for global memory to VRF data/address bus
218 WaitClass glbMemToVrfBus;
219 // Resource control for Vector Register File->Global Memory pipe buses
220 WaitClass vrfToGlobalMemPipeBus;
221 // Resource control for Vector Global Memory execution unit
222 WaitClass vectorGlobalMemUnit;
223
224 int numVectorSharedMemUnits;
225 // Resource control for local memory to VRF data/address bus
226 WaitClass locMemToVrfBus;
227 // Resource control for Vector Register File->Local Memory pipe buses
228 WaitClass vrfToLocalMemPipeBus;
229 // Resource control for Vector Shared/Local Memory execution unit
230 WaitClass vectorSharedMemUnit;
231
232 int numScalarMemUnits;
233 // Resource control for scalar memory to SRF data/address bus
234 WaitClass scalarMemToSrfBus;
235 // Resource control for Scalar Register File->Scalar Memory pipe buses
236 WaitClass srfToScalarMemPipeBus;
237 // Resource control for Scalar Memory execution unit
238 WaitClass scalarMemUnit;
239
240 // vector ALU execution resources
241 int numVectorALUs;
242 std::vector<WaitClass> vectorALUs;
243
244 // scalar ALU execution resources
245 int numScalarALUs;
246 std::vector<WaitClass> scalarALUs;
247
248 // Return total number of execution units on this CU
249 int numExeUnits() const;
250 // index into readyList of the first memory unit
251 int firstMemUnit() const;
252 // index into readyList of the last memory unit
253 int lastMemUnit() const;
254 // index into scalarALUs vector of SALU used by the wavefront
255 int mapWaveToScalarAlu(Wavefront *w) const;
256 // index into readyList of SALU used by wavefront
257 int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
258 // index into readyList of Global Memory unit used by wavefront
259 int mapWaveToGlobalMem(Wavefront *w) const;
260 // index into readyList of Local Memory unit used by wavefront
261 int mapWaveToLocalMem(Wavefront *w) const;
262 // index into readyList of Scalar Memory unit used by wavefront
263 int mapWaveToScalarMem(Wavefront *w) const;
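// Illustrative sketch (not from this header): one way the flat execution
// unit index can follow the ordering documented above (VALUs, then SALUs,
// then the GM, LM, and scalar memory pipes). The real implementations
// live in compute_unit.cc.
//
//     int numExeUnits() const
//     {
//         return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
//             numVectorSharedMemUnits + numScalarMemUnits;
//     }
//     int firstMemUnit() const { return numVectorALUs + numScalarALUs; }
//     int lastMemUnit() const  { return numExeUnits() - 1; }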
264
265 int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
266 int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
267 int numCyclesPerStoreTransfer; // number of cycles per vector store
268 int numCyclesPerLoadTransfer; // number of cycles per vector load
269
270 // track presence of dynamic instructions in the Schedule pipeline
271 // stage. This is used to check the readiness of the oldest,
272 // non-dispatched instruction of every WF in the Scoreboard stage.
273 std::unordered_set<uint64_t> pipeMap;
274
275 RegisterManager* registerManager;
276
277 FetchStage fetchStage;
278 ScoreboardCheckStage scoreboardCheckStage;
279 ScheduleStage scheduleStage;
280 ExecStage execStage;
281 GlobalMemPipeline globalMemoryPipe;
282 LocalMemPipeline localMemoryPipe;
283 ScalarMemPipeline scalarMemoryPipe;
284
285 EventFunctionWrapper tickEvent;
286
287 typedef ComputeUnitParams Params;
288 std::vector<std::vector<Wavefront*>> wfList;
289 int cu_id;
290
291 // array of vector register files, one per SIMD
292 std::vector<VectorRegisterFile*> vrf;
293 // array of scalar register files, one per SIMD
294 std::vector<ScalarRegisterFile*> srf;
295
296 // Width per VALU/SIMD unit: number of work items that can be executed
297 // on the vector ALU simultaneously in a SIMD unit
298 int simdWidth;
299 // number of pipe stages for bypassing data to next dependent single
300 // precision vector instruction inside the vector ALU pipeline
301 int spBypassPipeLength;
302 // number of pipe stages for bypassing data to next dependent double
303 // precision vector instruction inside the vector ALU pipeline
304 int dpBypassPipeLength;
305 // number of pipe stages for scalar ALU
306 int scalarPipeStages;
307 // number of pipe stages for operand collection & distribution network
308 int operandNetworkLength;
309 // number of cycles per instruction issue period
310 Cycles issuePeriod;
311
312 // VRF to GM Bus latency
313 Cycles vrf_gm_bus_latency;
314 // SRF to Scalar Mem Bus latency
315 Cycles srf_scm_bus_latency;
316 // VRF to LM Bus latency
317 Cycles vrf_lm_bus_latency;
318
319 // tracks the last cycle a vector instruction was executed on a SIMD
320 std::vector<uint64_t> lastExecCycle;
321
322 // Track the amount of interleaving between wavefronts on each SIMD.
323 // This stat is sampled using instExecPerSimd to compute the number of
324 // instructions that have been executed on a SIMD between a WF executing
325 // two successive instructions.
326 Stats::VectorDistribution instInterleave;
327
328 // tracks the number of dyn inst executed per SIMD
329 std::vector<uint64_t> instExecPerSimd;
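// Illustrative sketch (hypothetical; "lastInstExec" stands for a per-WF
// snapshot of instExecPerSimd taken at that WF's previous instruction):
// how the interleaving distribution could be sampled when a WF on SIMD
// "simdId" issues its next instruction.
//
//     instInterleave[simdId].sample(instExecPerSimd[simdId] - lastInstExec);
//     instExecPerSimd[simdId]++;
//     lastInstExec = instExecPerSimd[simdId];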
330
331 // true if we allow a separate TLB per lane
332 bool perLaneTLB;
333 // if 0, TLB prefetching is off.
334 int prefetchDepth;
335 // if fixed-stride prefetching, this is the stride.
336 int prefetchStride;
337
338 std::vector<Addr> lastVaddrCU;
339 std::vector<std::vector<Addr>> lastVaddrSimd;
340 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
341 Enums::PrefetchType prefetchType;
342 EXEC_POLICY exec_policy;
343
344 bool debugSegFault;
345 // Idle CU timeout in ticks
346 Tick idleCUTimeout;
347 int idleWfs;
348 bool functionalTLB;
349 bool localMemBarrier;
350
351 /*
352 * for Counting page accesses
353 */
354 bool countPages;
355
356 Shader *shader;
357
358 Tick req_tick_latency;
359 Tick resp_tick_latency;
360
361 /**
362 * Number of WFs to schedule to each SIMD. This vector is populated
363 * by hasDispResources(), and consumed by the subsequent call to
364 * dispWorkgroup(), to schedule the specified number of WFs to the
365 * SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
366 */
367 std::vector<int> numWfsToSched;
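// Illustrative sketch (dispatcher-side view, not from this file): the
// call order implied by the comment above. "task" and "num_wfs_in_wg"
// are hypothetical locals.
//
//     int num_wfs_in_wg = 0;
//     if (cu->hasDispResources(task, num_wfs_in_wg)) {
//         // hasDispResources() has populated numWfsToSched per SIMD
//         cu->dispWorkgroup(task, num_wfs_in_wg);
//     }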
368
369 // number of currently reserved vector registers per SIMD unit
370 std::vector<int> vectorRegsReserved;
371 // number of currently reserved scalar registers per SIMD unit
372 std::vector<int> scalarRegsReserved;
373 // number of vector registers per SIMD unit
374 int numVecRegsPerSimd;
375 // number of available scalar registers per SIMD unit
376 int numScalarRegsPerSimd;
377
378 // this hash map will keep track of page divergence
379 // per memory instruction per wavefront. The hash map
380 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
381 std::map<Addr, int> pagesTouched;
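// Illustrative sketch (assuming page-granular rounding via
// TheISA::PageBytes; the real logic lives in compute_unit.cc and
// gpu_dyn_inst.cc): each lane's address bumps a per-page counter, and the
// number of distinct pages touched by the instruction is later sampled
// into pageDivergenceDist.
//
//     void updatePageDivergenceDist(Addr addr)
//     {
//         Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
//         pagesTouched[virt_page_addr]++;
//     }
//
//     // in GPUDynInst::updateStats():
//     //     pageDivergenceDist.sample(pagesTouched.size());
//     //     pagesTouched.clear();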
382
383 void insertInPipeMap(Wavefront *w);
384 void deleteFromPipeMap(Wavefront *w);
385
386 ComputeUnit(const Params *p);
387 ~ComputeUnit();
388
389 // Timing Functions
390 int oprNetPipeLength() const { return operandNetworkLength; }
391 int simdUnitWidth() const { return simdWidth; }
392 int spBypassLength() const { return spBypassPipeLength; }
393 int dpBypassLength() const { return dpBypassPipeLength; }
394 int scalarPipeLength() const { return scalarPipeStages; }
395 int storeBusLength() const { return numCyclesPerStoreTransfer; }
396 int loadBusLength() const { return numCyclesPerLoadTransfer; }
397 int wfSize() const { return wavefrontSize; }
398
399 void exec();
400 void initiateFetch(Wavefront *wavefront);
401 void fetch(PacketPtr pkt, Wavefront *wavefront);
402 void fillKernelState(Wavefront *w, HSAQueueEntry *task);
403
404 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
405 HSAQueueEntry *task, int bar_id,
406 bool fetchContext=false);
407
408 void doInvalidate(RequestPtr req, int kernId);
409 void doFlush(GPUDynInstPtr gpuDynInst);
410
411 void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
412 bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
413
414 int cacheLineSize() const { return _cacheLineSize; }
415 int getCacheLineBits() const { return cacheLineBits; }
416
417 private:
418 WFBarrier&
419 barrierSlot(int bar_id)
420 {
421 assert(bar_id > WFBarrier::InvalidID);
422 return wfBarrierSlots.at(bar_id);
423 }
424
425 int
426 getFreeBarrierId()
427 {
428 assert(freeBarrierIds.size());
429 auto free_bar_id = freeBarrierIds.begin();
430 int bar_id = *free_bar_id;
431 freeBarrierIds.erase(free_bar_id);
432 return bar_id;
433 }
434
435 public:
436 int numYetToReachBarrier(int bar_id);
437 bool allAtBarrier(int bar_id);
438 void incNumAtBarrier(int bar_id);
439 int numAtBarrier(int bar_id);
440 int maxBarrierCnt(int bar_id);
441 void resetBarrier(int bar_id);
442 void decMaxBarrierCnt(int bar_id);
443 void releaseBarrier(int bar_id);
444 void releaseWFsFromBarrier(int bar_id);
445 int numBarrierSlots() const { return _numBarrierSlots; }
446
447 template<typename c0, typename c1>
448 void doSmReturn(GPUDynInstPtr gpuDynInst);
449
450 virtual void init() override;
451 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
452 void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
453 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
454 bool kernelMemSync,
455 RequestPtr req=nullptr);
456 void handleMemPacket(PacketPtr pkt, int memport_index);
457 bool processTimingPacket(PacketPtr pkt);
458 void processFetchReturn(PacketPtr pkt);
459 void updatePageDivergenceDist(Addr addr);
460
461 MasterID masterId() { return _masterId; }
462
463 bool isDone() const;
464 bool isVectorAluIdle(uint32_t simdId) const;
465
466 protected:
467 MasterID _masterId;
468
469 LdsState &lds;
470
471 public:
472 Stats::Scalar vALUInsts;
473 Stats::Formula vALUInstsPerWF;
474 Stats::Scalar sALUInsts;
475 Stats::Formula sALUInstsPerWF;
476 Stats::Scalar instCyclesVALU;
477 Stats::Scalar instCyclesSALU;
478 Stats::Scalar threadCyclesVALU;
479 Stats::Formula vALUUtilization;
480 Stats::Scalar ldsNoFlatInsts;
481 Stats::Formula ldsNoFlatInstsPerWF;
482 Stats::Scalar flatVMemInsts;
483 Stats::Formula flatVMemInstsPerWF;
484 Stats::Scalar flatLDSInsts;
485 Stats::Formula flatLDSInstsPerWF;
486 Stats::Scalar vectorMemWrites;
487 Stats::Formula vectorMemWritesPerWF;
488 Stats::Scalar vectorMemReads;
489 Stats::Formula vectorMemReadsPerWF;
490 Stats::Scalar scalarMemWrites;
491 Stats::Formula scalarMemWritesPerWF;
492 Stats::Scalar scalarMemReads;
493 Stats::Formula scalarMemReadsPerWF;
494
495 Stats::Formula vectorMemReadsPerKiloInst;
496 Stats::Formula vectorMemWritesPerKiloInst;
497 Stats::Formula vectorMemInstsPerKiloInst;
498 Stats::Formula scalarMemReadsPerKiloInst;
499 Stats::Formula scalarMemWritesPerKiloInst;
500 Stats::Formula scalarMemInstsPerKiloInst;
501
502 // Cycles required to send register source (addr and data) from
503 // register files to memory pipeline, per SIMD.
504 Stats::Vector instCyclesVMemPerSimd;
505 Stats::Vector instCyclesScMemPerSimd;
506 Stats::Vector instCyclesLdsPerSimd;
507
508 Stats::Scalar globalReads;
509 Stats::Scalar globalWrites;
510 Stats::Formula globalMemInsts;
511 Stats::Scalar argReads;
512 Stats::Scalar argWrites;
513 Stats::Formula argMemInsts;
514 Stats::Scalar spillReads;
515 Stats::Scalar spillWrites;
516 Stats::Formula spillMemInsts;
517 Stats::Scalar groupReads;
518 Stats::Scalar groupWrites;
519 Stats::Formula groupMemInsts;
520 Stats::Scalar privReads;
521 Stats::Scalar privWrites;
522 Stats::Formula privMemInsts;
523 Stats::Scalar readonlyReads;
524 Stats::Scalar readonlyWrites;
525 Stats::Formula readonlyMemInsts;
526 Stats::Scalar kernargReads;
527 Stats::Scalar kernargWrites;
528 Stats::Formula kernargMemInsts;
529
530 int activeWaves;
531 Stats::Distribution waveLevelParallelism;
532
533 void updateInstStats(GPUDynInstPtr gpuDynInst);
534
535 // the following stats compute the avg. TLB access latency per
536 // uncoalesced request (only for data)
537 Stats::Scalar tlbRequests;
538 Stats::Scalar tlbCycles;
539 Stats::Formula tlbLatency;
540 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
541 Stats::Vector hitsPerTLBLevel;
542
543 Stats::Scalar ldsBankAccesses;
544 Stats::Distribution ldsBankConflictDist;
545
546 // over all memory instructions executed over all wavefronts
547 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
548 Stats::Distribution pageDivergenceDist;
549 // count of non-flat global memory vector instructions executed
550 Stats::Scalar dynamicGMemInstrCnt;
551 // count of flat global memory vector instructions executed
552 Stats::Scalar dynamicFlatMemInstrCnt;
553 Stats::Scalar dynamicLMemInstrCnt;
554
555 Stats::Scalar wgBlockedDueBarrierAllocation;
556 Stats::Scalar wgBlockedDueLdsAllocation;
557 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
558 // active when the instruction is committed, this number is still
559 // incremented by 1
560 Stats::Scalar numInstrExecuted;
561 // Number of cycles among successive instruction executions across all
562 // wavefronts of the same CU
563 Stats::Distribution execRateDist;
564 // number of individual vector operations executed
565 Stats::Scalar numVecOpsExecuted;
566 // number of individual f16 vector operations executed
567 Stats::Scalar numVecOpsExecutedF16;
568 // number of individual f32 vector operations executed
569 Stats::Scalar numVecOpsExecutedF32;
570 // number of individual f64 vector operations executed
571 Stats::Scalar numVecOpsExecutedF64;
572 // number of individual FMA 16,32,64 vector operations executed
573 Stats::Scalar numVecOpsExecutedFMA16;
574 Stats::Scalar numVecOpsExecutedFMA32;
575 Stats::Scalar numVecOpsExecutedFMA64;
576 // number of individual MAC 16,32,64 vector operations executed
577 Stats::Scalar numVecOpsExecutedMAC16;
578 Stats::Scalar numVecOpsExecutedMAC32;
579 Stats::Scalar numVecOpsExecutedMAC64;
580 // number of individual MAD 16,32,64 vector operations executed
581 Stats::Scalar numVecOpsExecutedMAD16;
582 Stats::Scalar numVecOpsExecutedMAD32;
583 Stats::Scalar numVecOpsExecutedMAD64;
584 // total number of two op FP vector operations executed
585 Stats::Scalar numVecOpsExecutedTwoOpFP;
586 // Total cycles that something is running on the GPU
587 Stats::Scalar totalCycles;
588 Stats::Formula vpc; // vector ops per cycle
589 Stats::Formula vpc_f16; // vector ops per cycle
590 Stats::Formula vpc_f32; // vector ops per cycle
591 Stats::Formula vpc_f64; // vector ops per cycle
592 Stats::Formula ipc; // vector instructions per cycle
593 Stats::Distribution controlFlowDivergenceDist;
594 Stats::Distribution activeLanesPerGMemInstrDist;
595 Stats::Distribution activeLanesPerLMemInstrDist;
596 // number of vector ALU instructions received
597 Stats::Formula numALUInstsExecuted;
598 // number of times a WG can not start due to lack of free VGPRs in SIMDs
599 Stats::Scalar numTimesWgBlockedDueVgprAlloc;
600 // number of times a WG can not start due to lack of free SGPRs in SIMDs
601 Stats::Scalar numTimesWgBlockedDueSgprAlloc;
602 Stats::Scalar numCASOps;
603 Stats::Scalar numFailedCASOps;
604 Stats::Scalar completedWfs;
605 Stats::Scalar completedWGs;
606
607 // distribution in latency difference between first and last cache block
608 // arrival ticks
609 Stats::Distribution headTailLatency;
610
611 void
612 regStats() override;
613
614 LdsState &
615 getLds() const
616 {
617 return lds;
618 }
619
620 int32_t
621 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
622
623 bool
624 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
625
626 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
627 pageDataStruct pageAccesses;
628
629 void exitCallback();
630
631 class GMTokenPort : public TokenMasterPort
632 {
633 public:
634 GMTokenPort(const std::string& name, SimObject *owner,
635 PortID id = InvalidPortID)
636 : TokenMasterPort(name, owner, id)
637 { }
638 ~GMTokenPort() { }
639
640 protected:
641 bool recvTimingResp(PacketPtr) { return false; }
642 void recvReqRetry() { }
643 };
644
645 // Manager for the number of tokens available to this compute unit to
646 // send global memory request packets to the coalescer. This is only used
647 // between the global memory pipe and the TCP coalescer.
648 TokenManager *memPortTokens;
649 GMTokenPort gmTokenPort;
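// Illustrative sketch (assuming the TokenManager interface from
// mem/token_port.hh, e.g. haveTokens()/acquireTokens(); the flow shown is
// a simplification): the global memory pipe checks for a token before
// issuing a request, and the coalescer returns tokens as it drains them.
//
//     if (memPortTokens->haveTokens(1)) {
//         memPortTokens->acquireTokens(1);
//         // ... issue the request packet toward the coalescer ...
//     }
//     // the coalescer later returns tokens through gmTokenPort, crediting
//     // them back to memPortTokens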
650
651 /** Data access Port **/
652 class DataPort : public RequestPort
653 {
654 public:
655 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
656 : RequestPort(_name, _cu), computeUnit(_cu),
657 index(_index) { }
658
659 bool snoopRangeSent;
660
661 struct SenderState : public Packet::SenderState
662 {
663 GPUDynInstPtr _gpuDynInst;
664 int port_index;
665 Packet::SenderState *saved;
666
667 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
668 Packet::SenderState *sender_state=nullptr)
669 : _gpuDynInst(gpuDynInst),
670 port_index(_port_index),
671 saved(sender_state) { }
672 };
673
674 void processMemReqEvent(PacketPtr pkt);
675 EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
676
677 void processMemRespEvent(PacketPtr pkt);
678 EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
679
680 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
681
682 protected:
683 ComputeUnit *computeUnit;
684 int index;
685
686 virtual bool recvTimingResp(PacketPtr pkt);
687 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
688 virtual void recvFunctional(PacketPtr pkt) { }
689 virtual void recvRangeChange() { }
690 virtual void recvReqRetry();
691
692 virtual void
693 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
694 {
695 resp.clear();
696 snoop = true;
697 }
698
699 };
700
701 // Scalar data cache access port
702 class ScalarDataPort : public RequestPort
703 {
704 public:
705 ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
706 PortID _index)
707 : RequestPort(_name, _cu, _index), computeUnit(_cu), index(_index)
708 {
709 (void)index;
710 }
711
712 bool recvTimingResp(PacketPtr pkt) override;
713 void recvReqRetry() override;
714
715 struct SenderState : public Packet::SenderState
716 {
717 SenderState(GPUDynInstPtr gpuDynInst,
718 Packet::SenderState *sender_state=nullptr)
719 : _gpuDynInst(gpuDynInst), saved(sender_state)
720 {
721 }
722
723 GPUDynInstPtr _gpuDynInst;
724 Packet::SenderState *saved;
725 };
726
727 class MemReqEvent : public Event
728 {
729 private:
730 ScalarDataPort *scalarDataPort;
731 PacketPtr pkt;
732
733 public:
734 MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
735 : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
736 {
737 setFlags(Event::AutoDelete);
738 }
739
740 void process();
741 const char *description() const;
742 };
743
744 std::deque<PacketPtr> retries;
745
746 private:
747 ComputeUnit *computeUnit;
748 PortID index;
749 };
750
751 // Instruction cache access port
752 class SQCPort : public RequestPort
753 {
754 public:
755 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
756 : RequestPort(_name, _cu), computeUnit(_cu),
757 index(_index) { }
758
759 bool snoopRangeSent;
760
761 struct SenderState : public Packet::SenderState
762 {
763 Wavefront *wavefront;
764 Packet::SenderState *saved;
765 // kernel id to be used in handling I-Cache invalidate response
766 int kernId;
767
768 SenderState(Wavefront *_wavefront, Packet::SenderState
769 *sender_state=nullptr, int _kernId=-1)
770 : wavefront(_wavefront), saved(sender_state),
771 kernId(_kernId){ }
772 };
773
774 std::deque<std::pair<PacketPtr, Wavefront*>> retries;
775
776 protected:
777 ComputeUnit *computeUnit;
778 int index;
779
780 virtual bool recvTimingResp(PacketPtr pkt);
781 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
782 virtual void recvFunctional(PacketPtr pkt) { }
783 virtual void recvRangeChange() { }
784 virtual void recvReqRetry();
785
786 virtual void
787 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
788 {
789 resp.clear();
790 snoop = true;
791 }
792 };
793
794 /** Data TLB port **/
795 class DTLBPort : public RequestPort
796 {
797 public:
798 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
799 : RequestPort(_name, _cu), computeUnit(_cu),
800 index(_index), stalled(false)
801 { }
802
803 bool isStalled() { return stalled; }
804 void stallPort() { stalled = true; }
805 void unstallPort() { stalled = false; }
806
807 /**
808 * here we queue all the translation requests that were
809 * not successfully sent.
810 */
811 std::deque<PacketPtr> retries;
812
813 /** SenderState is information carried along with the packet
814 * throughout the TLB hierarchy
815 */
816 struct SenderState: public Packet::SenderState
817 {
818 // the memInst that this is associated with
819 GPUDynInstPtr _gpuDynInst;
820
821 // the lane in the memInst this is associated with, so we send
822 // the memory request down the right port
823 int portIndex;
824
825 // constructor used for packets involved in timing accesses
826 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
827 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
828
829 };
830
831 protected:
832 ComputeUnit *computeUnit;
833 int index;
834 bool stalled;
835
836 virtual bool recvTimingResp(PacketPtr pkt);
837 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
838 virtual void recvFunctional(PacketPtr pkt) { }
839 virtual void recvRangeChange() { }
840 virtual void recvReqRetry();
841 };
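// Illustrative sketch (not from this file): the stall/retry protocol the
// D-TLB port members above support. "idx" and "pkt" are hypothetical.
//
//     if (!tlbPort[idx]->sendTimingReq(pkt)) {
//         tlbPort[idx]->stallPort();
//         tlbPort[idx]->retries.push_back(pkt);
//     }
//     // DTLBPort::recvReqRetry() later unstalls the port and replays the
//     // queued packets from the retries deque.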
842
843 class ScalarDTLBPort : public RequestPort
844 {
845 public:
846 ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
847 : RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
848 {
849 }
850
851 struct SenderState : public Packet::SenderState
852 {
853 SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
854 GPUDynInstPtr _gpuDynInst;
855 };
856
857 bool recvTimingResp(PacketPtr pkt) override;
858 void recvReqRetry() override { assert(false); }
859
860 bool isStalled() const { return stalled; }
861 void stallPort() { stalled = true; }
862 void unstallPort() { stalled = false; }
863
864 std::deque<PacketPtr> retries;
865
866 private:
867 ComputeUnit *computeUnit;
868 bool stalled;
869 };
870
871 class ITLBPort : public RequestPort
872 {
873 public:
874 ITLBPort(const std::string &_name, ComputeUnit *_cu)
875 : RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }
876
877
878 bool isStalled() { return stalled; }
879 void stallPort() { stalled = true; }
880 void unstallPort() { stalled = false; }
881
882 /**
883 * here we queue all the translation requests that were
884 * not successfully sent.
885 */
886 std::deque<PacketPtr> retries;
887
888 /** SenderState is information carried along with the packet
889 * throughout the TLB hierarchy
890 */
891 struct SenderState: public Packet::SenderState
892 {
893 // The wavefront associated with this request
894 Wavefront *wavefront;
895
896 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
897 };
898
899 protected:
900 ComputeUnit *computeUnit;
901 bool stalled;
902
903 virtual bool recvTimingResp(PacketPtr pkt);
904 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
905 virtual void recvFunctional(PacketPtr pkt) { }
906 virtual void recvRangeChange() { }
907 virtual void recvReqRetry();
908 };
909
910 /**
911 * the port intended to communicate between the CU and its LDS
912 */
913 class LDSPort : public RequestPort
914 {
915 public:
916 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
917 : RequestPort(_name, _cu, _id), computeUnit(_cu)
918 {
919 }
920
921 bool isStalled() const { return stalled; }
922 void stallPort() { stalled = true; }
923 void unstallPort() { stalled = false; }
924
925 /**
926 * here we queue all the requests that were
927 * not successfully sent.
928 */
929 std::queue<PacketPtr> retries;
930
931 /**
932 * SenderState is information carried along with the packet, esp. the
933 * GPUDynInstPtr
934 */
935 class SenderState: public Packet::SenderState
936 {
937 protected:
938 // The actual read/write/atomic request that goes with this command
939 GPUDynInstPtr _gpuDynInst = nullptr;
940
941 public:
942 SenderState(GPUDynInstPtr gpuDynInst):
943 _gpuDynInst(gpuDynInst)
944 {
945 }
946
947 GPUDynInstPtr
948 getMemInst() const
949 {
950 return _gpuDynInst;
951 }
952 };
953
954 virtual bool
955 sendTimingReq(PacketPtr pkt);
956
957 protected:
958
959 bool stalled = false; ///< whether or not it is stalled
960
961 ComputeUnit *computeUnit;
962
963 virtual bool
964 recvTimingResp(PacketPtr pkt);
965
966 virtual Tick
967 recvAtomic(PacketPtr pkt) { return 0; }
968
969 virtual void
970 recvFunctional(PacketPtr pkt)
971 {
972 }
973
974 virtual void
975 recvRangeChange()
976 {
977 }
978
979 virtual void
980 recvReqRetry();
981 };
982
983 /** The port to access the Local Data Store
984 * Can be connected to a LDS object
985 */
986 LDSPort *ldsPort = nullptr;
987
988 LDSPort *
989 getLdsPort() const
990 {
991 return ldsPort;
992 }
993
994 TokenManager *
995 getTokenManager()
996 {
997 return memPortTokens;
998 }
999
1000 /** The memory port for SIMD data accesses.
1001 * Can be connected to PhysMem or Ruby for timing simulations
1002 */
1003 std::vector<DataPort*> memPort;
1004 // port to the TLB hierarchy (i.e., the L1 TLB)
1005 std::vector<DTLBPort*> tlbPort;
1006 // port to the scalar data cache
1007 ScalarDataPort *scalarDataPort;
1008 // port to the scalar data TLB
1009 ScalarDTLBPort *scalarDTLBPort;
1010 // port to the SQC (i.e. the I-cache)
1011 SQCPort *sqcPort;
1012 // port to the SQC TLB (there's a separate TLB for each I-cache)
1013 ITLBPort *sqcTLBPort;
1014
1015 Port &
1016 getPort(const std::string &if_name, PortID idx) override
1017 {
1018 if (if_name == "memory_port") {
1019 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
1020 this, idx);
1021 return *memPort[idx];
1022 } else if (if_name == "translation_port") {
1023 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
1024 this, idx);
1025 return *tlbPort[idx];
1026 } else if (if_name == "scalar_port") {
1027 scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
1028 idx), this, idx);
1029 return *scalarDataPort;
1030 } else if (if_name == "scalar_tlb_port") {
1031 scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
1032 this);
1033 return *scalarDTLBPort;
1034 } else if (if_name == "sqc_port") {
1035 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
1036 this, idx);
1037 return *sqcPort;
1038 } else if (if_name == "sqc_tlb_port") {
1039 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
1040 return *sqcTLBPort;
1041 } else if (if_name == "ldsPort") {
1042 if (ldsPort) {
1043 fatal("an LDS port was already allocated");
1044 }
1045 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
1046 return *ldsPort;
1047 } else if (if_name == "gmTokenPort") {
1048 return gmTokenPort;
1049 } else {
1050 panic("incorrect port name");
1051 }
1052 }
1053
1054 InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
1055
1056 private:
1057 const int _cacheLineSize;
1058 const int _numBarrierSlots;
1059 int cacheLineBits;
1060 InstSeqNum globalSeqNum;
1061 int wavefrontSize;
1062
1063 /**
1064 * TODO: Update these comments once the pipe stage interface has
1065 * been fully refactored.
1066 *
1067 * Pipeline stage interfaces.
1068 *
1069 * Buffers used to communicate between various pipeline stages:
1070 * a list of waves that will be dispatched to
1071 * each execution resource. An EXREADY state implies the
1072 * dispatch list is non-empty and the
1073 * execution unit has something to execute
1074 * this cycle. Currently, the dispatch list of
1075 * an execution resource can hold only one wave because
1076 * an execution resource can execute only one wave per cycle.
1077 * dispatchList is used to communicate between the schedule
1078 * and exec stages.
1079 *
1080 * At a high level, the following intra-/inter-stage communication occurs:
1081 * SCB to SCH: readyList provides per exec resource list of waves that
1082 * passed dependency and readiness checks. If selected by
1083 * scheduler, attempt to add wave to schList conditional on
1084 * RF support.
1085 * SCH: schList holds waves that are gathering operands or waiting
1086 * for execution resource availability. Once ready, waves are
1087 * placed on the dispatchList as candidates for execution. A wave
1088 * may spend multiple cycles in SCH stage, on the schList due to
1089 * RF access conflicts or execution resource contention.
1090 * SCH to EX: dispatchList holds waves that are ready to be executed.
1091 * LM/FLAT arbitration may remove an LM wave and place it
1092 * back on the schList. RF model may also force a wave back
1093 * to the schList if using the detailed model.
1094 */
1095 ScoreboardCheckToSchedule scoreboardCheckToSchedule;
1096 ScheduleToExecute scheduleToExecute;
1097
1098 /**
1099 * The barrier slots for this CU.
1100 */
1101 std::vector<WFBarrier> wfBarrierSlots;
1102 /**
1103 * A set used to easily retrieve a free barrier ID.
1104 */
1105 std::unordered_set<int> freeBarrierIds;
1106
1107 // hold the time of the arrival of the first cache block related to
1108 // a particular GPUDynInst. This is used to calculate the difference
1109 // between the first and last cache block arrival times.
1110 std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
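// Illustrative sketch (not from this file; "all blocks arrived" stands in
// for a hypothetical completion check): record the first block's arrival
// tick, then sample the head-tail spread when the last block for the
// instruction arrives.
//
//     auto it = headTailMap.find(gpuDynInst);
//     if (it == headTailMap.end()) {
//         headTailMap[gpuDynInst] = curTick();   // first block
//     } else if (/* all blocks arrived */) {
//         headTailLatency.sample(curTick() - it->second);
//         headTailMap.erase(it);
//     }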
1111 };
1112
1113 #endif // __COMPUTE_UNIT_HH__