/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/compute_unit.hh"

#include <limits>

#include "arch/x86/isa_traits.hh"
#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"
#include "sim/sim_exit.hh"

ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
    numVectorGlobalMemUnits(p->num_global_mem_pipes),
    numVectorSharedMemUnits(p->num_shared_mem_pipes),
    numScalarMemUnits(p->num_scalar_mem_pipes),
    numVectorALUs(p->num_SIMDs),
    numScalarALUs(p->num_scalar_cores),
    vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
    registerManager(p->register_manager),
    fetchStage(p, *this),
    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
    execStage(p, *this, scheduleToExecute),
    globalMemoryPipe(p, *this),
    localMemoryPipe(p, *this),
    scalarMemoryPipe(p, *this),
    tickEvent([this]{ exec(); }, "Compute unit tick event",
              false, Event::CPU_Tick_Pri),
    cu_id(p->cu_id),
    vrf(p->vector_register_file), srf(p->scalar_register_file),
    simdWidth(p->simd_width),
    spBypassPipeLength(p->spbypass_pipe_length),
    dpBypassPipeLength(p->dpbypass_pipe_length),
    scalarPipeStages(p->scalar_pipe_length),
    operandNetworkLength(p->operand_network_length),
    issuePeriod(p->issue_period),
    vrf_gm_bus_latency(p->vrf_gm_bus_latency),
    srf_scm_bus_latency(p->srf_scm_bus_latency),
    vrf_lm_bus_latency(p->vrf_lm_bus_latency),
    perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
    prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
    debugSegFault(p->debugSegFault),
    functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
    countPages(p->countPages),
    req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
    resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
    _requestorId(p->system->getRequestorId(this, "ComputeUnit")),
    lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    ldsPort(csprintf("%s-port", name()), this),
    scalarDataPort(csprintf("%s-port", name()), this),
    scalarDTLBPort(csprintf("%s-port", name()), this),
    sqcPort(csprintf("%s-port", name()), this),
    sqcTLBPort(csprintf("%s-port", name()), this),
    _cacheLineSize(p->system->cacheLineSize()),
    _numBarrierSlots(p->num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p->wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p)
{
    /**
     * This check is necessary because std::bitset only provides conversion
     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
     * There are a few places in the code where to_ullong() is used, however
     * if wavefrontSize is larger than a value the host can support then
     * bitset will throw a runtime exception. We should remove all use of
     * to_ulong() or to_ullong() so we can have wavefrontSize greater than
     * 64b, however until that is done this assert is required.
     */
    fatal_if(p->wf_size > std::numeric_limits<unsigned long long>::digits ||
             p->wf_size <= 0,
             "WF size is larger than the host can support");
    fatal_if(!isPowerOf2(wavefrontSize),
             "Wavefront size should be a power of 2");
    // calculate how many cycles a vector load or store will need to transfer
    // its data over the corresponding buses
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
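
    // A worked example, assuming a 64-lane wavefront and a 32-byte
    // vrf_to_coalescer_bus_width (illustrative values, not the defaults of
    // any particular config): 64 lanes * 4 bytes = 256 bytes per full-wave
    // access, so ceil(256 / 32) = 8 cycles per store transfer. The load
    // path divides the same 256 bytes by coalescerToVrfBusWidth.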

    // Initialization: all WF slots are assumed STOPPED
    idleWfs = p->n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p->num_barrier_slots, WFBarrier());

    for (int i = 0; i < p->num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    }

    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p->n_wf);

        for (int i = 0; i < p->n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }

    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());

    lds.setParent(this);

    if (p->execPolicy == "OLDEST-FIRST") {
        exec_policy = EXEC_POLICY::OLDEST;
    } else if (p->execPolicy == "ROUND-ROBIN") {
        exec_policy = EXEC_POLICY::RR;
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }

    for (int i = 0; i < p->port_memory_port_connection_count; ++i) {
        memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    for (int i = 0; i < p->port_translation_port_connection_count; ++i) {
        tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    // Setup tokens for response ports. The number of tokens in memPortTokens
    // is the total token count for the entire vector port (i.e., this CU).
    memPortTokens = new TokenManager(p->max_cu_tokens);

    registerExitCallback([this]() { exitCallback(); });

    lastExecCycle.resize(numVectorALUs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }
    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
    }
    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    activeWaves = 0;

    instExecPerSimd.resize(numVectorALUs, 0);

    // Calculate the number of bits to address a cache line
    panic_if(!isPowerOf2(_cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(_cacheLineSize);
}

ComputeUnit::~ComputeUnit()
{
    // Delete wavefront slots
    for (int j = 0; j < numVectorALUs; ++j) {
        for (int i = 0; i < shader->n_wf; ++i) {
            delete wfList[j][i];
        }
        lastVaddrSimd[j].clear();
    }
    lastVaddrCU.clear();
}

int
ComputeUnit::numExeUnits() const
{
    return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
        numVectorSharedMemUnits + numScalarMemUnits;
}

// index into readyList of the first memory unit
int
ComputeUnit::firstMemUnit() const
{
    return numVectorALUs + numScalarALUs;
}

// index into readyList of the last memory unit
int
ComputeUnit::lastMemUnit() const
{
    return numExeUnits() - 1;
}

// index into scalarALUs vector of SALU used by the wavefront
int
ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
{
    if (numScalarALUs == 1) {
        return 0;
    } else {
        return w->simdId % numScalarALUs;
    }
}

// index into readyList of Scalar ALU unit used by wavefront
int
ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
{
    return numVectorALUs + mapWaveToScalarAlu(w);
}

// index into readyList of Global Memory unit used by wavefront
int
ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 GM pipe supported
    return numVectorALUs + numScalarALUs;
}

// index into readyList of Local Memory unit used by wavefront
int
ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 LM pipe supported
    return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
}

// index into readyList of Scalar Memory unit used by wavefront
int
ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 ScM pipe supported
    return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
        numVectorSharedMemUnits;
}
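
// A summary of the execution-unit indexing implied by the mapping
// functions above (inferred from the offsets they return):
//   [0, numVectorALUs)                        vector ALUs, one per SIMD
//   [numVectorALUs, firstMemUnit())           scalar ALUs
//   firstMemUnit()                            global memory pipeline
//   firstMemUnit() + numVectorGlobalMemUnits  local (LDS) memory pipeline
//   lastMemUnit()                             scalar memory pipeline
// With the single-pipe-per-class restriction enforced in init(), the three
// memory units occupy consecutive slots ending at numExeUnits() - 1.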

void
ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
{
    w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
    w->gridSz[0] = task->gridSize(0);
    w->gridSz[1] = task->gridSize(1);
    w->gridSz[2] = task->gridSize(2);
    w->computeActualWgSz(task);
}

void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                            HSAQueueEntry *task, int bar_id, bool fetchContext)
{
    static int _n_wave = 0;

    VectorMask init_mask;
    init_mask.reset();

    for (int k = 0; k < wfSize(); ++k) {
        if (k + waveId * wfSize() < w->actualWgSzTotal)
            init_mask[k] = 1;
    }

    w->execMask() = init_mask;

    w->kernId = task->dispatchId();
    w->wfId = waveId;
    w->initMask = init_mask.to_ullong();

    if (bar_id > WFBarrier::InvalidID) {
        w->barrierId(bar_id);
    } else {
        assert(!w->hasBarrier());
    }

    for (int k = 0; k < wfSize(); ++k) {
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
            w->actualWgSz[1];
        w->workItemId[2][k] = (k + waveId * wfSize()) /
            (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
            w->workItemId[0][k];
    }
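
    // A worked example of the decomposition above, assuming an actual WG
    // size of 16x8x2 with 64-lane waves: lane k of wave waveId covers the
    // flattened work-item n = k + waveId * 64, whose coordinates are
    // x = n % 16, y = (n / 16) % 8, z = n / (16 * 8); workItemFlatId then
    // recombines them as z * (16 * 8) + y * 16 + x == n.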

    w->wgId = task->globalWgId();
    w->dispatchId = task->dispatchId();
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));

    // set the wavefront context to have a pointer to this section of the LDS
    w->ldsChunk = ldsChunk;

    int32_t refCount M5_VAR_USED =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();

    if (w->pendingFetch)
        w->dropFetch = true;

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);
    w->start(_n_wave++, task->codeAddr());

    waveLevelParallelism.sample(activeWaves);
    activeWaves++;
}

/**
 * trigger invalidate operation in the CU
 *
 * req: request initialized in shader, carrying the invalidate flags
 */
void
ComputeUnit::doInvalidate(RequestPtr req, int kernId){
    GPUDynInstPtr gpuDynInst
        = std::make_shared<GPUDynInst>(this, nullptr,
            new KernelLaunchStaticInst(), getAndIncSeqNum());

    // kern_id will be used in inv responses
    gpuDynInst->kern_id = kernId;
    // update contextId field
    req->setContext(gpuDynInst->wfDynId);

    injectGlobalMemFence(gpuDynInst, true, req);
}

/**
 * trigger flush operation in the CU
 *
 * gpuDynInst: inst passed to the request
 */
void
ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
    injectGlobalMemFence(gpuDynInst, true);
}

void
ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
{
    // If we aren't ticking, start it up!
    if (!tickEvent.scheduled()) {
        DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
        schedule(tickEvent, nextCycle());
    }

    // the kernel's invalidate must have finished before any wg dispatch
    assert(task->isInvDone());

    // reserve the LDS capacity allocated to the work group
    // disambiguated by the dispatch ID and workgroup ID, which should be
    // globally unique
    LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
                                          task->globalWgId(),
                                          task->ldsSize());

    panic_if(!ldsChunk, "was not able to reserve space for this WG");

    // calculate the number of 32-bit vector registers required
    // by each work item
    int vregDemand = task->numVectorRegs();
    int sregDemand = task->numScalarRegs();
    int wave_id = 0;

    int barrier_id = WFBarrier::InvalidID;

    /**
     * If this WG only has one WF it will not consume any barrier
     * resources because it has no need of them.
     */
    if (num_wfs_in_wg > 1) {
        /**
         * Find a free barrier slot for this WG. Each WF in the WG will
         * receive the same barrier ID.
         */
        barrier_id = getFreeBarrierId();
        auto &wf_barrier = barrierSlot(barrier_id);
        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,
                num_wfs_in_wg);
    }

    // Assign WFs according to numWfsToSched vector, which is computed by
    // hasDispResources()
    for (int j = 0; j < shader->n_wf; ++j) {
        for (int i = 0; i < numVectorALUs; ++i) {
            Wavefront *w = wfList[i][j];
            // Check if this wavefront slot is available and there are WFs
            // remaining to be dispatched to current SIMD:
            // WF slot must be stopped and not waiting
            // for a release to complete S_RETURNING
            if (w->getStatus() == Wavefront::S_STOPPED &&
                numWfsToSched[i] > 0) {
                // decrement number of WFs awaiting dispatch to current SIMD
                numWfsToSched[i] -= 1;

                fillKernelState(w, task);

                DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
                        "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
                        vregDemand, sregDemand);

                registerManager->allocateRegisters(w, vregDemand, sregDemand);

                startWavefront(w, wave_id, ldsChunk, task, barrier_id);
                ++wave_id;
            }
        }
    }
}

void
ComputeUnit::insertInPipeMap(Wavefront *w)
{
    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);
    GPUDynInstPtr ii = w->instructionBuffer.front();
    pipeMap.emplace(ii->seqNum());
}

void
ComputeUnit::deleteFromPipeMap(Wavefront *w)
{
    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);
    GPUDynInstPtr ii = w->instructionBuffer.front();
    // delete the dynamic instruction from the pipeline map
    auto it = pipeMap.find(ii->seqNum());
    panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
    pipeMap.erase(it);
}

bool
ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
{
    // compute true size of workgroup (after clamping to grid size)
    int trueWgSize[HSAQueueEntry::MAX_DIM];
    int trueWgSizeTotal = 1;

    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
                                 task->wgId(d) * task->wgSize(d));

        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    // calculate the number of WFs in this WG
    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;
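
    // The expression above is a ceiling division: e.g., a clamped WG of
    // 200 work items with a 64-wide wavefront needs (200 + 63) / 64 = 4
    // WFs, the last of which runs partially masked.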

    bool barrier_avail = true;

    if (numWfs > 1 && !freeBarrierIds.size()) {
        barrier_avail = false;
    }

    // calculate the number of 32-bit vector registers required by each
    // work item of the work group
    int vregDemandPerWI = task->numVectorRegs();
    // calculate the number of 32-bit scalar registers required by each
    // work item of the work group
    int sregDemandPerWI = task->numScalarRegs();

    // check if the total number of VGPRs and SGPRs required by all WFs
    // of the WG fit in the VRFs of all SIMD units and the CU's SRF
    panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
             "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
             "that has %d VGPRs\n",
             numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
    panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
             "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
             "with %d SGPRs\n",
             numWfs, sregDemandPerWI, numScalarRegsPerSimd);

    // number of WF slots that are not occupied
    int freeWfSlots = 0;
    // number of Wfs from WG that were successfully mapped to a SIMD
    int numMappedWfs = 0;
    numWfsToSched.clear();
    numWfsToSched.resize(numVectorALUs, 0);

    // attempt to map WFs to the SIMDs, based on WF slot availability
    // and register file availability
    for (int j = 0; j < shader->n_wf; ++j) {
        for (int i = 0; i < numVectorALUs; ++i) {
            if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
                ++freeWfSlots;
                // check if current WF will fit onto current SIMD/VRF
                // if all WFs have not yet been mapped to the SIMDs
                if (numMappedWfs < numWfs &&
                    registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
                                                      sregDemandPerWI) &&
                    registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
                                                      vregDemandPerWI)) {
                    numWfsToSched[i]++;
                    numMappedWfs++;
                }
            }
        }
    }

    // check that the number of mapped WFs is not greater
    // than the actual number of WFs
    assert(numMappedWfs <= numWfs);

    bool vregAvail = true;
    bool sregAvail = true;
    // if a WF to SIMD mapping was not found, find the limiting resource
    if (numMappedWfs < numWfs) {
        for (int j = 0; j < numVectorALUs; ++j) {
            // find if there are enough free VGPRs in the SIMD's VRF
            // to accommodate the WFs of the new WG that would be mapped
            // to this SIMD unit
            vregAvail &= registerManager->
                canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
            // find if there are enough free SGPRs in the SIMD's SRF
            // to accommodate the WFs of the new WG that would be mapped
            // to this SIMD unit
            sregAvail &= registerManager->
                canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
        }
    }

    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
            VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);

    if (!vregAvail) {
        ++numTimesWgBlockedDueVgprAlloc;
    }

    if (!sregAvail) {
        ++numTimesWgBlockedDueSgprAlloc;
    }

    // Return true if enough WF slots to submit workgroup and if there are
    // enough VGPRs to schedule all WFs to their SIMD units
    bool ldsAvail = lds.canReserve(task->ldsSize());
    if (!ldsAvail) {
        wgBlockedDueLdsAllocation++;
    }

    if (!barrier_avail) {
        wgBlockedDueBarrierAllocation++;
    }

    // Return true if the following are all true:
    // (a) all WFs of the WG were mapped to free WF slots
    // (b) there are enough VGPRs to schedule all WFs to their SIMD units
    // (c) there are enough SGPRs on the CU to schedule all WFs
    // (d) there is enough space in LDS to allocate for all WFs
    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
    return can_dispatch;
}

int
ComputeUnit::numYetToReachBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.numYetToReachBarrier();
}

bool
ComputeUnit::allAtBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.allAtBarrier();
}

void
ComputeUnit::incNumAtBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.incNumAtBarrier();
}

int
ComputeUnit::numAtBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.numAtBarrier();
}

int
ComputeUnit::maxBarrierCnt(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.maxBarrierCnt();
}

void
ComputeUnit::resetBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.reset();
}

void
ComputeUnit::decMaxBarrierCnt(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.decMaxBarrierCnt();
}

void
ComputeUnit::releaseBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.release();
    freeBarrierIds.insert(bar_id);
}

void
ComputeUnit::releaseWFsFromBarrier(int bar_id)
{
    for (int i = 0; i < numVectorALUs; ++i) {
        for (int j = 0; j < shader->n_wf; ++j) {
            Wavefront *wf = wfList[i][j];
            if (wf->barrierId() == bar_id) {
                assert(wf->getStatus() == Wavefront::S_BARRIER);
                wf->setStatus(Wavefront::S_RUNNING);
            }
        }
    }
}
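
// A sketch of the barrier-slot lifecycle implied by the helpers above:
// dispWorkgroup() claims a free barrier ID for any multi-WF workgroup and
// sets the expected wave count via setMaxBarrierCnt(); incNumAtBarrier()
// is invoked as each WF arrives; once allAtBarrier() holds,
// releaseWFsFromBarrier() returns the waves to S_RUNNING; and
// releaseBarrier() recycles the ID into freeBarrierIds when the workgroup
// finishes.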

// Execute one clock worth of work on the ComputeUnit.
void
ComputeUnit::exec()
{
    // process reads and writes in the RFs
    for (auto &vecRegFile : vrf) {
        vecRegFile->exec();
    }

    for (auto &scRegFile : srf) {
        scRegFile->exec();
    }

    // Execute pipeline stages in reverse order to simulate
    // the pipeline latency
    scalarMemoryPipe.exec();
    globalMemoryPipe.exec();
    localMemoryPipe.exec();
    execStage.exec();
    scheduleStage.exec();
    scoreboardCheckStage.exec();
    fetchStage.exec();

    totalCycles++;

    // Put this CU to sleep if there is no more work to be done.
    if (!isDone()) {
        schedule(tickEvent, nextCycle());
    } else {
        shader->notifyCuSleep();
        DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
    }
}
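
// Note on the ordering in exec(): running the stages back-to-front within
// a single tick lets each stage consume the state its upstream neighbor
// produced on the previous cycle, modeling one cycle of latency per stage
// without explicit pipeline registers between every pair of stages.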

void
ComputeUnit::init()
{
    // Initialize CU Bus models and execution resources

    // Vector ALUs
    vectorALUs.clear();
    for (int i = 0; i < numVectorALUs; i++) {
        vectorALUs.emplace_back(this, clockPeriod());
    }

    // Scalar ALUs
    scalarALUs.clear();
    for (int i = 0; i < numScalarALUs; i++) {
        scalarALUs.emplace_back(this, clockPeriod());
    }

    // Vector Global Memory
    fatal_if(numVectorGlobalMemUnits > 1,
             "No support for multiple Global Memory Pipelines exists!!!");
    vectorGlobalMemUnit.init(this, clockPeriod());
    vrfToGlobalMemPipeBus.init(this, clockPeriod());
    glbMemToVrfBus.init(this, clockPeriod());

    // Vector Local/Shared Memory
    fatal_if(numVectorSharedMemUnits > 1,
             "No support for multiple Local Memory Pipelines exists!!!");
    vectorSharedMemUnit.init(this, clockPeriod());
    vrfToLocalMemPipeBus.init(this, clockPeriod());
    locMemToVrfBus.init(this, clockPeriod());

    // Scalar Memory
    fatal_if(numScalarMemUnits > 1,
             "No support for multiple Scalar Memory Pipelines exists!!!");
    scalarMemUnit.init(this, clockPeriod());
    srfToScalarMemPipeBus.init(this, clockPeriod());
    scalarMemToSrfBus.init(this, clockPeriod());

    vectorRegsReserved.resize(numVectorALUs, 0);
    scalarRegsReserved.resize(numVectorALUs, 0);

    fetchStage.init();
    scheduleStage.init();
    execStage.init();
    globalMemoryPipe.init();

    gmTokenPort.setTokenManager(memPortTokens);
}

bool
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
{
    // Ruby has completed the memory op. Schedule the mem_resp_event at the
    // appropriate cycle to process the timing memory response
    // This delay represents the pipeline delay
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    PortID index = sender_state->port_index;
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();

    // MemSyncResp + WriteAckResp are handled completely here and we don't
    // schedule a MemRespEvent to process the responses further
    if (pkt->cmd == MemCmd::MemSyncResp) {
        // This response is for 1 of the following request types:
        //  - kernel launch
        //  - kernel end
        //  - non-kernel mem sync

        // Kernel Launch
        // wavefront was nullptr when launching kernel, so it is meaningless
        // here (simdId=-1, wfSlotId=-1)
        if (gpuDynInst->isKernelLaunch()) {
            // for kernel launch, the original request must be both
            // kernel-type and acquire
            assert(pkt->req->isKernel());
            assert(pkt->req->isAcquire());

            // one D-Cache inv is done, decrement counter
            dispatcher.updateInvCounter(gpuDynInst->kern_id);

            delete pkt->senderState;
            delete pkt;
            return true;
        }

        // retrieve wavefront from inst
        Wavefront *w = gpuDynInst->wavefront();

        // Check if we are waiting on Kernel End Release
        if (w->getStatus() == Wavefront::S_RETURNING
            && gpuDynInst->isEndOfKernel()) {
            // for kernel end, the original request must be both kernel-type
            // and release
            assert(pkt->req->isKernel());
            assert(pkt->req->isRelease());

            // one wb done, decrement counter, and return whether all wbs are
            // done for the kernel
            bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);

            // not all wbs are done for the kernel, just release pkt
            // resources
            if (!isWbDone) {
                delete pkt->senderState;
                delete pkt;
                return true;
            }

            // all wbs are completed for the kernel, do retirement work
            // for the workgroup
            DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
                    computeUnit->cu_id, w->simdId, w->wfSlotId,
                    w->wfDynId, w->wgId);

            dispatcher.notifyWgCompl(w);
            w->setStatus(Wavefront::S_STOPPED);
        }

        if (!pkt->req->isKernel()) {
            w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
            DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                    "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                    gpuDynInst->disassemble(), w->outstandingReqs,
                    w->outstandingReqs - 1);
            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
        }

        delete pkt->senderState;
        delete pkt;
        return true;
    } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
        // this is for writeComplete callback
        // we simply decrement write-related wait counters
        assert(gpuDynInst);
        Wavefront *w M5_VAR_USED =
            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
        assert(w);
        DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);
        if (gpuDynInst->allLanesZero()) {
            // ask gm pipe to decrement request counters, instead of directly
            // performing here, to avoid asynchronous counter update and
            // instruction retirement (which may hurt waitcnt effects)
            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
                    computeUnit->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);
        }

        delete pkt->senderState;
        delete pkt;

        return true;
    }

    EventFunctionWrapper *mem_resp_event =
        computeUnit->memPort[index].createMemRespEvent(pkt);

    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            gpuDynInst->seqNum(), index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);

    return true;
}

bool
ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
{
    assert(!pkt->req->isKernel());

    // retrieve sender state
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;

    assert(pkt->isRead() || pkt->isWrite());
    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;

    /**
     * for each returned scalar request we decrement the
     * numScalarReqs counter that is associated with this
     * gpuDynInst, which should have been set to correspond
     * to the number of packets sent for the memory op.
     * once all packets return, the memory op is finished
     * and we can push it into the response queue.
     */
    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
                gpuDynInst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
                gpuDynInst);
        }
    }

    delete pkt->senderState;
    delete pkt;

    return true;
}

void
ComputeUnit::ScalarDataPort::recvReqRetry()
{
    // Pop each pending packet as it is successfully re-sent; stop at the
    // first packet the downstream port still cannot accept. (A while loop
    // is used here because popping from a deque invalidates the iterators
    // a range-based for loop would rely on.)
    while (!retries.empty()) {
        PacketPtr pkt = retries.front();
        if (!sendTimingReq(pkt)) {
            break;
        }
        retries.pop_front();
    }
}

void
ComputeUnit::DataPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        /** Currently Ruby can return false due to conflicts for the
         *  particular cache block or address. Thus other requests should be
         *  allowed to pass and the data port should expect multiple retries. */
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
}

bool
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
    computeUnit->fetchStage.processFetchReturn(pkt);
    return true;
}

void
ComputeUnit::SQCPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        Wavefront *wavefront M5_VAR_USED = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
}

void
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
{
    // There must be a way around this check to do the globalMemStart...
    Addr tmp_vaddr = pkt->req->getVaddr();

    updatePageDivergenceDist(tmp_vaddr);

    // set PC in request
    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    // figure out the type of the request to set read/write
    BaseTLB::Mode TLB_mode;
    assert(pkt->isRead() || pkt->isWrite());

    // only do some things if actually accessing data
    bool isDataAccess = pkt->isWrite() || pkt->isRead();

    // Check write before read for atomic operations
    // since atomic operations should use BaseTLB::Write
    if (pkt->isWrite()) {
        TLB_mode = BaseTLB::Write;
    } else if (pkt->isRead()) {
        TLB_mode = BaseTLB::Read;
    } else {
        fatal("pkt is not a read nor a write\n");
    }

    tlbCycles -= curTick();
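    // This subtraction pairs with the "tlbCycles += curTick()" in
    // DTLBPort::recvTimingResp: together the two add the translation's
    // round-trip time (response tick minus request tick) to tlbCycles.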
    ++tlbRequests;

    PortID tlbPort_index = perLaneTLB ? index : 0;

    if (shader->timingSim) {
        if (debugSegFault) {
            Process *p = shader->gpuTc->getProcessPtr();
            Addr vaddr = pkt->req->getVaddr();
            unsigned size = pkt->getSize();

            if ((vaddr + size - 1) % 64 < vaddr % 64) {
                panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
            }

            Addr paddr;

            if (!p->pTable->translate(vaddr, paddr)) {
                if (!p->fixupFault(vaddr)) {
                    panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                          vaddr);
                }
            }
        }

        // This is the SenderState needed upon return
        pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);

        // This is the senderState needed by the TLB hierarchy to function
        TheISA::GpuTLB::TranslationState *translation_state =
            new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc,
                                                 false, pkt->senderState);

        pkt->senderState = translation_state;

        if (functionalTLB) {
            tlbPort[tlbPort_index].sendFunctional(pkt);

            // update the hitLevel distribution
            int hit_level = translation_state->hitLevel;
            assert(hit_level != -1);
            hitsPerTLBLevel[hit_level]++;

            // New SenderState for the memory access
            X86ISA::GpuTLB::TranslationState *sender_state =
                safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

            delete sender_state->tlbEntry;
            delete sender_state->saved;
            delete sender_state;

            assert(pkt->req->hasPaddr());
            assert(pkt->req->hasSize());

            // this is necessary because the GPU TLB receives packets instead
            // of requests. when the translation is complete, all relevant
            // fields in the request will be populated, but not in the packet.
            // here we create the new packet so we can set the size, addr,
            // and proper flags.
            PacketPtr oldPkt = pkt;
            pkt = new Packet(oldPkt->req, oldPkt->cmd);

            uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
            pkt->dataStatic(tmpData);
            delete oldPkt;

            // New SenderState for the memory access
            pkt->senderState =
                new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
                                                       nullptr);

            gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
            gpuDynInst->tlbHitLevel[index] = hit_level;

            // translation is done. Schedule the mem_req_event at the
            // appropriate cycle to send the timing memory request to ruby
            EventFunctionWrapper *mem_req_event =
                memPort[index].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                    "scheduled\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, index, pkt->req->getPaddr());

            schedule(mem_req_event, curTick() + req_tick_latency);
        } else if (tlbPort[tlbPort_index].isStalled()) {
            assert(tlbPort[tlbPort_index].retries.size() > 0);

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index].retries.push_back(pkt);
        } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
            // Stall the data port;
            // No more packet will be issued till
            // ruby indicates resources are freed by
            // a recvReqRetry() call back on this port.
            tlbPort[tlbPort_index].stallPort();

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index].retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB,
                    "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                    tmp_vaddr);
        }
    } else {
        if (pkt->cmd == MemCmd::MemSyncReq) {
            gpuDynInst->resetEntireStatusVector();
        } else {
            gpuDynInst->decrementStatusVector(index);
        }

        // New SenderState for the memory access
        delete pkt->senderState;

        // Because it's atomic operation, only need TLB translation state
        pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
                                                                shader->gpuTc);

        tlbPort[tlbPort_index].sendFunctional(pkt);

        // the addr of the packet is not modified, so we need to create a new
        // packet, or otherwise the memory access will have the old virtual
        // address sent in the translation packet, instead of the physical
        // address returned by the translation.
        PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
        new_pkt->dataStatic(pkt->getPtr<uint8_t>());

        // Translation is done. It is safe to send the packet to memory.
        memPort[0].sendFunctional(new_pkt);

        DPRINTF(GPUMem, "Functional sendRequest\n");
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
                gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
                new_pkt->req->getPaddr());

        // safe_cast the senderState
        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete new_pkt;
        delete pkt->senderState;
        delete pkt;
    }
}

void
ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
{
    assert(pkt->isWrite() || pkt->isRead());

    BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write;

    pkt->senderState =
        new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);

    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
                                             pkt->senderState);

    if (scalarDTLBPort.isStalled()) {
        assert(scalarDTLBPort.retries.size());
        scalarDTLBPort.retries.push_back(pkt);
    } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
        scalarDTLBPort.stallPort();
        scalarDTLBPort.retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
                tlb_mode == BaseTLB::Read ? "read" : "write",
                pkt->req->getVaddr());
    }
}

void
ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                                  bool kernelMemSync,
                                  RequestPtr req)
{
    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == Enums::SC_GLOBAL);

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
    }

    // all mem sync requests have Paddr == 0
    req->setPaddr(0);

    PacketPtr pkt = nullptr;

    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {
            req->setCacheCoherenceFlags(Request::ACQUIRE);
            req->setReqInstSeqNum(gpuDynInst->seqNum());
            req->setFlags(Request::KERNEL);
            pkt = new Packet(req, MemCmd::MemSyncReq);
            pkt->pushSenderState(
                new ComputeUnit::DataPort::SenderState(gpuDynInst, 0,
                                                       nullptr));

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());

            schedule(mem_req_event, curTick() + req_tick_latency);
        } else {
            // kernel end release must be enabled
            assert(shader->impl_kern_end_rel);
            assert(gpuDynInst->isEndOfKernel());

            req->setCacheCoherenceFlags(Request::WB_L2);
            req->setReqInstSeqNum(gpuDynInst->seqNum());
            req->setFlags(Request::KERNEL);
            pkt = new Packet(req, MemCmd::MemSyncReq);
            pkt->pushSenderState(
                new ComputeUnit::DataPort::SenderState(gpuDynInst, 0,
                                                       nullptr));

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());

            schedule(mem_req_event, curTick() + req_tick_latency);
        }
    } else {
        gpuDynInst->setRequestFlags(req);

        req->setReqInstSeqNum(gpuDynInst->seqNum());

        pkt = new Packet(req, MemCmd::MemSyncReq);
        pkt->pushSenderState(
            new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));

        EventFunctionWrapper *mem_req_event =
            memPort[0].createMemReqEvent(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());

        schedule(mem_req_event, curTick() + req_tick_latency);
    }
}
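
// The three paths above differ mainly in coherence flags: the
// kernel-launch fence is an acquire (it triggers the D-cache invalidates
// counted by updateInvCounter), the kernel-end fence requests an L2
// writeback (Request::WB_L2), and the non-kernel path uses whatever flags
// the instruction itself carries. All of them travel as MemSyncReq packets
// with Paddr == 0 through vector memory port 0.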

void
ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
{
    DataPort::SenderState *sender_state =
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit = computeUnit;

    assert(gpuDynInst);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();

    // mem sync resp and write-complete callback must be handled already in
    // DataPort::recvTimingResp
    assert(pkt->cmd != MemCmd::MemSyncResp);
    assert(pkt->cmd != MemCmd::WriteCompleteResp);

    // this is for read, write and atomic
    int index = gpuDynInst->memStatusVector[paddr].back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            pkt->req->getPaddr(), id);

    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

    gpuDynInst->decrementStatusVector(index);
    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());

    if (gpuDynInst->allLanesZero()) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

        while (iter != end) {
            assert(iter->second.empty());
            ++iter;
        }

        // Calculate the difference between the arrival of the first cache
        // block and the last cache block to arrive if we have the time
        // for the first cache block.
        if (compute_unit->headTailMap.count(gpuDynInst)) {
            Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
            compute_unit->headTailLatency.sample(curTick() - headTick);
            compute_unit->headTailMap.erase(gpuDynInst);
        }

        gpuDynInst->memStatusVector.clear();

        // note: only handle read response here; for write, the response
        // is separately handled when writeComplete callback is received
        if (pkt->isRead()) {
            gpuDynInst->
                profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
            compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                    compute_unit->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);
        }
    } else {
        if (pkt->isRead()) {
            if (!compute_unit->headTailMap.count(gpuDynInst)) {
                compute_unit->headTailMap
                    .insert(std::make_pair(gpuDynInst, curTick()));
            }
        }
    }

    delete pkt->senderState;
    delete pkt;
}

ComputeUnit*
ComputeUnitParams::create()
{
    return new ComputeUnit(this);
}

bool
ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    assert(pkt->senderState);
    computeUnit->tlbCycles += curTick();

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // no PageFaults are permitted for data accesses
    if (!translation_state->tlbEntry) {
        DTLBPort::SenderState *sender_state =
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

        Wavefront *w M5_VAR_USED =
            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
            [sender_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    }

    // update the hitLevel distribution
    int hit_level = translation_state->hitLevel;
    computeUnit->hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;

    // for prefetch pkt
    BaseTLB::Mode TLB_mode = translation_state->tlbMode;

    delete translation_state;

    // use the original sender state to know how to close this transaction
    DTLBPort::SenderState *sender_state =
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    PortID mp_index = sender_state->portIndex;
    Addr vaddr = pkt->req->getVaddr();
    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

    MemCmd requestCmd;

    if (pkt->cmd == MemCmd::ReadResp) {
        requestCmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        requestCmd = MemCmd::WriteReq;
    } else if (pkt->cmd == MemCmd::SwapResp) {
        requestCmd = MemCmd::SwapReq;
    } else {
        panic("unsupported response to request conversion %s\n",
              pkt->cmd.toString());
    }

    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;
        Addr last = 0;

        switch(computeUnit->prefetchType) {
          case Enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case Enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
          default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
                             roundDown(last, TheISA::PageBytes))
                            >> TheISA::PageShift
                          : 0;

        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
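
        // Worked example: with 4 KiB pages, a previous access to 0x10000
        // followed by one to 0x13000 yields a stride of
        // (0x13000 - 0x10000) >> 12 = 3 pages, so a prefetch depth of N
        // probes vaddr + 3 * pf pages for pf = 1..N.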

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);

        // Prefetch Next few pages atomically
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
                    vaddr + stride * pf * TheISA::PageBytes);

            if (!stride)
                break;

            RequestPtr prefetch_req = std::make_shared<Request>(
                vaddr + stride * pf * TheISA::PageBytes,
                sizeof(uint8_t), 0,
                computeUnit->requestorId(),
                0, 0, nullptr);

            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
            uint8_t foo = 0;
            prefetch_pkt->dataStatic(&foo);

            // Because it's atomic operation, only need TLB translation state
            prefetch_pkt->senderState =
                new TheISA::GpuTLB::TranslationState(TLB_mode,
                    computeUnit->shader->gpuTc, true);

            // Currently prefetches are zero-latency, hence the sendFunctional
            sendFunctional(prefetch_pkt);

            /* safe_cast the senderState */
            TheISA::GpuTLB::TranslationState *tlb_state =
                safe_cast<TheISA::GpuTLB::TranslationState*>(
                        prefetch_pkt->senderState);

            delete tlb_state->tlbEntry;
            delete tlb_state;
            delete prefetch_pkt;
        }
    }

    // First we must convert the response cmd back to a request cmd so that
    // the request can be sent through the cu's request port
    PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
    new_pkt->dataStatic(pkt->getPtr<uint8_t>());
    delete pkt->senderState;
    delete pkt;

    // New SenderState for the memory access
    new_pkt->senderState =
        new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
                                               nullptr);

    // translation is done. Schedule the mem_req_event at the appropriate
    // cycle to send the timing memory request to ruby
    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);

    return true;
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
}

void
ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
{
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit M5_VAR_USED = computeUnit;

    if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                id, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
                pkt->req->getPaddr());
    }
}

const char*
ComputeUnit::ScalarDataPort::MemReqEvent::description() const
{
    return "ComputeUnit scalar memory request event";
}

void
ComputeUnit::ScalarDataPort::MemReqEvent::process()
{
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort.computeUnit;

    if (!(scalarDataPort.sendTimingReq(pkt))) {
        scalarDataPort.retries.push_back(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
                pkt->req->getPaddr());
    }
}

/*
 * The initial translation request could have been rejected, if
 * <retries> queue is not empty. Retry sending the translation
 * request. sendRetry() is called from the peer port whenever
 * a translation completes.
 */
void
ComputeUnit::DTLBPort::recvReqRetry()
{
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());
    // recvReqRetry is an indication that the resource on which this
    // port was stalling on is freed. So, remove the stall first
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // Stall port
            stallPort();
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}

bool
ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
{
    assert(pkt->senderState);

    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // Page faults are not allowed
    fatal_if(!translation_state->tlbEntry,
             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    pkt->senderState = translation_state->saved;
    delete translation_state;

    ScalarDTLBPort::SenderState *sender_state =
        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    delete pkt->senderState;

    Wavefront *w M5_VAR_USED = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(),
            pkt->req->getPaddr());

    MemCmd mem_cmd;

    if (pkt->cmd == MemCmd::ReadResp) {
        mem_cmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        mem_cmd = MemCmd::WriteReq;
    } else {
        fatal("Scalar DTLB received unexpected MemCmd response %s\n",
              pkt->cmd.toString());
    }

    PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
    req_pkt->dataStatic(pkt->getPtr<uint8_t>());
    delete pkt;

    req_pkt->senderState =
        new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);

    if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
        computeUnit->scalarDataPort.retries.push_back(req_pkt);
        DPRINTF(GPUMem, "send scalar req failed for: %s\n",
                gpuDynInst->disassemble());
    } else {
        DPRINTF(GPUMem, "send scalar req for: %s\n",
                gpuDynInst->disassemble());
    }

    return true;
}

bool
ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line M5_VAR_USED = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

    assert(pkt->senderState);

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state
        = safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;
    delete translation_state;

    // use the original sender state to know how to close this transaction
    ITLBPort::SenderState *sender_state =
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);

    // get the wavefront associated with this translation request
    Wavefront *wavefront = sender_state->wavefront;
    delete pkt->senderState;

    if (success) {
        // pkt is reused in fetch(), don't delete it here. However, we must
        // reset the command to be a request so that it can be sent through
        // the cu's request port
        assert(pkt->cmd == MemCmd::ReadResp);
        pkt->cmd = MemCmd::ReadReq;

        computeUnit->fetchStage.fetch(pkt, wavefront);
    } else {
        if (wavefront->dropFetch) {
            assert(wavefront->instructionBuffer.empty());
            wavefront->dropFetch = false;
        }

        wavefront->pendingFetch = 0;
    }

    return true;
}

/*
 * The initial translation request could have been rejected, if
 * <retries> queue is not empty. Retry sending the translation
 * request. sendRetry() is called from the peer port whenever
 * a translation completes.
 */
void
ComputeUnit::ITLBPort::recvReqRetry()
{
    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());

    // recvReqRetry is an indication that the resource on which this
    // port was stalling on is freed. So, remove the stall first
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            stallPort(); // Stall port
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}

void
ComputeUnit::regStats()
{
    ClockedObject::regStats();

    vALUInsts
        .name(name() + ".valu_insts")
        .desc("Number of vector ALU insts issued.")
        ;
    vALUInstsPerWF
        .name(name() + ".valu_insts_per_wf")
        .desc("The avg. number of vector ALU insts issued per-wavefront.")
        ;
    sALUInsts
        .name(name() + ".salu_insts")
        .desc("Number of scalar ALU insts issued.")
        ;
    sALUInstsPerWF
        .name(name() + ".salu_insts_per_wf")
        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
        ;
    instCyclesVALU
        .name(name() + ".inst_cycles_valu")
        .desc("Number of cycles needed to execute VALU insts.")
        ;
    instCyclesSALU
        .name(name() + ".inst_cycles_salu")
        .desc("Number of cycles needed to execute SALU insts.")
        ;
    threadCyclesVALU
        .name(name() + ".thread_cycles_valu")
        .desc("Number of thread cycles used to execute vector ALU ops. "
              "Similar to instCyclesVALU but multiplied by the number of "
              "active threads.")
        ;
    vALUUtilization
        .name(name() + ".valu_utilization")
        .desc("Percentage of active vector ALU threads in a wave.")
        ;
    ldsNoFlatInsts
        .name(name() + ".lds_no_flat_insts")
        .desc("Number of LDS insts issued, not including FLAT "
              "accesses that resolve to LDS.")
        ;
    ldsNoFlatInstsPerWF
        .name(name() + ".lds_no_flat_insts_per_wf")
        .desc("The avg. number of LDS insts (not including FLAT "
              "accesses that resolve to LDS) per-wavefront.")
        ;
    flatVMemInsts
        .name(name() + ".flat_vmem_insts")
        .desc("The number of FLAT insts that resolve to vmem issued.")
        ;
    flatVMemInstsPerWF
        .name(name() + ".flat_vmem_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to vmem "
              "issued per-wavefront.")
        ;
    flatLDSInsts
        .name(name() + ".flat_lds_insts")
        .desc("The number of FLAT insts that resolve to LDS issued.")
        ;
    flatLDSInstsPerWF
        .name(name() + ".flat_lds_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to LDS "
              "issued per-wavefront.")
        ;
    vectorMemWrites
        .name(name() + ".vector_mem_writes")
        .desc("Number of vector mem write insts (excluding FLAT insts).")
        ;
    vectorMemWritesPerWF
        .name(name() + ".vector_mem_writes_per_wf")
        .desc("The average number of vector mem write insts "
              "(excluding FLAT insts) per-wavefront.")
        ;
    vectorMemReads
        .name(name() + ".vector_mem_reads")
        .desc("Number of vector mem read insts (excluding FLAT insts).")
        ;
    vectorMemReadsPerWF
        .name(name() + ".vector_mem_reads_per_wf")
        .desc("The avg. number of vector mem read insts (excluding "
              "FLAT insts) per-wavefront.")
        ;
    scalarMemWrites
        .name(name() + ".scalar_mem_writes")
        .desc("Number of scalar mem write insts.")
        ;
    scalarMemWritesPerWF
        .name(name() + ".scalar_mem_writes_per_wf")
        .desc("The average number of scalar mem write insts per-wavefront.")
        ;
    scalarMemReads
        .name(name() + ".scalar_mem_reads")
        .desc("Number of scalar mem read insts.")
        ;
    scalarMemReadsPerWF
        .name(name() + ".scalar_mem_reads_per_wf")
        .desc("The average number of scalar mem read insts per-wavefront.")
        ;
    vALUInstsPerWF = vALUInsts / completedWfs;
    sALUInstsPerWF = sALUInsts / completedWfs;
    vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
    vectorMemReadsPerWF = vectorMemReads / completedWfs;
    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
    scalarMemReadsPerWF = scalarMemReads / completedWfs;
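    // Worked example (illustrative numbers, not from the source): with
    // vALUInsts = 2000 and completedWfs = 100, valu_insts_per_wf = 20.
    // Note the utilization formula hard-codes a 64-lane wavefront: with
    // instCyclesVALU = 500 and threadCyclesVALU = 16000, valu_utilization
    // = (16000 / (64 * 500)) * 100 = 50, i.e. half the lanes were active
    // on average.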
    vectorMemReadsPerKiloInst
        .name(name() + ".vector_mem_reads_per_kilo_inst")
        .desc("Number of vector mem reads per kilo-instruction")
        ;
    vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
    vectorMemWritesPerKiloInst
        .name(name() + ".vector_mem_writes_per_kilo_inst")
        .desc("Number of vector mem writes per kilo-instruction")
        ;
    vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
    vectorMemInstsPerKiloInst
        .name(name() + ".vector_mem_insts_per_kilo_inst")
        .desc("Number of vector mem insts per kilo-instruction")
        ;
    vectorMemInstsPerKiloInst =
        ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
    scalarMemReadsPerKiloInst
        .name(name() + ".scalar_mem_reads_per_kilo_inst")
        .desc("Number of scalar mem reads per kilo-instruction")
        ;
    scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
    scalarMemWritesPerKiloInst
        .name(name() + ".scalar_mem_writes_per_kilo_inst")
        .desc("Number of scalar mem writes per kilo-instruction")
        ;
    scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
    scalarMemInstsPerKiloInst
        .name(name() + ".scalar_mem_insts_per_kilo_inst")
        .desc("Number of scalar mem insts per kilo-instruction")
        ;
    scalarMemInstsPerKiloInst =
        ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
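    // Worked example (illustrative numbers): with vectorMemReads = 150
    // and numInstrExecuted = 10000, vector_mem_reads_per_kilo_inst =
    // (150 / 10000) * 1000 = 15, i.e. 15 vector loads per thousand
    // executed instructions.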
    instCyclesVMemPerSimd
        .init(numVectorALUs)
        .name(name() + ".inst_cycles_vector_memory")
        .desc("Number of cycles to send address, command, data from VRF to "
              "vector memory unit, per SIMD")
        ;

    instCyclesScMemPerSimd
        .init(numVectorALUs)
        .name(name() + ".inst_cycles_scalar_memory")
        .desc("Number of cycles to send address, command, data from SRF to "
              "scalar memory unit, per SIMD")
        ;

    instCyclesLdsPerSimd
        .init(numVectorALUs)
        .name(name() + ".inst_cycles_lds")
        .desc("Number of cycles to send address, command, data from VRF to "
              "LDS unit, per SIMD")
        ;

    globalReads
        .name(name() + ".global_mem_reads")
        .desc("Number of reads to the global segment")
        ;
    globalWrites
        .name(name() + ".global_mem_writes")
        .desc("Number of writes to the global segment")
        ;
    globalMemInsts
        .name(name() + ".global_mem_insts")
        .desc("Number of memory instructions sent to the global segment")
        ;
    globalMemInsts = globalReads + globalWrites;
    argReads
        .name(name() + ".arg_reads")
        .desc("Number of reads to the arg segment")
        ;
    argWrites
        .name(name() + ".arg_writes")
        .desc("Number of writes to the arg segment")
        ;
    argMemInsts
        .name(name() + ".arg_mem_insts")
        .desc("Number of memory instructions sent to the arg segment")
        ;
    argMemInsts = argReads + argWrites;
    spillReads
        .name(name() + ".spill_reads")
        .desc("Number of reads to the spill segment")
        ;
    spillWrites
        .name(name() + ".spill_writes")
        .desc("Number of writes to the spill segment")
        ;
    spillMemInsts
        .name(name() + ".spill_mem_insts")
        .desc("Number of memory instructions sent to the spill segment")
        ;
    spillMemInsts = spillReads + spillWrites;
    groupReads
        .name(name() + ".group_reads")
        .desc("Number of reads to the group segment")
        ;
    groupWrites
        .name(name() + ".group_writes")
        .desc("Number of writes to the group segment")
        ;
    groupMemInsts
        .name(name() + ".group_mem_insts")
        .desc("Number of memory instructions sent to the group segment")
        ;
    groupMemInsts = groupReads + groupWrites;
    privReads
        .name(name() + ".private_reads")
        .desc("Number of reads to the private segment")
        ;
    privWrites
        .name(name() + ".private_writes")
        .desc("Number of writes to the private segment")
        ;
    privMemInsts
        .name(name() + ".private_mem_insts")
        .desc("Number of memory instructions sent to the private segment")
        ;
    privMemInsts = privReads + privWrites;
    readonlyReads
        .name(name() + ".readonly_reads")
        .desc("Number of reads to the readonly segment")
        ;
    readonlyWrites
        .name(name() + ".readonly_writes")
        .desc("Number of writes to the readonly segment")
        ;
    readonlyMemInsts
        .name(name() + ".readonly_mem_insts")
        .desc("Number of memory instructions sent to the readonly segment")
        ;
    readonlyMemInsts = readonlyReads + readonlyWrites;
    kernargReads
        .name(name() + ".kernarg_reads")
        .desc("Number of reads sent to the kernarg segment")
        ;
    kernargWrites
        .name(name() + ".kernarg_writes")
        .desc("Number of writes sent to the kernarg segment")
        ;
    kernargMemInsts
        .name(name() + ".kernarg_mem_insts")
        .desc("Number of memory instructions sent to the kernarg segment")
        ;
    kernargMemInsts = kernargReads + kernargWrites;

    tlbCycles
        .name(name() + ".tlb_cycles")
        .desc("total number of cycles for all uncoalesced requests")
        ;

    tlbRequests
        .name(name() + ".tlb_requests")
        .desc("number of uncoalesced requests")
        ;

    tlbLatency
        .name(name() + ".avg_translation_latency")
        .desc("Avg. translation latency for data translations")
        ;

    tlbLatency = tlbCycles / tlbRequests;
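    // Since tlb_cycles accumulates the full latency of every uncoalesced
    // request, this quotient is the mean per-request translation latency:
    // e.g. 8000 cycles over 400 requests averages to 20 cycles
    // (illustrative numbers).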
    hitsPerTLBLevel
        .init(4)
        .name(name() + ".TLB_hits_distribution")
        .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
        ;

    // fixed number of TLB levels
    for (int i = 0; i < 4; ++i) {
        if (!i)
            hitsPerTLBLevel.subname(i, "page_table");
        else
            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
    }
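    // The vector is indexed by where the translation was resolved, so the
    // stats output reads <cu>.TLB_hits_distribution::page_table for full
    // page table walks and ::L1_TLB, ::L2_TLB, ::L3_TLB for hits at each
    // TLB level.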
    execRateDist
        .init(0, 10, 2)
        .name(name() + ".inst_exec_rate")
        .desc("Instruction Execution Rate: Number of executed vector "
              "instructions per cycle")
        ;

    ldsBankConflictDist
        .init(0, wfSize(), 2)
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;

    ldsBankAccesses
        .name(name() + ".lds_bank_access_cnt")
        .desc("Total number of LDS bank accesses")
        ;

    pageDivergenceDist
        // A wavefront can touch up to N pages per memory instruction,
        // where N is equal to the wavefront size.
        // The number of pages per bin can be configured (here it's 4).
        .init(1, wfSize(), 4)
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;

    controlFlowDivergenceDist
        .init(1, wfSize(), 4)
        .name(name() + ".warp_execution_dist")
        .desc("number of lanes active per instruction (over all instructions)")
        ;

    activeLanesPerGMemInstrDist
        .init(1, wfSize(), 4)
        .name(name() + ".gmem_lanes_execution_dist")
        .desc("number of active lanes per global memory instruction")
        ;

    activeLanesPerLMemInstrDist
        .init(1, wfSize(), 4)
        .name(name() + ".lmem_lanes_execution_dist")
        .desc("number of active lanes per local memory instruction")
        ;

    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed")
        ;

    numVecOpsExecuted
        .name(name() + ".num_vec_ops_executed")
        .desc("number of vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedF16
        .name(name() + ".num_vec_ops_f16_executed")
        .desc("number of f16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedF32
        .name(name() + ".num_vec_ops_f32_executed")
        .desc("number of f32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedF64
        .name(name() + ".num_vec_ops_f64_executed")
        .desc("number of f64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedFMA16
        .name(name() + ".num_vec_ops_fma16_executed")
        .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedFMA32
        .name(name() + ".num_vec_ops_fma32_executed")
        .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedFMA64
        .name(name() + ".num_vec_ops_fma64_executed")
        .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAD16
        .name(name() + ".num_vec_ops_mad16_executed")
        .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAD32
        .name(name() + ".num_vec_ops_mad32_executed")
        .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAD64
        .name(name() + ".num_vec_ops_mad64_executed")
        .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAC16
        .name(name() + ".num_vec_ops_mac16_executed")
        .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAC32
        .name(name() + ".num_vec_ops_mac32_executed")
        .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAC64
        .name(name() + ".num_vec_ops_mac64_executed")
        .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedTwoOpFP
        .name(name() + ".num_vec_ops_two_op_fp_executed")
        .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
        ;

    totalCycles
        .name(name() + ".num_total_cycles")
        .desc("number of cycles the CU ran for")
        ;

    ipc
        .name(name() + ".ipc")
        .desc("Instructions per cycle (this CU only)")
        ;

    vpc
        .name(name() + ".vpc")
        .desc("Vector Operations per cycle (this CU only)")
        ;

    vpc_f16
        .name(name() + ".vpc_f16")
        .desc("F16 Vector Operations per cycle (this CU only)")
        ;

    vpc_f32
        .name(name() + ".vpc_f32")
        .desc("F32 Vector Operations per cycle (this CU only)")
        ;

    vpc_f64
        .name(name() + ".vpc_f64")
        .desc("F64 Vector Operations per cycle (this CU only)")
        ;

    numALUInstsExecuted
        .name(name() + ".num_alu_insts_executed")
        .desc("Number of dynamic non-GM memory insts executed")
        ;

    wgBlockedDueBarrierAllocation
        .name(name() + ".wg_blocked_due_barrier_alloc")
        .desc("WG dispatch was blocked due to lack of barrier resources")
        ;

    wgBlockedDueLdsAllocation
        .name(name() + ".wg_blocked_due_lds_alloc")
        .desc("Workgroup blocked due to LDS capacity")
        ;

    ipc = numInstrExecuted / totalCycles;
    vpc = numVecOpsExecuted / totalCycles;
    vpc_f16 = numVecOpsExecutedF16 / totalCycles;
    vpc_f32 = numVecOpsExecutedF32 / totalCycles;
    vpc_f64 = numVecOpsExecutedF64 / totalCycles;
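    // Note the relation between the two rates: vpc counts per-lane vector
    // ops, so vpc = ipc * (average vec ops per instruction). Illustrative
    // numbers: 1000 instructions averaging 48 active lanes over 4000
    // cycles give ipc = 0.25 and vpc = 12.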
    numTimesWgBlockedDueVgprAlloc
        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
        .desc("Number of times WGs are blocked due to VGPR allocation per "
              "SIMD")
        ;

    numTimesWgBlockedDueSgprAlloc
        .name(name() + ".times_wg_blocked_due_sgpr_alloc")
        .desc("Number of times WGs are blocked due to SGPR allocation per "
              "SIMD")
        ;

    dynamicGMemInstrCnt
        .name(name() + ".global_mem_instr_cnt")
        .desc("dynamic non-flat global memory instruction count")
        ;

    dynamicFlatMemInstrCnt
        .name(name() + ".flat_global_mem_instr_cnt")
        .desc("dynamic flat global memory instruction count")
        ;

    dynamicLMemInstrCnt
        .name(name() + ".local_mem_instr_cnt")
        .desc("dynamic local memory instruction count")
        ;

    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
        dynamicLMemInstrCnt;
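    // numALUInstsExecuted is derived rather than counted directly: every
    // executed instruction that is neither a global- nor a local-memory
    // instruction is classified as an ALU instruction.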
    completedWfs
        .name(name() + ".num_completed_wfs")
        .desc("number of completed wavefronts")
        ;

    completedWGs
        .name(name() + ".num_completed_wgs")
        .desc("number of completed workgroups")
        ;

    numCASOps
        .name(name() + ".num_CAS_ops")
        .desc("number of compare and swap operations")
        ;

    numFailedCASOps
        .name(name() + ".num_failed_CAS_ops")
        .desc("number of compare and swap operations that failed")
        ;

    headTailLatency
        .init(0, 1000000, 10000)
        .name(name() + ".head_tail_latency")
        .desc("ticks between first and last cache block arrival at coalescer")
        .flags(Stats::pdf | Stats::oneline)
        ;

    waveLevelParallelism
        .init(0, shader->n_wf * numVectorALUs, 1)
        .name(name() + ".wlp")
        .desc("wave level parallelism: count of active waves at wave launch")
        ;

    instInterleave
        .init(numVectorALUs, 0, 20, 1)
        .name(name() + ".interleaving")
        .desc("Measure of instruction interleaving per SIMD")
        ;

    // register stats of pipeline stages
    fetchStage.regStats();
    scoreboardCheckStage.regStats();
    scheduleStage.regStats();
    execStage.regStats();

    // register stats of memory pipelines
    globalMemoryPipe.regStats();
    localMemoryPipe.regStats();
    scalarMemoryPipe.regStats();

    registerManager->regStats();
}
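/**
 * Classify a dynamic instruction into the stat counters registered in
 * regStats(). Note the ordering of the tests below: FLAT instructions
 * are checked before plain local-memory instructions, so a FLAT access
 * that resolves to LDS counts toward flat_lds_insts rather than
 * lds_no_flat_insts, matching the stat descriptions above.
 */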
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            sALUInsts++;
            instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            shader->total_valu_insts++;
            if (shader->total_valu_insts == shader->max_valu_insts) {
                exitSimLoop("max vALU insts");
            }
            vALUInsts++;
            instCyclesVALU++;
            threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                flatLDSInsts++;
            } else {
                flatVMemInsts++;
            }
        } else if (gpuDynInst->isLocalMem()) {
            ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            vectorMemWrites++;
        }

        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
                spillReads++;
                break;
              case Enums::SC_GLOBAL:
                globalReads++;
                break;
              case Enums::SC_GROUP:
                groupReads++;
                break;
              case Enums::SC_PRIVATE:
                privReads++;
                break;
              case Enums::SC_READONLY:
                readonlyReads++;
                break;
              case Enums::SC_KERNARG:
                kernargReads++;
                break;
              case Enums::SC_ARG:
                argReads++;
                break;
              case Enums::SC_NONE:
                /**
                 * this case can occur for flat mem insts
                 * that execute with EXEC = 0
                 */
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
                break;
            }
        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
                spillWrites++;
                break;
              case Enums::SC_GLOBAL:
                globalWrites++;
                break;
              case Enums::SC_GROUP:
                groupWrites++;
                break;
              case Enums::SC_PRIVATE:
                privWrites++;
                break;
              case Enums::SC_READONLY:
                readonlyWrites++;
                break;
              case Enums::SC_KERNARG:
                kernargWrites++;
                break;
              case Enums::SC_ARG:
                argWrites++;
                break;
              case Enums::SC_NONE:
                /**
                 * this case can occur for flat mem insts
                 * that execute with EXEC = 0
                 */
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
                break;
            }
        }
    }
}
void
ComputeUnit::updatePageDivergenceDist(Addr addr)
{
    Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);

    if (!pagesTouched.count(virt_page_addr))
        pagesTouched[virt_page_addr] = 1;
    else
        pagesTouched[virt_page_addr]++;
}
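// Illustrative arithmetic for updatePageDivergenceDist() above: with
// 4 KiB x86 pages, roundDown(0x12345, TheISA::PageBytes) yields 0x12000,
// so every access falling in that page increments the same pagesTouched
// entry.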
void
ComputeUnit::exitCallback()
{
    if (countPages) {
        std::ostream *page_stat_file = simout.create(name().c_str())->stream();

        *page_stat_file << "page, wavefront accesses, workitem accesses" <<
            std::endl;

        for (auto iter : pageAccesses) {
            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << iter.second.first << ",";
            *page_stat_file << std::dec << iter.second.second << std::endl;
        }
    }
}
bool
ComputeUnit::isDone() const
{
    for (int i = 0; i < numVectorALUs; ++i) {
        if (!isVectorAluIdle(i)) {
            return false;
        }
    }

    // TODO: FIXME if more than 1 of any memory pipe supported
    if (!srfToScalarMemPipeBus.rdy()) {
        return false;
    }
    if (!vrfToGlobalMemPipeBus.rdy()) {
        return false;
    }
    if (!vrfToLocalMemPipeBus.rdy()) {
        return false;
    }

    if (!globalMemoryPipe.isGMReqFIFOWrRdy()
        || !localMemoryPipe.isLMReqFIFOWrRdy()
        || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
        !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
        return false;
    }

    return true;
}
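// Editorial note: isDone() above is deliberately conservative. The CU
// reports done only when every SIMD unit is idle and every memory pipe,
// FIFO, and bus is ready, i.e. nothing is still in flight anywhere in
// the unit.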
int32_t
ComputeUnit::getRefCounter(const uint32_t dispatchId,
                           const uint32_t wgId) const
{
    return lds.getRefCounter(dispatchId, wgId);
}
bool
ComputeUnit::isVectorAluIdle(uint32_t simdId) const
{
    assert(simdId < numVectorALUs);

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
        if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
            return false;
        }
    }

    return true;
}
/**
 * Send a general request to the LDS. Make sure to look at the return
 * value here, as your request might be NACK'd; a return of false means
 * you need a backup plan.
 */
bool
ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
{
    // this is just a request to carry the GPUDynInstPtr
    // back and forth
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);

    // ReadReq is not evaluated by the LDS but the Packet ctor requires this
    PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);

    // This is the SenderState needed upon return
    newPacket->senderState = new LDSPort::SenderState(gpuDynInst);

    return ldsPort.sendTimingReq(newPacket);
}
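// Hypothetical caller sketch (illustrative only; `issueLdsRequest` is
// not part of this file):
//
//     bool issueLdsRequest(ComputeUnit &cu, GPUDynInstPtr inst)
//     {
//         if (!cu.sendToLds(inst)) {
//             // NACK'd: the LDS port is stalled. The caller must hold
//             // on to `inst` and reissue after the port unstalls.
//             return false;
//         }
//         return true;
//     }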
/**
 * Get the result of packets sent to the LDS when they return.
 */
bool
ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
{
    const ComputeUnit::LDSPort::SenderState *senderState =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);

    fatal_if(!senderState, "did not get the right sort of sender state");

    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

    delete packet->senderState;
    delete packet;

    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
    return true;
}
/**
 * Attempt to send this packet: either the port is already stalled, the
 * request is NACK'd and must stall, or the request goes through. When a
 * request cannot be sent, add it to the retries queue.
 */
bool
ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
{
    ComputeUnit::LDSPort::SenderState *sender_state =
        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
    fatal_if(!sender_state, "packet without a valid sender state");

    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();

    if (isStalled()) {
        fatal_if(retries.empty(), "must have retries waiting to be stalled");

        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
        return false;
    } else if (!RequestPort::sendTimingReq(pkt)) {
        // need to stall the LDS port until a recvReqRetry() is received
        // this indicates that there is more space
        stallPort();
        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return false;
    } else {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return true;
    }
}
/**
 * The bus is telling the port that there is now space, so retrying
 * stalled requests should work now. This allows the port to have a
 * request be NACK'd and then have the receiver say when there is space,
 * rather than simply retrying the send every cycle.
 */
void
ComputeUnit::LDSPort::recvReqRetry()
{
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");

    unstallPort();

    while (!retries.empty()) {
        PacketPtr packet = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!RequestPort::sendTimingReq(packet)) {
            // Stall port
            stallPort();
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUPort, ": LDS send successful\n");
            retries.pop();
        }
    }
}