/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include "gpu-compute/compute_unit.hh"

#include <limits>

#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/ndrange.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"

ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
    scoreboardCheckStage(p), scheduleStage(p), execStage(p),
    globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
    cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
    spBypassPipeLength(p->spbypass_pipe_length),
    dpBypassPipeLength(p->dpbypass_pipe_length),
    issuePeriod(p->issue_period),
    numGlbMemUnits(p->num_global_mem_pipes),
    numLocMemUnits(p->num_shared_mem_pipes),
    perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
    prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
    xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
    functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
    countPages(p->countPages), barrier_id(0),
    vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
    req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
    resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
    _masterId(p->system->getMasterId(this, "ComputeUnit")),
    lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
    wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst())
{
    /**
     * This check is necessary because std::bitset only provides conversion
     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
     * There are a few places in the code where to_ullong() is used, however
     * if VSZ is larger than a value the host can support then bitset will
     * throw a runtime exception. We should remove all use of to_long() or
     * to_ullong() so we can have VSZ greater than 64b, however until that is
     * done this assert is required.
     */
    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
             p->wfSize <= 0,
             "WF size is larger than the host can support");
    fatal_if(!isPowerOf2(wavefrontSize),
             "Wavefront size should be a power of 2");
    // calculate how many cycles a vector load or store will need to transfer
    // its data over the corresponding buses
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
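
    // Worked example (illustrative values; both bus widths are configurable
    // parameters): with 64 lanes of 4-byte registers and a 32-byte
    // VRF-to-coalescer bus, a store transfer takes ceil(256 / 32) = 8 cycles.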

    lastVaddrWF.resize(numSIMDs);
    wfList.resize(numSIMDs);

    for (int j = 0; j < numSIMDs; ++j) {
        lastVaddrWF[j].resize(p->n_wf);

        for (int i = 0; i < p->n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }

    lastVaddrSimd.resize(numSIMDs);

    for (int i = 0; i < numSIMDs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());

    lds.setParent(this);

    if (p->execPolicy == "OLDEST-FIRST") {
        exec_policy = EXEC_POLICY::OLDEST;
    } else if (p->execPolicy == "ROUND-ROBIN") {
        exec_policy = EXEC_POLICY::RR;
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }

    memPort.resize(wfSize());

    // Setup tokens for slave ports. The number of tokens in memSlaveTokens
    // is the total token count for the entire vector port (i.e., this CU).
    memPortTokens = new TokenManager(p->max_cu_tokens);

    // resize the tlbPort vectorArray
    int tlbPort_width = perLaneTLB ? wfSize() : 1;
    tlbPort.resize(tlbPort_width);

    cuExitCallback = new CUExitCallback(this);
    registerExitCallback(cuExitCallback);

    xactCasLoadMap.clear();
    lastExecCycle.resize(numSIMDs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }

    numVecRegsPerSimd = vrf[0]->numRegs();
}

ComputeUnit::~ComputeUnit()
{
    // Delete wavefront slots
    for (int j = 0; j < numSIMDs; ++j) {
        for (int i = 0; i < shader->n_wf; ++i) {
            delete wfList[j][i];
        }
        lastVaddrSimd[j].clear();
    }
    lastVaddrCU.clear();
    readyList.clear();
    waveStatusList.clear();
    dispatchList.clear();
    vectorAluInstAvail.clear();
    delete cuExitCallback;
    delete ldsPort;
}

void
ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
{
    w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);

    w->workGroupSz[0] = ndr->q.wgSize[0];
    w->workGroupSz[1] = ndr->q.wgSize[1];
    w->workGroupSz[2] = ndr->q.wgSize[2];
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
    w->gridSz[0] = ndr->q.gdSize[0];
    w->gridSz[1] = ndr->q.gdSize[1];
    w->gridSz[2] = ndr->q.gdSize[2];
    w->kernelArgs = ndr->q.args;
    w->privSizePerItem = ndr->q.privMemPerItem;
    w->spillSizePerItem = ndr->q.spillMemPerItem;
    w->roBase = ndr->q.roMemStart;
    w->roSize = ndr->q.roMemTotal;
    w->computeActualWgSz(ndr);
}

void
ComputeUnit::updateEvents() {

    if (!timestampVec.empty()) {
        uint32_t vecSize = timestampVec.size();
        uint32_t i = 0;

        while (i < vecSize) {
            if (timestampVec[i] <= shader->tick_cnt) {
                std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
                vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
                                            statusVec[i]);
                timestampVec.erase(timestampVec.begin() + i);
                regIdxVec.erase(regIdxVec.begin() + i);
                statusVec.erase(statusVec.begin() + i);
                --vecSize;
            } else {
                ++i;
            }
        }
    }

    for (int i = 0; i < numSIMDs; ++i) {
        vrf[i]->updateEvents();
    }
}

void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                            NDRange *ndr)
{
    static int _n_wave = 0;

    VectorMask init_mask;
    init_mask.reset();

    for (int k = 0; k < wfSize(); ++k) {
        if (k + waveId * wfSize() < w->actualWgSzTotal)
            init_mask[k] = 1;
    }

    w->kernId = ndr->dispatchId;
    w->wfId = waveId;
    w->initMask = init_mask.to_ullong();
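
    // Illustrative example: for a 100 work-item work-group and a 64-lane
    // wfSize(), wave 0 has all 64 bits of init_mask set, while wave 1 has
    // only bits 0..35 set (the remaining 36 work-items).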

    for (int k = 0; k < wfSize(); ++k) {
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
                              w->actualWgSz[1];
        w->workItemId[2][k] = (k + waveId * wfSize()) /
                              (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
            w->workItemId[0][k];
    }
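
    // Illustrative example: in an 8x4x2 work-group, the work-item with 3-D
    // id (x=3, y=2, z=1) receives flat id 1 * (8 * 4) + 2 * 8 + 3 = 51,
    // matching the x-major linearization computed above.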

    w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
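
    // For example, a 100 work-item work-group with a 64-lane wfSize()
    // occupies divCeil(100, 64) = 2 barrier slots.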

    w->barCnt.resize(wfSize(), 0);

    w->maxBarCnt = 0;
    w->oldBarrierCnt = 0;
    w->barrierCnt = 0;

    w->privBase = ndr->q.privMemStart;
    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();

    w->spillBase = ndr->q.spillMemStart;
    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();

    w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());

    w->wgId = ndr->globalWgId;
    w->dispatchId = ndr->dispatchId;
    w->workGroupId[0] = w->wgId % ndr->numWg[0];
    w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
    w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);

    w->barrierId = barrier_id;
    w->stalledAtBarrier = false;

    // set the wavefront context to have a pointer to this section of the LDS
    w->ldsChunk = ldsChunk;

    int32_t refCount M5_VAR_USED =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();

    if (w->pendingFetch)
        w->dropFetch = true;

    // If this is the last wavefront in the workgroup, set the spillWidth
    // to be the remaining work-items so that the vector access is correct.
    if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
        w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
    } else {
        w->spillWidth = wfSize();
    }

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId,
            w->wfSlotId);

    w->start(++_n_wave, ndr->q.code_ptr);
}

void
ComputeUnit::StartWorkgroup(NDRange *ndr)
{
    // reserve the LDS capacity allocated to the work group
    // disambiguated by the dispatch ID and workgroup ID, which should be
    // globally unique
    LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
                                          ndr->q.ldsSize);

    // Send L1 cache acquire
    // isKernel + isAcquire = Kernel Begin
    if (shader->impl_kern_boundary_sync) {
        GPUDynInstPtr gpuDynInst =
            std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst,
                                         getAndIncSeqNum());

        gpuDynInst->useContinuation = false;
        injectGlobalMemFence(gpuDynInst, true);
    }

    // calculate the number of 32-bit vector registers required by wavefront
    int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
    int wave_id = 0;

    // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
    for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
        Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
        // Check if this wavefront slot is available:
        // It must be stopped and not waiting
        // for a release to complete S_RETURNING
        if (w->status == Wavefront::S_STOPPED) {
            fillKernelState(w, ndr);
            // if we have scheduled all work items then stop
            // scheduling wavefronts
            if (wave_id * wfSize() >= w->actualWgSzTotal)
                break;

            // reserve vector registers for the scheduled wavefront
            assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
            uint32_t normSize = 0;

            w->startVgprIndex = vrf[m % numSIMDs]->manager->
                allocateRegion(vregDemand, &normSize);

            w->reservedVectorRegs = normSize;
            vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;

            startWavefront(w, wave_id, ldsChunk, ndr);
            ++wave_id;
        }
    }
    ++barrier_id;
}

int
ComputeUnit::ReadyWorkgroup(NDRange *ndr)
{
    // Get true size of workgroup (after clamping to grid size)
    int trueWgSize[3];
    int trueWgSizeTotal = 1;

    for (int d = 0; d < 3; ++d) {
        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
                                 ndr->wgId[d] * ndr->q.wgSize[d]);

        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    // calculate the number of 32-bit vector registers required by each
    // work item of the work group
    int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
    bool vregAvail = true;
    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
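
    // numWfs is a ceiling division: e.g., 100 work-items at a 64-lane
    // wfSize() require (100 + 63) / 64 = 2 wavefronts.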

    int freeWfSlots = 0;

    // check if the total number of VGPRs required by all WFs of the WG
    // fit in the VRFs of all SIMD units
    assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
    int numMappedWfs = 0;
    std::vector<int> numWfsPerSimd;
    numWfsPerSimd.resize(numSIMDs, 0);
    // find how many free WF slots we have across all SIMDs
    for (int j = 0; j < shader->n_wf; ++j) {
        for (int i = 0; i < numSIMDs; ++i) {
            if (wfList[i][j]->status == Wavefront::S_STOPPED) {
                // count the number of free WF slots
                ++freeWfSlots;
                if (numMappedWfs < numWfs) {
                    // count the WFs to be assigned per SIMD
                    numWfsPerSimd[i]++;
                    numMappedWfs++;
                }
            }
        }
    }

    // if there are enough free WF slots then find if there are enough
    // free VGPRs per SIMD based on the WF->SIMD mapping
    if (freeWfSlots >= numWfs) {
        for (int j = 0; j < numSIMDs; ++j) {
            // find if there are enough free VGPR regions in the SIMD's VRF
            // to accommodate the WFs of the new WG that would be mapped to
            // this SIMD unit
            vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
                                                     vregDemandPerWI);

            // stop searching if there is at least one SIMD
            // whose VRF does not have enough free VGPR pools.
            // This is because a WG is scheduled only if ALL
            // of its WFs can be scheduled
            if (!vregAvail)
                break;
        }
    }

    DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
            freeWfSlots, vregAvail);

    if (!vregAvail) {
        ++numTimesWgBlockedDueVgprAlloc;
    }

    // Check how many WFs the WG needs against the LDS capacity
    if (!lds.canReserve(ndr->q.ldsSize)) {
        wgBlockedDueLdsAllocation++;
    }

    // Return true if (a) there are enough free WF slots to submit the
    // workgroup, (b) there are enough VGPRs to schedule all WFs to their
    // SIMD units, and (c) there is enough space in LDS
    return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
}

int
ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
{
    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
    int ccnt = 0;

    for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
        for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
            Wavefront *w = wfList[i_simd][i_wf];

            if (w->status == Wavefront::S_RUNNING) {
                DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);

                DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
                        w->barrierId, _barrier_id);

                DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
                        w->barrierCnt, bcnt);
            }

            if (w->status == Wavefront::S_RUNNING &&
                w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
                !w->outstandingReqs) {
                ++ccnt;

                DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
                        "%d\n", i_simd, i_wf, ccnt);
            }
        }
    }

    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
            cu_id, ccnt, bslots);
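
    // The barrier completes once every wave the work-group occupies has
    // arrived, i.e., when the count of waves at the barrier (ccnt) equals
    // the work-group's barrier slot count (bslots).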
    return ccnt == bslots;
}

// Check if the current wavefront is blocked on additional resources.
bool
ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
{
    bool cede = false;

    // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
    // magic instructions will impact the scheduling of wavefronts
    if (xact_cas_mode) {
        /*
         * When a wavefront calls xact_cas_ld, it adds itself to a per address
         * queue. All per address queues are managed by the xactCasLoadMap.
         *
         * A wavefront is not blocked if: it is not in ANY per address queue
         * or if it is at the head of a per address queue.
         */
        for (auto itMap : xactCasLoadMap) {
            std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;

            if (!curWaveIDQueue.empty()) {
                for (auto it : curWaveIDQueue) {
                    waveIdentifier cur_wave = it;

                    if (cur_wave.simdId == simdId &&
                        cur_wave.wfSlotId == wfSlotId) {
                        // 2 possibilities
                        // 1: this WF has a green light
                        // 2: another WF has a green light
                        waveIdentifier owner_wave = curWaveIDQueue.front();

                        if (owner_wave.simdId != cur_wave.simdId ||
                            owner_wave.wfSlotId != cur_wave.wfSlotId) {
                            // possibility 2
                            cede = true;
                            break;
                        } else {
                            // possibility 1
                            break;
                        }
                    }
                }
            }
        }
    }

    return cede;
}

// Execute one clock worth of work on the ComputeUnit.
void
ComputeUnit::exec()
{
    updateEvents();
    // Execute pipeline stages in reverse order to simulate
    // the pipeline latency
    globalMemoryPipe.exec();
    localMemoryPipe.exec();
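    // Running the stages back-to-front within a tick means each stage
    // consumes state its upstream neighbor produced on the previous cycle,
    // giving each instruction one cycle of latency per pipeline stage.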
    execStage.exec();
    scheduleStage.exec();
    scoreboardCheckStage.exec();
    fetchStage.exec();

    totalCycles++;
}

void
ComputeUnit::init()
{
    // Initialize CU Bus models
    glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
    locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
    nextGlbMemBus = 0;
    nextLocMemBus = 0;
    fatal_if(numGlbMemUnits > 1,
             "No support for multiple Global Memory Pipelines exists!!!");
    vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
    for (int j = 0; j < numGlbMemUnits; ++j) {
        vrfToGlobalMemPipeBus[j] = WaitClass();
        vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
    }

    fatal_if(numLocMemUnits > 1,
             "No support for multiple Local Memory Pipelines exists!!!");
    vrfToLocalMemPipeBus.resize(numLocMemUnits);
    for (int j = 0; j < numLocMemUnits; ++j) {
        vrfToLocalMemPipeBus[j] = WaitClass();
        vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
    }
    vectorRegsReserved.resize(numSIMDs, 0);
    aluPipe.resize(numSIMDs);
    wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);

    for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
        wfWait[i] = WaitClass();
        wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
    }

    for (int i = 0; i < numSIMDs; ++i) {
        aluPipe[i] = WaitClass();
        aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
    }

    // Setup space for call args
    for (int j = 0; j < numSIMDs; ++j) {
        for (int i = 0; i < shader->n_wf; ++i) {
            wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
        }
    }

    // Initializing pipeline resources
    readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
    waveStatusList.resize(numSIMDs);

    for (int j = 0; j < numSIMDs; ++j) {
        for (int i = 0; i < shader->n_wf; ++i) {
            waveStatusList[j].push_back(
                std::make_pair(wfList[j][i], BLOCKED));
        }
    }

    for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
        dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
    }

    fetchStage.init(this);
    scoreboardCheckStage.init(this);
    scheduleStage.init(this);
    execStage.init(this);
    globalMemoryPipe.init(this);
    localMemoryPipe.init(this);
    // initialize state for statistics calculation
    vectorAluInstAvail.resize(numSIMDs, false);
    shrMemInstAvail = 0;
    glbMemInstAvail = 0;

    gmTokenPort.setTokenManager(memPortTokens);
}

bool
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
{
    // Ruby has completed the memory op. Schedule the mem_resp_event at the
    // appropriate cycle to process the timing memory response
    // This delay represents the pipeline delay
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    int index = sender_state->port_index;
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;

    // Is the packet returned a Kernel End or Barrier
    if (pkt->req->isKernel() && pkt->req->isRelease()) {
        Wavefront *w =
            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];

        // Check if we are waiting on Kernel End Release
        if (w->status == Wavefront::S_RETURNING) {
            DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
                    computeUnit->cu_id, w->simdId, w->wfSlotId,
                    w->wfDynId, w->kernId);

            computeUnit->shader->dispatcher->notifyWgCompl(w);
            w->status = Wavefront::S_STOPPED;
        } else {
            w->outstandingReqs--;
        }

        DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, w->barrierCnt);

        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }

        delete pkt->senderState;
        delete pkt;
        return true;
    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }

        delete pkt->senderState;
        delete pkt;
        return true;
    }

    EventFunctionWrapper *mem_resp_event =
        computeUnit->memPort[index]->createMemRespEvent(pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);
    return true;
}

void
ComputeUnit::DataPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        /** Currently Ruby can return false due to conflicts for the particular
         *  cache block or address.  Thus other requests should be allowed to
         *  pass and the data port should expect multiple retries. */
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
}

bool
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
    computeUnit->fetchStage.processFetchReturn(pkt);

    return true;
}

void
ComputeUnit::SQCPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        Wavefront *wavefront M5_VAR_USED = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
}

void
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
{
    // There must be a way around this check to do the globalMemStart...
    Addr tmp_vaddr = pkt->req->getVaddr();

    updatePageDivergenceDist(tmp_vaddr);

    // set PC in request
    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    // figure out the type of the request to set read/write
    BaseTLB::Mode TLB_mode;
    assert(pkt->isRead() || pkt->isWrite());

    // Check write before read for atomic operations
    // since atomic operations should use BaseTLB::Write
    if (pkt->isWrite()) {
        TLB_mode = BaseTLB::Write;
    } else if (pkt->isRead()) {
        TLB_mode = BaseTLB::Read;
    } else {
        fatal("pkt is not a read nor a write\n");
    }

    tlbCycles -= curTick();
    ++tlbRequests;

    int tlbPort_index = perLaneTLB ? index : 0;

    if (shader->timingSim) {
        if (debugSegFault) {
            Process *p = shader->gpuTc->getProcessPtr();
            Addr vaddr = pkt->req->getVaddr();
            unsigned size = pkt->getSize();
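
            // The check below flags accesses that straddle a 64-byte
            // boundary: e.g., vaddr 0x3c with size 8 ends at 0x43, and
            // (0x43 % 64) == 3 is less than (0x3c % 64) == 60, so the
            // access wraps into the next 64-byte chunk.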
            if ((vaddr + size - 1) % 64 < vaddr % 64) {
                panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
            }

            Addr paddr;

            if (!p->pTable->translate(vaddr, paddr)) {
                if (!p->fixupFault(vaddr)) {
                    panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                          vaddr);
                }
            }
        }

        // This is the SenderState needed upon return
        pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);

        // This is the senderState needed by the TLB hierarchy to function
        TheISA::GpuTLB::TranslationState *translation_state =
            new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc,
                                                 false, pkt->senderState);

        pkt->senderState = translation_state;

        if (functionalTLB) {
            tlbPort[tlbPort_index]->sendFunctional(pkt);

            // update the hitLevel distribution
            int hit_level = translation_state->hitLevel;
            assert(hit_level != -1);
            hitsPerTLBLevel[hit_level]++;

            // New SenderState for the memory access
            X86ISA::GpuTLB::TranslationState *sender_state =
                safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

            delete sender_state->tlbEntry;
            delete sender_state->saved;
            delete sender_state;

            assert(pkt->req->hasPaddr());
            assert(pkt->req->hasSize());

            uint8_t *tmpData = pkt->getPtr<uint8_t>();

            // this is necessary because the GPU TLB receives packets instead
            // of requests. when the translation is complete, all relevant
            // fields in the request will be populated, but not in the packet.
            // here we create the new packet so we can set the size, addr,
            // and proper flags.
            PacketPtr oldPkt = pkt;
            pkt = new Packet(oldPkt->req, oldPkt->cmd);
            delete oldPkt;
            pkt->dataStatic(tmpData);

            // New SenderState for the memory access
            pkt->senderState = new ComputeUnit::DataPort::SenderState(
                gpuDynInst, index, nullptr);

            gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
            gpuDynInst->tlbHitLevel[index] = hit_level;

            // translation is done. Schedule the mem_req_event at the
            // appropriate cycle to send the timing memory request to ruby
            EventFunctionWrapper *mem_req_event =
                memPort[index]->createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                    "scheduled\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, index, pkt->req->getPaddr());

            schedule(mem_req_event, curTick() + req_tick_latency);
        } else if (tlbPort[tlbPort_index]->isStalled()) {
            assert(tlbPort[tlbPort_index]->retries.size() > 0);

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index]->retries.push_back(pkt);
        } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
            // Stall the data port;
            // No more packet will be issued till
            // ruby indicates resources are freed by
            // a recvReqRetry() call back on this port.
            tlbPort[tlbPort_index]->stallPort();

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index]->retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB,
                    "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                    tmp_vaddr);
        }
    } else {
        if (pkt->cmd == MemCmd::MemFenceReq) {
            gpuDynInst->statusBitVector = VectorMask(0);
        } else {
            gpuDynInst->statusBitVector &= (~(1ll << index));
        }

        // New SenderState for the memory access
        delete pkt->senderState;

        // Because it's atomic operation, only need TLB translation state
        pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
                                                                shader->gpuTc);

        tlbPort[tlbPort_index]->sendFunctional(pkt);

        // the addr of the packet is not modified, so we need to create a new
        // packet, or otherwise the memory access will have the old virtual
        // address sent in the translation packet, instead of the physical
        // address returned by the translation.
        PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
        new_pkt->dataStatic(pkt->getPtr<uint8_t>());

        // Translation is done. It is safe to send the packet to memory.
        memPort[0]->sendFunctional(new_pkt);

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
                gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
                new_pkt->req->getPaddr());

        // safe_cast the senderState
        TheISA::GpuTLB::TranslationState *sender_state =
             safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete new_pkt;
        delete pkt->senderState;
        delete pkt;
    }
}

void
ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index,
                             PacketPtr pkt)
{
    EventFunctionWrapper *mem_req_event =
        memPort[index]->createMemReqEvent(pkt);

    // New SenderState for the memory access
    pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
                                                              index, nullptr);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
            cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            pkt->req->getPaddr());

    schedule(mem_req_event, curTick() + req_tick_latency);
}

void
ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
                                  RequestPtr req)
{
    assert(gpuDynInst->isGlobalSeg());

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
    }
    req->setPaddr(0);
    if (kernelLaunch) {
        req->setFlags(Request::KERNEL);
    }

    // for non-kernel MemFence operations, memorder flags are set depending
    // on which type of request is currently being sent, so this
    // should be set by the caller (e.g. if an inst has acq-rel
    // semantics, it will send one acquire req and one release req)
    gpuDynInst->setRequestFlags(req, kernelLaunch);

    // a mem fence must correspond to an acquire/release request
    assert(req->isAcquire() || req->isRelease());

    // create packet
    PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);

    // set packet's sender state
    pkt->senderState =
        new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);

    // send the packet
    sendSyncRequest(gpuDynInst, 0, pkt);
}

void
ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
{
    DataPort::SenderState *sender_state =
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit = computeUnit;

    assert(gpuDynInst);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), index);

    Addr paddr = pkt->req->getPaddr();

    if (pkt->cmd != MemCmd::MemFenceResp) {
        int index = gpuDynInst->memStatusVector[paddr].back();

        DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
                pkt->req->getPaddr(), index);

        gpuDynInst->memStatusVector[paddr].pop_back();
        gpuDynInst->pAddr = pkt->req->getPaddr();

        if (pkt->isRead() || pkt->isWrite()) {

            if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
                gpuDynInst->statusBitVector &= (~(1ULL << index));
            } else {
                assert(gpuDynInst->statusVector[index] > 0);
                gpuDynInst->statusVector[index]--;

                if (!gpuDynInst->statusVector[index])
                    gpuDynInst->statusBitVector &= (~(1ULL << index));
            }

            DPRINTF(GPUMem, "bitvector is now %#x\n",
                    gpuDynInst->statusBitVector);

            if (gpuDynInst->statusBitVector == VectorMask(0)) {
                auto iter = gpuDynInst->memStatusVector.begin();
                auto end = gpuDynInst->memStatusVector.end();

                while (iter != end) {
                    assert(iter->second.empty());
                    ++iter;
                }

                gpuDynInst->memStatusVector.clear();

                if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    gpuDynInst->statusVector.clear();

                compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);

                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                        compute_unit->cu_id, gpuDynInst->simdId,
                        gpuDynInst->wfSlotId);

                // after clearing the status vectors,
                // see if there is a continuation to perform
                // the continuation may generate more work for
                // this memory request
                if (gpuDynInst->useContinuation) {
                    assert(!gpuDynInst->isNoScope());
                    gpuDynInst->execContinuation(
                        gpuDynInst->staticInstruction(),
                        gpuDynInst);
                }
            }
        }
    } else {
        gpuDynInst->statusBitVector = VectorMask(0);

        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }
    }

    delete pkt->senderState;
    delete pkt;
}

ComputeUnit*
ComputeUnitParams::create()
{
    return new ComputeUnit(this);
}

bool
ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    assert(pkt->senderState);
    computeUnit->tlbCycles += curTick();

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // no PageFaults are permitted for data accesses
    if (!translation_state->tlbEntry) {
        DTLBPort::SenderState *sender_state =
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

        Wavefront *w M5_VAR_USED =
            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
            [sender_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    }

    // update the hitLevel distribution
    int hit_level = translation_state->hitLevel;
    computeUnit->hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;

    // for prefetch pkt
    BaseTLB::Mode TLB_mode = translation_state->tlbMode;

    delete translation_state;

    // use the original sender state to know how to close this transaction
    DTLBPort::SenderState *sender_state =
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    int mp_index = sender_state->portIndex;
    Addr vaddr = pkt->req->getVaddr();
    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

    MemCmd requestCmd;

    if (pkt->cmd == MemCmd::ReadResp) {
        requestCmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        requestCmd = MemCmd::WriteReq;
    } else if (pkt->cmd == MemCmd::SwapResp) {
        requestCmd = MemCmd::SwapReq;
    } else {
        panic("unsupported response to request conversion %s\n",
              pkt->cmd.toString());
    }

    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;
        Addr last = 0;

        switch(computeUnit->prefetchType) {
        case Enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
        case Enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
        case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
        default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
                     roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
                     : 0;

        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);

        // Prefetch Next few pages atomically
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
                    vaddr + stride * pf * TheISA::PageBytes);

            if (!stride)
                break;

            RequestPtr prefetch_req = std::make_shared<Request>(
                vaddr + stride * pf * TheISA::PageBytes,
                sizeof(uint8_t), 0,
                computeUnit->masterId(),
                0, 0, nullptr);

            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
            uint8_t foo = 0;
            prefetch_pkt->dataStatic(&foo);

            // Because it's atomic operation, only need TLB translation state
            prefetch_pkt->senderState =
                new TheISA::GpuTLB::TranslationState(TLB_mode,
                                                     computeUnit->shader->gpuTc,
                                                     true);

            // Currently prefetches are zero-latency, hence the sendFunctional
            sendFunctional(prefetch_pkt);

            /* safe_cast the senderState */
            TheISA::GpuTLB::TranslationState *tlb_state =
                safe_cast<TheISA::GpuTLB::TranslationState*>(
                        prefetch_pkt->senderState);

            delete tlb_state->tlbEntry;
            delete tlb_state;
            delete prefetch_pkt;
        }
    }

    // First we must convert the response cmd back to a request cmd so that
    // the request can be sent through the cu's master port
    PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
    new_pkt->dataStatic(pkt->getPtr<uint8_t>());
    delete pkt->senderState;
    delete pkt;

    // New SenderState for the memory access
    new_pkt->senderState =
        new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
                                               nullptr);

    // translation is done. Schedule the mem_req_event at the appropriate
    // cycle to send the timing memory request to ruby
    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);

    return true;
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
}

void
ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
{
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit M5_VAR_USED = computeUnit;

    if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index,
                pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index,
                pkt->req->getPaddr());
    }
}

/*
 * The initial translation request could have been rejected,
 * if <retries> queue is not empty. Retry sending the translation
 * request. sendRetry() is called from the peer port whenever
 * a translation completes.
 */
void
ComputeUnit::DTLBPort::recvReqRetry()
{
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());
    // recvReqRetry is an indication that the resource on which this
    // port was stalling on is freed. So, remove the stall first
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address%#x", vaddr);

        if (!sendTimingReq(pkt)) {
            // Stall port
            stallPort();
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}

bool
ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line M5_VAR_USED = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

    assert(pkt->senderState);

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;
    delete translation_state;

    // use the original sender state to know how to close this transaction
    ITLBPort::SenderState *sender_state =
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);

    // get the wavefront associated with this translation request
    Wavefront *wavefront = sender_state->wavefront;
    delete pkt->senderState;

    if (success) {
        // pkt is reused in fetch(), don't delete it here.  However, we must
        // reset the command to be a request so that it can be sent through
        // the cu's master port
        assert(pkt->cmd == MemCmd::ReadResp);
        pkt->cmd = MemCmd::ReadReq;

        computeUnit->fetchStage.fetch(pkt, wavefront);
    } else {
        if (wavefront->dropFetch) {
            assert(wavefront->instructionBuffer.empty());
            wavefront->dropFetch = false;
        }

        wavefront->pendingFetch = 0;
    }

    return true;
}

/*
 * The initial translation request could have been rejected, if
 * <retries> queue is not empty. Retry sending the translation
 * request. sendRetry() is called from the peer port whenever
 * a translation completes.
 */
void
ComputeUnit::ITLBPort::recvReqRetry()
{

    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());

    // recvReqRetry is an indication that the resource on which this
    // port was stalling on is freed. So, remove the stall first
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address%#x", vaddr);

        if (!sendTimingReq(pkt)) {
            stallPort(); // Stall port
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}

void
ComputeUnit::regStats()
{
    ClockedObject::regStats();

    vALUInsts
        .name(name() + ".valu_insts")
        .desc("Number of vector ALU insts issued.")
        ;
    vALUInstsPerWF
        .name(name() + ".valu_insts_per_wf")
        .desc("The avg. number of vector ALU insts issued per-wavefront.")
        ;
    sALUInsts
        .name(name() + ".salu_insts")
        .desc("Number of scalar ALU insts issued.")
        ;
    sALUInstsPerWF
        .name(name() + ".salu_insts_per_wf")
        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
        ;
    instCyclesVALU
        .name(name() + ".inst_cycles_valu")
        .desc("Number of cycles needed to execute VALU insts.")
        ;
    instCyclesSALU
        .name(name() + ".inst_cycles_salu")
        .desc("Number of cycles needed to execute SALU insts.")
        ;
    threadCyclesVALU
        .name(name() + ".thread_cycles_valu")
        .desc("Number of thread cycles used to execute vector ALU ops. "
              "Similar to instCyclesVALU but multiplied by the number of "
              "active threads.")
        ;
    vALUUtilization
        .name(name() + ".valu_utilization")
        .desc("Percentage of active vector ALU threads in a wave.")
        ;
    ldsNoFlatInsts
        .name(name() + ".lds_no_flat_insts")
        .desc("Number of LDS insts issued, not including FLAT "
              "accesses that resolve to LDS.")
        ;
    ldsNoFlatInstsPerWF
        .name(name() + ".lds_no_flat_insts_per_wf")
        .desc("The avg. number of LDS insts (not including FLAT "
              "accesses that resolve to LDS) per-wavefront.")
        ;
    flatVMemInsts
        .name(name() + ".flat_vmem_insts")
        .desc("The number of FLAT insts that resolve to vmem issued.")
        ;
    flatVMemInstsPerWF
        .name(name() + ".flat_vmem_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to vmem "
              "issued per-wavefront.")
        ;
    flatLDSInsts
        .name(name() + ".flat_lds_insts")
        .desc("The number of FLAT insts that resolve to LDS issued.")
        ;
    flatLDSInstsPerWF
        .name(name() + ".flat_lds_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to LDS "
              "issued per-wavefront.")
        ;
    vectorMemWrites
        .name(name() + ".vector_mem_writes")
        .desc("Number of vector mem write insts (excluding FLAT insts).")
        ;
    vectorMemWritesPerWF
        .name(name() + ".vector_mem_writes_per_wf")
        .desc("The average number of vector mem write insts "
              "(excluding FLAT insts) per-wavefront.")
        ;
    vectorMemReads
        .name(name() + ".vector_mem_reads")
        .desc("Number of vector mem read insts (excluding FLAT insts).")
        ;
    vectorMemReadsPerWF
        .name(name() + ".vector_mem_reads_per_wf")
        .desc("The avg. number of vector mem read insts (excluding "
              "FLAT insts) per-wavefront.")
        ;
    scalarMemWrites
        .name(name() + ".scalar_mem_writes")
        .desc("Number of scalar mem write insts.")
        ;
    scalarMemWritesPerWF
        .name(name() + ".scalar_mem_writes_per_wf")
        .desc("The average number of scalar mem write insts per-wavefront.")
        ;
    scalarMemReads
        .name(name() + ".scalar_mem_reads")
        .desc("Number of scalar mem read insts.")
        ;
    scalarMemReadsPerWF
        .name(name() + ".scalar_mem_reads_per_wf")
        .desc("The average number of scalar mem read insts per-wavefront.")
        ;

    vALUInstsPerWF = vALUInsts / completedWfs;
    sALUInstsPerWF = sALUInsts / completedWfs;
    vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
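    // Note: the literal 64 above assumes the default 64-lane wavefront
    // width; with every lane of such a wave active on each VALU cycle this
    // expression evaluates to 100 (percent).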
    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
    vectorMemReadsPerWF = vectorMemReads / completedWfs;
    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
    scalarMemReadsPerWF = scalarMemReads / completedWfs;

    tlbCycles
        .name(name() + ".tlb_cycles")
        .desc("total number of cycles for all uncoalesced requests")
        ;

    tlbRequests
        .name(name() + ".tlb_requests")
        .desc("number of uncoalesced requests")
        ;

    tlbLatency
        .name(name() + ".avg_translation_latency")
        .desc("Avg. translation latency for data translations")
        ;

    tlbLatency = tlbCycles / tlbRequests;

    hitsPerTLBLevel
        .init(4)
        .name(name() + ".TLB_hits_distribution")
        .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
        ;

    // fixed number of TLB levels
    for (int i = 0; i < 4; ++i) {
        if (!i)
            hitsPerTLBLevel.subname(i,"page_table");
        else
            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
    }

    execRateDist
        .init(0, 10, 2)
        .name(name() + ".inst_exec_rate")
        .desc("Instruction Execution Rate: Number of executed vector "
              "instructions per cycle")
        ;

    ldsBankConflictDist
        .init(0, wfSize(), 2)
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;

    ldsBankAccesses
        .name(name() + ".lds_bank_access_cnt")
        .desc("Total number of LDS bank accesses")
        ;

    pageDivergenceDist
        // A wavefront can touch up to N pages per memory instruction where
        // N is equal to the wavefront size
        // The number of pages per bin can be configured (here it's 4).
        .init(1, wfSize(), 4)
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;

    controlFlowDivergenceDist
        .init(1, wfSize(), 4)
        .name(name() + ".warp_execution_dist")
        .desc("number of lanes active per instruction (over all instructions)")
        ;

    activeLanesPerGMemInstrDist
        .init(1, wfSize(), 4)
        .name(name() + ".gmem_lanes_execution_dist")
        .desc("number of active lanes per global memory instruction")
        ;

    activeLanesPerLMemInstrDist
        .init(1, wfSize(), 4)
        .name(name() + ".lmem_lanes_execution_dist")
        .desc("number of active lanes per local memory instruction")
        ;

    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed")
        ;

    numVecOpsExecuted
        .name(name() + ".num_vec_ops_executed")
        .desc("number of vec ops executed (e.g. WF size/inst)")
        ;

    totalCycles
        .name(name() + ".num_total_cycles")
        .desc("number of cycles the CU ran for")
        ;

    ipc
        .name(name() + ".ipc")
        .desc("Instructions per cycle (this CU only)")
        ;

    vpc
        .name(name() + ".vpc")
        .desc("Vector Operations per cycle (this CU only)")
        ;

    numALUInstsExecuted
        .name(name() + ".num_alu_insts_executed")
        .desc("Number of dynamic non-GM memory insts executed")
        ;

    wgBlockedDueLdsAllocation
        .name(name() + ".wg_blocked_due_lds_alloc")
        .desc("Workgroup blocked due to LDS capacity")
        ;

    ipc = numInstrExecuted / totalCycles;
    vpc = numVecOpsExecuted / totalCycles;

    numTimesWgBlockedDueVgprAlloc
        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
        .desc("Number of times WGs are blocked due to VGPR allocation per "
              "SIMD")
        ;

    dynamicGMemInstrCnt
        .name(name() + ".global_mem_instr_cnt")
        .desc("dynamic global memory instructions count")
        ;

    dynamicLMemInstrCnt
        .name(name() + ".local_mem_instr_cnt")
        .desc("dynamic local memory instruction count")
        ;

    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
        dynamicLMemInstrCnt;

    completedWfs
        .name(name() + ".num_completed_wfs")
        .desc("number of completed wavefronts")
        ;

    numCASOps
        .name(name() + ".num_CAS_ops")
        .desc("number of compare and swap operations")
        ;

    numFailedCASOps
        .name(name() + ".num_failed_CAS_ops")
        .desc("number of compare and swap operations that failed")
        ;

    // register stats of pipeline stages
    fetchStage.regStats();
    scoreboardCheckStage.regStats();
    scheduleStage.regStats();
    execStage.regStats();

    // register stats of memory pipeline
    globalMemoryPipe.regStats();
    localMemoryPipe.regStats();
}

void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            sALUInsts++;
            instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            vALUInsts++;
            instCyclesVALU++;
            threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                flatLDSInsts++;
            } else {
                flatVMemInsts++;
            }
        } else if (gpuDynInst->isLocalMem()) {
            ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            vectorMemWrites++;
        }
    }
}

void
ComputeUnit::updatePageDivergenceDist(Addr addr)
{
    Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);

    if (!pagesTouched.count(virt_page_addr))
        pagesTouched[virt_page_addr] = 1;
    else
        pagesTouched[virt_page_addr]++;
}

void
ComputeUnit::CUExitCallback::process()
{
    if (computeUnit->countPages) {
        std::ostream *page_stat_file =
            simout.create(computeUnit->name().c_str())->stream();

        *page_stat_file << "page, wavefront accesses, workitem accesses" <<
            std::endl;
        for (auto iter : computeUnit->pageAccesses) {
            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << iter.second.first << ",";
            *page_stat_file << std::dec << iter.second.second << std::endl;
        }
    }
}

bool
ComputeUnit::isDone() const
{
    for (int i = 0; i < numSIMDs; ++i) {
        if (!isSimdDone(i)) {
            return false;
        }
    }

    bool glbMemBusRdy = true;
    for (int j = 0; j < numGlbMemUnits; ++j) {
        glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
    }
    bool locMemBusRdy = true;
    for (int j = 0; j < numLocMemUnits; ++j) {
        locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
    }

    if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
        !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
        !globalMemoryPipe.isGMReqFIFOWrRdy() ||
        !localMemoryPipe.isLMReqFIFOWrRdy() ||
        !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
        !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
        return false;
    }

    return true;
}

int32_t
ComputeUnit::getRefCounter(const uint32_t dispatchId,
                           const uint32_t wgId) const
{
    return lds.getRefCounter(dispatchId, wgId);
}

bool
ComputeUnit::isSimdDone(uint32_t simdId) const
{
    assert(simdId < numSIMDs);

    for (int i = 0; i < numGlbMemUnits; ++i) {
        if (!vrfToGlobalMemPipeBus[i].rdy())
            return false;
    }
    for (int i = 0; i < numLocMemUnits; ++i) {
        if (!vrfToLocalMemPipeBus[i].rdy())
            return false;
    }
    if (!aluPipe[simdId].rdy()) {
        return false;
    }

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
        if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
            return false;
        }
    }

    return true;
}

/**
 * send a general request to the LDS
 * make sure to look at the return value here as your request might be
 * NACK'd and returning false means that you have to have some backup plan
 */
bool
ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
{
    // this is just a request to carry the GPUDynInstPtr
    // back and forth
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);

    // ReadReq is not evaluated by the LDS but the Packet ctor requires this
    PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);

    // This is the SenderState needed upon return
    newPacket->senderState = new LDSPort::SenderState(gpuDynInst);

    return ldsPort->sendTimingReq(newPacket);
}

/**
 * get the result of packets sent to the LDS when they return
 */
bool
ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
{
    const ComputeUnit::LDSPort::SenderState *senderState =
        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(packet->senderState);

    fatal_if(!senderState, "did not get the right sort of sender state");

    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

    delete packet->senderState;

    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
    return true;
}

/**
 * attempt to send this packet, either the port is already stalled, the
 * request is nack'd and must stall, or the request goes through;
 * when a request cannot be sent, add it to the retries queue
 */
bool
ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
{
    ComputeUnit::LDSPort::SenderState *sender_state =
        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
    fatal_if(!sender_state, "packet without a valid sender state");

    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();

    if (isStalled()) {
        fatal_if(retries.empty(), "must have retries waiting to be stalled");

        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
        return false;
    } else if (!MasterPort::sendTimingReq(pkt)) {
        // need to stall the LDS port until a recvReqRetry() is received
        // this indicates that there is more space
        stallPort();
        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return false;
    } else {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return true;
    }
}

/**
 * the bus is telling the port that there is now space so retrying stalled
 * requests should work now
 * this allows the port to have a request be nack'd and then have the
 * receiver say when there is space, rather than simply retrying the send
 * every cycle
 */
void
ComputeUnit::LDSPort::recvReqRetry()
{
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");

    unstallPort();

    while (!retries.empty()) {
        PacketPtr packet = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!MasterPort::sendTimingReq(packet)) {
            // Stall port
            stallPort();
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": LDS send successful\n");
            retries.pop();
        }
    }
}