1 /*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "gpu-compute/compute_unit.hh"
35
36 #include <limits>
37
38 #include "arch/x86/isa_traits.hh"
39 #include "base/output.hh"
40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUExec.hh"
42 #include "debug/GPUFetch.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUPort.hh"
45 #include "debug/GPUPrefetch.hh"
46 #include "debug/GPUReg.hh"
47 #include "debug/GPURename.hh"
48 #include "debug/GPUSync.hh"
49 #include "debug/GPUTLB.hh"
50 #include "gpu-compute/dispatcher.hh"
51 #include "gpu-compute/gpu_dyn_inst.hh"
52 #include "gpu-compute/gpu_static_inst.hh"
53 #include "gpu-compute/scalar_register_file.hh"
54 #include "gpu-compute/shader.hh"
55 #include "gpu-compute/simple_pool_manager.hh"
56 #include "gpu-compute/vector_register_file.hh"
57 #include "gpu-compute/wavefront.hh"
58 #include "mem/page_table.hh"
59 #include "sim/process.hh"
60 #include "sim/sim_exit.hh"
61
62 ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
63 numVectorGlobalMemUnits(p->num_global_mem_pipes),
64 numVectorSharedMemUnits(p->num_shared_mem_pipes),
65 numScalarMemUnits(p->num_scalar_mem_pipes),
66 numVectorALUs(p->num_SIMDs),
67 numScalarALUs(p->num_scalar_cores),
68 vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
69 coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
70 registerManager(p->register_manager),
71 fetchStage(p, *this),
72 scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
73 scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
74 execStage(p, *this, scheduleToExecute),
75 globalMemoryPipe(p, *this),
76 localMemoryPipe(p, *this),
77 scalarMemoryPipe(p, *this),
78 tickEvent([this]{ exec(); }, "Compute unit tick event",
79 false, Event::CPU_Tick_Pri),
80 cu_id(p->cu_id),
81 vrf(p->vector_register_file), srf(p->scalar_register_file),
82 simdWidth(p->simd_width),
83 spBypassPipeLength(p->spbypass_pipe_length),
84 dpBypassPipeLength(p->dpbypass_pipe_length),
85 scalarPipeStages(p->scalar_pipe_length),
86 operandNetworkLength(p->operand_network_length),
87 issuePeriod(p->issue_period),
88 vrf_gm_bus_latency(p->vrf_gm_bus_latency),
89 srf_scm_bus_latency(p->srf_scm_bus_latency),
90 vrf_lm_bus_latency(p->vrf_lm_bus_latency),
91 perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
92 prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
93 debugSegFault(p->debugSegFault),
94 functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
95 countPages(p->countPages),
96 req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
97 resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
98 _requestorId(p->system->getRequestorId(this, "ComputeUnit")),
99 lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
100 ldsPort(csprintf("%s-port", name()), this),
101 scalarDataPort(csprintf("%s-port", name()), this),
102 scalarDTLBPort(csprintf("%s-port", name()), this),
103 sqcPort(csprintf("%s-port", name()), this),
104 sqcTLBPort(csprintf("%s-port", name()), this),
105 _cacheLineSize(p->system->cacheLineSize()),
106 _numBarrierSlots(p->num_barrier_slots),
107 globalSeqNum(0), wavefrontSize(p->wf_size),
108 scoreboardCheckToSchedule(p),
109 scheduleToExecute(p)
110 {
111 /**
112 * This check is necessary because std::bitset only provides conversion
113 * to unsigned long or unsigned long long via to_ulong() or to_ullong().
114  * There are a few places in the code where to_ullong() is used; however,
115  * if wavefrontSize is larger than what the host can support, then
116  * bitset will throw a runtime exception. We should remove all use of
117  * to_ulong() or to_ullong() so we can have wavefrontSize greater than 64b,
118 * however until that is done this assert is required.
119 */
120 fatal_if(p->wf_size > std::numeric_limits<unsigned long long>::digits ||
121 p->wf_size <= 0,
122 "WF size is larger than the host can support or is not positive");
123 fatal_if(!isPowerOf2(wavefrontSize),
124 "Wavefront size should be a power of 2");
125 // calculate how many cycles a vector load or store will need to transfer
126 // its data over the corresponding buses
127 numCyclesPerStoreTransfer =
128 (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
129 (double)vrfToCoalescerBusWidth);
130
131 numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
132 / coalescerToVrfBusWidth;
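// e.g. (illustrative values only): a 64-lane wavefront of 32-bit elements
// over a 32-byte bus needs ceil(256 / 32) = 8 cycles per transfer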
133
134 // Initialization: all WF slots are assumed STOPPED
135 idleWfs = p->n_wf * numVectorALUs;
136 lastVaddrWF.resize(numVectorALUs);
137 wfList.resize(numVectorALUs);
138
139 wfBarrierSlots.resize(p->num_barrier_slots, WFBarrier());
140
141 for (int i = 0; i < p->num_barrier_slots; ++i) {
142 freeBarrierIds.insert(i);
143 }
144
145 for (int j = 0; j < numVectorALUs; ++j) {
146 lastVaddrWF[j].resize(p->n_wf);
147
148 for (int i = 0; i < p->n_wf; ++i) {
149 lastVaddrWF[j][i].resize(wfSize());
150
151 wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
152 wfList[j][i]->setParent(this);
153
154 for (int k = 0; k < wfSize(); ++k) {
155 lastVaddrWF[j][i][k] = 0;
156 }
157 }
158 }
159
160 lastVaddrSimd.resize(numVectorALUs);
161
162 for (int i = 0; i < numVectorALUs; ++i) {
163 lastVaddrSimd[i].resize(wfSize(), 0);
164 }
165
166 lastVaddrCU.resize(wfSize());
167
168 lds.setParent(this);
169
170 if (p->execPolicy == "OLDEST-FIRST") {
171 exec_policy = EXEC_POLICY::OLDEST;
172 } else if (p->execPolicy == "ROUND-ROBIN") {
173 exec_policy = EXEC_POLICY::RR;
174 } else {
175 fatal("Invalid WF execution policy (CU)\n");
176 }
177
178 for (int i = 0; i < p->port_memory_port_connection_count; ++i) {
179 memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
180 }
181
182 for (int i = 0; i < p->port_translation_port_connection_count; ++i) {
183 tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
184 }
185
186 // Setup tokens for response ports. The number of tokens in memPortTokens
187 // is the total token count for the entire vector port (i.e., this CU).
188 memPortTokens = new TokenManager(p->max_cu_tokens);
189
190 registerExitCallback([this]() { exitCallback(); });
191
192 lastExecCycle.resize(numVectorALUs, 0);
193
194 for (int i = 0; i < vrf.size(); ++i) {
195 vrf[i]->setParent(this);
196 }
197 for (int i = 0; i < srf.size(); ++i) {
198 srf[i]->setParent(this);
199 }
200 numVecRegsPerSimd = vrf[0]->numRegs();
201 numScalarRegsPerSimd = srf[0]->numRegs();
202
203 registerManager->setParent(this);
204
205 activeWaves = 0;
206
207 instExecPerSimd.resize(numVectorALUs, 0);
208
209 // Calculate the number of bits to address a cache line
210 panic_if(!isPowerOf2(_cacheLineSize),
211 "Cache line size should be a power of two.");
212 cacheLineBits = floorLog2(_cacheLineSize);
213 }
214
215 ComputeUnit::~ComputeUnit()
216 {
217 // Delete wavefront slots
218 for (int j = 0; j < numVectorALUs; ++j) {
219 for (int i = 0; i < shader->n_wf; ++i) {
220 delete wfList[j][i];
221 }
222 lastVaddrSimd[j].clear();
223 }
224 lastVaddrCU.clear();
225 }
226
227 int
228 ComputeUnit::numExeUnits() const
229 {
230 return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
231 numVectorSharedMemUnits + numScalarMemUnits;
232 }
233
234 // index into readyList of the first memory unit
235 int
236 ComputeUnit::firstMemUnit() const
237 {
238 return numVectorALUs + numScalarALUs;
239 }
240
241 // index into readyList of the last memory unit
242 int
243 ComputeUnit::lastMemUnit() const
244 {
245 return numExeUnits() - 1;
246 }
247
248 // index into scalarALUs vector of SALU used by the wavefront
249 int
250 ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
251 {
252 if (numScalarALUs == 1) {
253 return 0;
254 } else {
255 return w->simdId % numScalarALUs;
256 }
257 }
258
259 // index into readyList of Scalar ALU unit used by wavefront
260 int
261 ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
262 {
263 return numVectorALUs + mapWaveToScalarAlu(w);
264 }
265
266 // index into readyList of Global Memory unit used by wavefront
267 int
268 ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
269 {
270 // TODO: FIXME if more than 1 GM pipe supported
271 return numVectorALUs + numScalarALUs;
272 }
273
274 // index into readyList of Local Memory unit used by wavefront
275 int
276 ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
277 {
278 // TODO: FIXME if more than 1 LM pipe supported
279 return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
280 }
281
282 // index into readyList of Scalar Memory unit used by wavefront
283 int
284 ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
285 {
286 // TODO: FIXME if more than 1 ScM pipe supported
287 return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
288 numVectorSharedMemUnits;
289 }
290
291 void
292 ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
293 {
294 w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
295 w->workGroupSz[0] = task->wgSize(0);
296 w->workGroupSz[1] = task->wgSize(1);
297 w->workGroupSz[2] = task->wgSize(2);
298 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
299 w->gridSz[0] = task->gridSize(0);
300 w->gridSz[1] = task->gridSize(1);
301 w->gridSz[2] = task->gridSize(2);
302 w->computeActualWgSz(task);
303 }
304
305 void
306 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
307 HSAQueueEntry *task, int bar_id, bool fetchContext)
308 {
309 static int _n_wave = 0;
310
311 VectorMask init_mask;
312 init_mask.reset();
313
314 for (int k = 0; k < wfSize(); ++k) {
315 if (k + waveId * wfSize() < w->actualWgSzTotal)
316 init_mask[k] = 1;
317 }
318
319 w->execMask() = init_mask;
320
321 w->kernId = task->dispatchId();
322 w->wfId = waveId;
323 w->initMask = init_mask.to_ullong();
324
325 if (bar_id > WFBarrier::InvalidID) {
326 w->barrierId(bar_id);
327 } else {
328 assert(!w->hasBarrier());
329 }
330
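// Each lane's linear position within the WG is (k + waveId * wfSize());
// decompose it into 3-D work-item IDs and recombine them into the flat
// ID: x + y * sizeX + z * sizeX * sizeY.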
331 for (int k = 0; k < wfSize(); ++k) {
332 w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
333 w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
334 w->actualWgSz[1];
335 w->workItemId[2][k] = (k + waveId * wfSize()) /
336 (w->actualWgSz[0] * w->actualWgSz[1]);
337
338 w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
339 w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
340 w->workItemId[0][k];
341 }
342
343 // WG state
344 w->wgId = task->globalWgId();
345 w->dispatchId = task->dispatchId();
346 w->workGroupId[0] = w->wgId % task->numWg(0);
347 w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
348 w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
349
350 // set the wavefront context to have a pointer to this section of the LDS
351 w->ldsChunk = ldsChunk;
352
353 int32_t refCount M5_VAR_USED =
354 lds.increaseRefCounter(w->dispatchId, w->wgId);
355 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
356 cu_id, w->wgId, refCount);
357
358 w->instructionBuffer.clear();
359
360 if (w->pendingFetch)
361 w->dropFetch = true;
362
363 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
364 "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
365 w->simdId, w->wfSlotId, refCount);
366
367 w->initRegState(task, w->actualWgSzTotal);
368 w->start(_n_wave++, task->codeAddr());
369
370 waveLevelParallelism.sample(activeWaves);
371 activeWaves++;
372 }
373
374 /**
375  * Trigger an invalidate operation in the CU.
376  *
377  * req: request initialized in the shader, carrying the invalidate flags
378 */
379 void
380 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
381 GPUDynInstPtr gpuDynInst
382 = std::make_shared<GPUDynInst>(this, nullptr,
383 new KernelLaunchStaticInst(), getAndIncSeqNum());
384
385 // kern_id will be used in inv responses
386 gpuDynInst->kern_id = kernId;
387 // update contextId field
388 req->setContext(gpuDynInst->wfDynId);
389
390 injectGlobalMemFence(gpuDynInst, true, req);
391 }
392
393 /**
394  * Trigger a flush operation in the CU.
395  *
396  * gpuDynInst: the instruction passed along with the request
397 */
398 void
399 ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
400 injectGlobalMemFence(gpuDynInst, true);
401 }
402
403 void
404 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
405 {
406 // If we aren't ticking, start it up!
407 if (!tickEvent.scheduled()) {
408 DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
409 schedule(tickEvent, nextCycle());
410 }
411
412 // the kernel's invalidate must have finished before any wg dispatch
413 assert(task->isInvDone());
414
415 // reserve the LDS capacity allocated to the work group
416 // disambiguated by the dispatch ID and workgroup ID, which should be
417 // globally unique
418 LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
419 task->globalWgId(),
420 task->ldsSize());
421
422 panic_if(!ldsChunk, "was not able to reserve space for this WG");
423
424 // calculate the number of 32-bit vector registers required
425 // by each work item
426 int vregDemand = task->numVectorRegs();
427 int sregDemand = task->numScalarRegs();
428 int wave_id = 0;
429
430 int barrier_id = WFBarrier::InvalidID;
431
432 /**
433 * If this WG only has one WF it will not consume any barrier
434 * resources because it has no need of them.
435 */
436 if (num_wfs_in_wg > 1) {
437 /**
438 * Find a free barrier slot for this WG. Each WF in the WG will
439 * receive the same barrier ID.
440 */
441 barrier_id = getFreeBarrierId();
442 auto &wf_barrier = barrierSlot(barrier_id);
443 assert(!wf_barrier.maxBarrierCnt());
444 assert(!wf_barrier.numAtBarrier());
445 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
446
447 DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
448 "%d waves using this barrier.\n", cu_id, barrier_id,
449 num_wfs_in_wg);
450 }
451
452 // Assign WFs according to numWfsToSched vector, which is computed by
453 // hasDispResources()
454 for (int j = 0; j < shader->n_wf; ++j) {
455 for (int i = 0; i < numVectorALUs; ++i) {
456 Wavefront *w = wfList[i][j];
457 // Check if this wavefront slot is available and there are WFs
458 // remaining to be dispatched to current SIMD:
459 // the WF slot must be stopped and not waiting
460 // for a release to complete (i.e., not S_RETURNING)
461 if (w->getStatus() == Wavefront::S_STOPPED &&
462 numWfsToSched[i] > 0) {
463 // decrement number of WFs awaiting dispatch to current SIMD
464 numWfsToSched[i] -= 1;
465
466 fillKernelState(w, task);
467
468 DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
469 "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
470 vregDemand, sregDemand);
471
472 registerManager->allocateRegisters(w, vregDemand, sregDemand);
473
474 startWavefront(w, wave_id, ldsChunk, task, barrier_id);
475 ++wave_id;
476 }
477 }
478 }
479 }
480
481 void
482 ComputeUnit::insertInPipeMap(Wavefront *w)
483 {
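// record the seq num of the instruction at the head of this WF's
// instruction buffer, marking it as in flight in the pipeline map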
484 panic_if(w->instructionBuffer.empty(),
485 "Instruction Buffer of WF%d can't be empty", w->wgId);
486 GPUDynInstPtr ii = w->instructionBuffer.front();
487 pipeMap.emplace(ii->seqNum());
488 }
489
490 void
491 ComputeUnit::deleteFromPipeMap(Wavefront *w)
492 {
493 panic_if(w->instructionBuffer.empty(),
494 "Instruction Buffer of WF%d can't be empty", w->wgId);
495 GPUDynInstPtr ii = w->instructionBuffer.front();
496 // delete the dynamic instruction from the pipeline map
497 auto it = pipeMap.find(ii->seqNum());
498 panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
499 pipeMap.erase(it);
500 }
501
502 bool
503 ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
504 {
505 // compute true size of workgroup (after clamping to grid size)
506 int trueWgSize[HSAQueueEntry::MAX_DIM];
507 int trueWgSizeTotal = 1;
508
509 for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
510 trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
511 task->wgId(d) * task->wgSize(d));
512
513 trueWgSizeTotal *= trueWgSize[d];
514 DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
515 }
516
517 DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
518
519 // calculate the number of WFs in this WG
520 int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
521 num_wfs_in_wg = numWfs;
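// ceiling division: e.g., a 300 work-item WG with a 64-wide wavefront
// (illustrative values) needs ceil(300 / 64) = 5 WFs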
522
523 bool barrier_avail = true;
524
525 if (numWfs > 1 && !freeBarrierIds.size()) {
526 barrier_avail = false;
527 }
528
529 // calculate the number of 32-bit vector registers required by each
530 // work item of the work group
531 int vregDemandPerWI = task->numVectorRegs();
532 // calculate the number of 32-bit scalar registers required by each
533 // work item of the work group
534 int sregDemandPerWI = task->numScalarRegs();
535
536 // check if the total number of VGPRs and SGPRs required by all WFs
537 // of the WG fit in the VRFs of all SIMD units and the CU's SRF
538 panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
539 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
540 "that has %d VGPRs\n",
541 numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
542 panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
543 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
544 "with %d SGPRs\n",
545 numWfs, sregDemandPerWI, numScalarRegsPerSimd);
546
547 // number of WF slots that are not occupied
548 int freeWfSlots = 0;
549 // number of Wfs from WG that were successfully mapped to a SIMD
550 int numMappedWfs = 0;
551 numWfsToSched.clear();
552 numWfsToSched.resize(numVectorALUs, 0);
553
554 // attempt to map WFs to the SIMDs, based on WF slot availability
555 // and register file availability
556 for (int j = 0; j < shader->n_wf; ++j) {
557 for (int i = 0; i < numVectorALUs; ++i) {
558 if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
559 ++freeWfSlots;
560 // check if current WF will fit onto current SIMD/VRF
561 // if all WFs have not yet been mapped to the SIMDs
562 if (numMappedWfs < numWfs &&
563 registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
564 sregDemandPerWI) &&
565 registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
566 vregDemandPerWI)) {
567 numWfsToSched[i]++;
568 numMappedWfs++;
569 }
570 }
571 }
572 }
573
574 // check that the number of mapped WFs is not greater
575 // than the actual number of WFs
576 assert(numMappedWfs <= numWfs);
577
578 bool vregAvail = true;
579 bool sregAvail = true;
580 // if a WF to SIMD mapping was not found, find the limiting resource
581 if (numMappedWfs < numWfs) {
582
583 for (int j = 0; j < numVectorALUs; ++j) {
584 // find if there are enough free VGPRs in the SIMD's VRF
585 // to accommodate the WFs of the new WG that would be mapped
586 // to this SIMD unit
587 vregAvail &= registerManager->
588 canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
589 // find if there are enough free SGPRs in the SIMD's SRF
590 // to accommodate the WFs of the new WG that would be mapped
591 // to this SIMD unit
592 sregAvail &= registerManager->
593 canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
594 }
595 }
596
597 DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, "
598 "VGPR Availability = %d, SGPR Availability = %d\n",
599 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
600
601 if (!vregAvail) {
602 ++numTimesWgBlockedDueVgprAlloc;
603 }
604
605 if (!sregAvail) {
606 ++numTimesWgBlockedDueSgprAlloc;
607 }
608
609 // Return true if enough WF slots to submit workgroup and if there are
610 // enough VGPRs to schedule all WFs to their SIMD units
611 bool ldsAvail = lds.canReserve(task->ldsSize());
612 if (!ldsAvail) {
613 wgBlockedDueLdsAllocation++;
614 }
615
616 if (!barrier_avail) {
617 wgBlockedDueBarrierAllocation++;
618 }
619
620 // Return true if the following are all true:
621 // (a) all WFs of the WG were mapped to free WF slots
622 // (b) there are enough VGPRs to schedule all WFs to their SIMD units
623 // (c) there are enough SGPRs on the CU to schedule all WFs
624 // (d) there is enough space in LDS to allocate for all WFs
625 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
626 && ldsAvail && barrier_avail;
627 return can_dispatch;
628 }
629
630 int
631 ComputeUnit::numYetToReachBarrier(int bar_id)
632 {
633 auto &wf_barrier = barrierSlot(bar_id);
634 return wf_barrier.numYetToReachBarrier();
635 }
636
637 bool
638 ComputeUnit::allAtBarrier(int bar_id)
639 {
640 auto &wf_barrier = barrierSlot(bar_id);
641 return wf_barrier.allAtBarrier();
642 }
643
644 void
645 ComputeUnit::incNumAtBarrier(int bar_id)
646 {
647 auto &wf_barrier = barrierSlot(bar_id);
648 wf_barrier.incNumAtBarrier();
649 }
650
651 int
652 ComputeUnit::numAtBarrier(int bar_id)
653 {
654 auto &wf_barrier = barrierSlot(bar_id);
655 return wf_barrier.numAtBarrier();
656 }
657
658 int
659 ComputeUnit::maxBarrierCnt(int bar_id)
660 {
661 auto &wf_barrier = barrierSlot(bar_id);
662 return wf_barrier.maxBarrierCnt();
663 }
664
665 void
666 ComputeUnit::resetBarrier(int bar_id)
667 {
668 auto &wf_barrier = barrierSlot(bar_id);
669 wf_barrier.reset();
670 }
671
672 void
673 ComputeUnit::decMaxBarrierCnt(int bar_id)
674 {
675 auto &wf_barrier = barrierSlot(bar_id);
676 wf_barrier.decMaxBarrierCnt();
677 }
678
679 void
680 ComputeUnit::releaseBarrier(int bar_id)
681 {
682 auto &wf_barrier = barrierSlot(bar_id);
683 wf_barrier.release();
684 freeBarrierIds.insert(bar_id);
685 }
686
687 void
688 ComputeUnit::releaseWFsFromBarrier(int bar_id)
689 {
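// wake every WF on this CU that is waiting at this barrier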
690 for (int i = 0; i < numVectorALUs; ++i) {
691 for (int j = 0; j < shader->n_wf; ++j) {
692 Wavefront *wf = wfList[i][j];
693 if (wf->barrierId() == bar_id) {
694 assert(wf->getStatus() == Wavefront::S_BARRIER);
695 wf->setStatus(Wavefront::S_RUNNING);
696 }
697 }
698 }
699 }
700
701 // Execute one clock worth of work on the ComputeUnit.
702 void
703 ComputeUnit::exec()
704 {
705 // process reads and writes in the RFs
706 for (auto &vecRegFile : vrf) {
707 vecRegFile->exec();
708 }
709
710 for (auto &scRegFile : srf) {
711 scRegFile->exec();
712 }
713
714 // Execute pipeline stages in reverse order to simulate
715 // the pipeline latency
716 scalarMemoryPipe.exec();
717 globalMemoryPipe.exec();
718 localMemoryPipe.exec();
719 execStage.exec();
720 scheduleStage.exec();
721 scoreboardCheckStage.exec();
722 fetchStage.exec();
723
724 totalCycles++;
725
726 // Put this CU to sleep if there is no more work to be done.
727 if (!isDone()) {
728 schedule(tickEvent, nextCycle());
729 } else {
730 shader->notifyCuSleep();
731 DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
732 }
733 }
734
735 void
736 ComputeUnit::init()
737 {
738 // Initialize CU Bus models and execution resources
739
740 // Vector ALUs
741 vectorALUs.clear();
742 for (int i = 0; i < numVectorALUs; i++) {
743 vectorALUs.emplace_back(this, clockPeriod());
744 }
745
746 // Scalar ALUs
747 scalarALUs.clear();
748 for (int i = 0; i < numScalarALUs; i++) {
749 scalarALUs.emplace_back(this, clockPeriod());
750 }
751
752 // Vector Global Memory
753 fatal_if(numVectorGlobalMemUnits > 1,
754 "No support for multiple Global Memory Pipelines exists!!!");
755 vectorGlobalMemUnit.init(this, clockPeriod());
756 vrfToGlobalMemPipeBus.init(this, clockPeriod());
757 glbMemToVrfBus.init(this, clockPeriod());
758
759 // Vector Local/Shared Memory
760 fatal_if(numVectorSharedMemUnits > 1,
761 "No support for multiple Local Memory Pipelines exists!!!");
762 vectorSharedMemUnit.init(this, clockPeriod());
763 vrfToLocalMemPipeBus.init(this, clockPeriod());
764 locMemToVrfBus.init(this, clockPeriod());
765
766 // Scalar Memory
767 fatal_if(numScalarMemUnits > 1,
768 "No support for multiple Scalar Memory Pipelines exists!!!");
769 scalarMemUnit.init(this, clockPeriod());
770 srfToScalarMemPipeBus.init(this, clockPeriod());
771 scalarMemToSrfBus.init(this, clockPeriod());
772
773 vectorRegsReserved.resize(numVectorALUs, 0);
774 scalarRegsReserved.resize(numVectorALUs, 0);
775
776 fetchStage.init();
777 scheduleStage.init();
778 execStage.init();
779 globalMemoryPipe.init();
780
781 gmTokenPort.setTokenManager(memPortTokens);
782 }
783
784 bool
785 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
786 {
787 // Ruby has completed the memory op. Schedule the mem_resp_event at the
788 // appropriate cycle to process the timing memory response
789 // This delay represents the pipeline delay
790 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
791 PortID index = sender_state->port_index;
792 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
793 GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
794
795 // MemSyncResp + WriteAckResp are handled completely here and we don't
796 // schedule a MemRespEvent to process the responses further
797 if (pkt->cmd == MemCmd::MemSyncResp) {
798 // This response is for 1 of the following request types:
799 // - kernel launch
800 // - kernel end
801 // - non-kernel mem sync
802
803 // Kernel Launch
804 // wavefront was nullptr when launching kernel, so it is meaningless
805 // here (simdId=-1, wfSlotId=-1)
806 if (gpuDynInst->isKernelLaunch()) {
807 // for kernel launch, the original request must be both kernel-type
808 // and acquire
809 assert(pkt->req->isKernel());
810 assert(pkt->req->isAcquire());
811
812 // one D-Cache inv is done, decrement counter
813 dispatcher.updateInvCounter(gpuDynInst->kern_id);
814
815 delete pkt->senderState;
816 delete pkt;
817 return true;
818 }
819
820 // retrieve wavefront from inst
821 Wavefront *w = gpuDynInst->wavefront();
822
823 // Check if we are waiting on Kernel End Release
824 if (w->getStatus() == Wavefront::S_RETURNING
825 && gpuDynInst->isEndOfKernel()) {
826 // for kernel end, the original request must be both kernel-type
827 // and release
828 assert(pkt->req->isKernel());
829 assert(pkt->req->isRelease());
830
831 // one wb done, decrement counter, and return whether all wbs are
832 // done for the kernel
833 bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
834
835 // not all wbs are done for the kernel, just release pkt
836 // resources
837 if (!isWbDone) {
838 delete pkt->senderState;
839 delete pkt;
840 return true;
841 }
842
843 // all wbs are completed for the kernel, do retirement work
844 // for the workgroup
845 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
846 computeUnit->cu_id, w->simdId, w->wfSlotId,
847 w->wfDynId, w->wgId);
848
849 dispatcher.notifyWgCompl(w);
850 w->setStatus(Wavefront::S_STOPPED);
851 }
852
853 if (!pkt->req->isKernel()) {
854 w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
855 DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
856 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
857 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
858 gpuDynInst->disassemble(), w->outstandingReqs,
859 w->outstandingReqs - 1);
860 computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
861 }
862
863 delete pkt->senderState;
864 delete pkt;
865 return true;
866 } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
867 // this is for the writeComplete callback
868 // we simply decrement the write-related wait counters
869 assert(gpuDynInst);
870 Wavefront *w M5_VAR_USED =
871 computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
872 assert(w);
873 DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
874 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
875 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
876 gpuDynInst->disassemble(), w->outstandingReqs,
877 w->outstandingReqs - 1);
878 if (gpuDynInst->allLanesZero()) {
879 // ask the GM pipe to decrement the request counters, rather than
880 // doing it here, to avoid updating counters asynchronously with
881 // instruction retirement (which may hurt waitcnt behavior)
882 computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
883
884 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
885 computeUnit->cu_id, gpuDynInst->simdId,
886 gpuDynInst->wfSlotId);
887 }
888
889 delete pkt->senderState;
890 delete pkt;
891
892 return true;
893 }
894
895 EventFunctionWrapper *mem_resp_event =
896 computeUnit->memPort[index].createMemRespEvent(pkt);
897
898 DPRINTF(GPUPort,
899 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
900 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
901 gpuDynInst->seqNum(), index, pkt->req->getPaddr());
902
903 computeUnit->schedule(mem_resp_event,
904 curTick() + computeUnit->resp_tick_latency);
905
906 return true;
907 }
908
909 bool
910 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
911 {
912 assert(!pkt->req->isKernel());
913
914 // retrieve sender state
915 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
916 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
917
918 assert(pkt->isRead() || pkt->isWrite());
919 assert(gpuDynInst->numScalarReqs > 0);
920
921 gpuDynInst->numScalarReqs--;
922
923 /**
924  * For each returned scalar request we decrement the
925  * numScalarReqs counter that is associated with this
926  * gpuDynInst, which should have been set to correspond
927  * to the number of packets sent for the memory op.
928  * Once all packets return, the memory op is finished
929 * and we can push it into the response queue.
930 */
931 if (!gpuDynInst->numScalarReqs) {
932 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
933 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
934 gpuDynInst);
935 } else {
936 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
937 gpuDynInst);
938 }
939 }
940
941 delete pkt->senderState;
942 delete pkt;
943
944 return true;
945 }
946
947 void
948 ComputeUnit::ScalarDataPort::recvReqRetry()
949 {
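// retry queued scalar requests in order; pop a request only after it
// has been accepted, and stop at the first rejection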
950 while (!retries.empty()) {
951 PacketPtr pkt = retries.front();
952 if (!sendTimingReq(pkt)) {
953 break;
954 }
955 retries.pop_front();
956 }
957 }
958
959 void
960 ComputeUnit::DataPort::recvReqRetry()
961 {
962 int len = retries.size();
963
964 assert(len > 0);
965
966 for (int i = 0; i < len; ++i) {
967 PacketPtr pkt = retries.front().first;
968 GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
969 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
970 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
971 pkt->req->getPaddr());
972
973 /** Currently Ruby can return false due to conflicts for the particular
974 * cache block or address. Thus other requests should be allowed to
975 * pass and the data port should expect multiple retries. */
976 if (!sendTimingReq(pkt)) {
977 DPRINTF(GPUMem, "failed again!\n");
978 break;
979 } else {
980 DPRINTF(GPUMem, "successful!\n");
981 retries.pop_front();
982 }
983 }
984 }
985
986 bool
987 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
988 {
989 computeUnit->fetchStage.processFetchReturn(pkt);
990 return true;
991 }
992
993 void
994 ComputeUnit::SQCPort::recvReqRetry()
995 {
996 int len = retries.size();
997
998 assert(len > 0);
999
1000 for (int i = 0; i < len; ++i) {
1001 PacketPtr pkt = retries.front().first;
1002 Wavefront *wavefront M5_VAR_USED = retries.front().second;
1003 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1004 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1005 pkt->req->getPaddr());
1006 if (!sendTimingReq(pkt)) {
1007 DPRINTF(GPUFetch, "failed again!\n");
1008 break;
1009 } else {
1010 DPRINTF(GPUFetch, "successful!\n");
1011 retries.pop_front();
1012 }
1013 }
1014 }
1015
1016 void
1017 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
1018 {
1019 // There must be a way around this check to do the globalMemStart...
1020 Addr tmp_vaddr = pkt->req->getVaddr();
1021
1022 updatePageDivergenceDist(tmp_vaddr);
1023
1024 // set PC in request
1025 pkt->req->setPC(gpuDynInst->wavefront()->pc());
1026
1027 pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1028
1029 // figure out the type of the request to set read/write
1030 BaseTLB::Mode TLB_mode;
1031 assert(pkt->isRead() || pkt->isWrite());
1032
1033 // only do some things if actually accessing data
1034 bool isDataAccess = pkt->isWrite() || pkt->isRead();
1035
1036 // Check write before read for atomic operations
1037 // since atomic operations should use BaseTLB::Write
1038 if (pkt->isWrite()) {
1039 TLB_mode = BaseTLB::Write;
1040 } else if (pkt->isRead()) {
1041 TLB_mode = BaseTLB::Read;
1042 } else {
1043 fatal("pkt is not a read nor a write\n");
1044 }
1045
1046 tlbCycles -= curTick();
1047 ++tlbRequests;
1048
1049 PortID tlbPort_index = perLaneTLB ? index : 0;
1050
1051 if (shader->timingSim) {
1052 if (debugSegFault) {
1053 Process *p = shader->gpuTc->getProcessPtr();
1054 Addr vaddr = pkt->req->getVaddr();
1055 unsigned size = pkt->getSize();
1056
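// detect an access that straddles a 64-byte boundary: if the offset
// of the last byte within its 64B block is smaller than that of the
// first byte, the access wrapped past a block boundary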
1057 if ((vaddr + size - 1) % 64 < vaddr % 64) {
1058 panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1059 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1060 }
1061
1062 Addr paddr;
1063
1064 if (!p->pTable->translate(vaddr, paddr)) {
1065 if (!p->fixupFault(vaddr)) {
1066 panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1067 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1068 vaddr);
1069 }
1070 }
1071 }
1072
1073 // This is the SenderState needed upon return
1074 pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1075
1076 // This is the senderState needed by the TLB hierarchy to function
1077 TheISA::GpuTLB::TranslationState *translation_state =
1078 new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
1079 pkt->senderState);
1080
1081 pkt->senderState = translation_state;
1082
1083 if (functionalTLB) {
1084 tlbPort[tlbPort_index].sendFunctional(pkt);
1085
1086 // update the hitLevel distribution
1087 int hit_level = translation_state->hitLevel;
1088 assert(hit_level != -1);
1089 hitsPerTLBLevel[hit_level]++;
1090
1091 // New SenderState for the memory access
1092 X86ISA::GpuTLB::TranslationState *sender_state =
1093 safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1094
1095 delete sender_state->tlbEntry;
1096 delete sender_state->saved;
1097 delete sender_state;
1098
1099 assert(pkt->req->hasPaddr());
1100 assert(pkt->req->hasSize());
1101
1102 // This is necessary because the GPU TLB receives packets instead
1103 // of requests. When the translation is complete, all relevant
1104 // fields in the request will be populated, but not in the packet.
1105 // Here we create the new packet so we can set the size, addr,
1106 // and proper flags.
1107 PacketPtr oldPkt = pkt;
1108 pkt = new Packet(oldPkt->req, oldPkt->cmd);
1109 if (isDataAccess) {
1110 uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1111 pkt->dataStatic(tmpData);
1112 }
1113 delete oldPkt;
1114
1115
1116 // New SenderState for the memory access
1117 pkt->senderState =
1118 new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1119 nullptr);
1120
1121 gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1122 gpuDynInst->tlbHitLevel[index] = hit_level;
1123
1124 // translation is done. Schedule the mem_req_event at the
1125 // appropriate cycle to send the timing memory request to ruby
1126 EventFunctionWrapper *mem_req_event =
1127 memPort[index].createMemReqEvent(pkt);
1128
1129 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1130 "scheduled\n", cu_id, gpuDynInst->simdId,
1131 gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1132
1133 schedule(mem_req_event, curTick() + req_tick_latency);
1134 } else if (tlbPort[tlbPort_index].isStalled()) {
1135 assert(tlbPort[tlbPort_index].retries.size() > 0);
1136
1137 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1138 "failed!\n", cu_id, gpuDynInst->simdId,
1139 gpuDynInst->wfSlotId, tmp_vaddr);
1140
1141 tlbPort[tlbPort_index].retries.push_back(pkt);
1142 } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1143 // Stall the data port;
1144 // no more packets will be issued until
1145 // Ruby indicates that resources are freed by
1146 // a recvReqRetry() callback on this port.
1147 tlbPort[tlbPort_index].stallPort();
1148
1149 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1150 "failed!\n", cu_id, gpuDynInst->simdId,
1151 gpuDynInst->wfSlotId, tmp_vaddr);
1152
1153 tlbPort[tlbPort_index].retries.push_back(pkt);
1154 } else {
1155 DPRINTF(GPUTLB,
1156 "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1157 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1158 }
1159 } else {
1160 if (pkt->cmd == MemCmd::MemSyncReq) {
1161 gpuDynInst->resetEntireStatusVector();
1162 } else {
1163 gpuDynInst->decrementStatusVector(index);
1164 }
1165
1166 // New SenderState for the memory access
1167 delete pkt->senderState;
1168
1169 // Because it's atomic operation, only need TLB translation state
1170 pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
1171 shader->gpuTc);
1172
1173 tlbPort[tlbPort_index].sendFunctional(pkt);
1174
1175 // The addr of the packet is not modified, so we need to create a
1176 // new packet; otherwise the memory access would be sent with the
1177 // old virtual address used for translation instead of the physical
1178 // address returned by the translation.
1179 PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1180 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1181
1182 // Translation is done. It is safe to send the packet to memory.
1183 memPort[0].sendFunctional(new_pkt);
1184
1185 DPRINTF(GPUMem, "Functional sendRequest\n");
1186 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1187 gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1188 new_pkt->req->getPaddr());
1189
1190 // safe_cast the senderState
1191 TheISA::GpuTLB::TranslationState *sender_state =
1192 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1193
1194 delete sender_state->tlbEntry;
1195 delete new_pkt;
1196 delete pkt->senderState;
1197 delete pkt;
1198 }
1199 }
1200
1201 void
1202 ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
1203 {
1204 assert(pkt->isWrite() || pkt->isRead());
1205
1206 BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write;
1207
1208 pkt->senderState =
1209 new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
1210
1211 pkt->senderState =
1212 new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
1213 pkt->senderState);
1214
1215 if (scalarDTLBPort.isStalled()) {
1216 assert(scalarDTLBPort.retries.size());
1217 scalarDTLBPort.retries.push_back(pkt);
1218 } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1219 scalarDTLBPort.stallPort();
1220 scalarDTLBPort.retries.push_back(pkt);
1221 } else {
1222 DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1223 tlb_mode == BaseTLB::Read ? "read" : "write",
1224 pkt->req->getVaddr());
1225 }
1226 }
1227
1228 void
1229 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
1230 bool kernelMemSync,
1231 RequestPtr req)
1232 {
1233 assert(gpuDynInst->isGlobalSeg() ||
1234 gpuDynInst->executedAs() == Enums::SC_GLOBAL);
1235
1236 if (!req) {
1237 req = std::make_shared<Request>(
1238 0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
1239 }
1240
1241 // all mem sync requests have Paddr == 0
1242 req->setPaddr(0);
1243
1244 PacketPtr pkt = nullptr;
1245
1246 if (kernelMemSync) {
1247 if (gpuDynInst->isKernelLaunch()) {
1248 req->setCacheCoherenceFlags(Request::ACQUIRE);
1249 req->setReqInstSeqNum(gpuDynInst->seqNum());
1250 req->setFlags(Request::KERNEL);
1251 pkt = new Packet(req, MemCmd::MemSyncReq);
1252 pkt->pushSenderState(
1253 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1254
1255 EventFunctionWrapper *mem_req_event =
1256 memPort[0].createMemReqEvent(pkt);
1257
1258 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1259 "an acquire\n", cu_id, gpuDynInst->simdId,
1260 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1261
1262 schedule(mem_req_event, curTick() + req_tick_latency);
1263 } else {
1264 // kernel end release must be enabled
1265 assert(shader->impl_kern_end_rel);
1266 assert(gpuDynInst->isEndOfKernel());
1267
1268 req->setCacheCoherenceFlags(Request::WB_L2);
1269 req->setReqInstSeqNum(gpuDynInst->seqNum());
1270 req->setFlags(Request::KERNEL);
1271 pkt = new Packet(req, MemCmd::MemSyncReq);
1272 pkt->pushSenderState(
1273 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1274
1275 EventFunctionWrapper *mem_req_event =
1276 memPort[0].createMemReqEvent(pkt);
1277
1278 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1279 "a release\n", cu_id, gpuDynInst->simdId,
1280 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1281
1282 schedule(mem_req_event, curTick() + req_tick_latency);
1283 }
1284 } else {
1285 gpuDynInst->setRequestFlags(req);
1286
1287 req->setReqInstSeqNum(gpuDynInst->seqNum());
1288
1289 pkt = new Packet(req, MemCmd::MemSyncReq);
1290 pkt->pushSenderState(
1291 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1292
1293 EventFunctionWrapper *mem_req_event =
1294 memPort[0].createMemReqEvent(pkt);
1295
1296 DPRINTF(GPUPort,
1297 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1298 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1299 pkt->req->getPaddr());
1300
1301 schedule(mem_req_event, curTick() + req_tick_latency);
1302 }
1303 }
1304
1305 void
1306 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
1307 {
1308 DataPort::SenderState *sender_state =
1309 safe_cast<DataPort::SenderState*>(pkt->senderState);
1310
1311 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1312 ComputeUnit *compute_unit = computeUnit;
1313
1314 assert(gpuDynInst);
1315
1316 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1317 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1318 pkt->req->getPaddr(), id);
1319
1320 Addr paddr = pkt->req->getPaddr();
1321
1322 // mem sync resp and write-complete callback must be handled already in
1323 // DataPort::recvTimingResp
1324 assert(pkt->cmd != MemCmd::MemSyncResp);
1325 assert(pkt->cmd != MemCmd::WriteCompleteResp);
1326
1327 // this is for read, write and atomic
1328 int index = gpuDynInst->memStatusVector[paddr].back();
1329
1330 DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1331 pkt->req->getPaddr(), id);
1332
1333 gpuDynInst->memStatusVector[paddr].pop_back();
1334 gpuDynInst->pAddr = pkt->req->getPaddr();
1335
1336 gpuDynInst->decrementStatusVector(index);
1337 DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1338
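// once every lane's counter in the status vector has reached zero, all
// outstanding memory accesses for this instruction have returned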
1339 if (gpuDynInst->allLanesZero()) {
1340 auto iter = gpuDynInst->memStatusVector.begin();
1341 auto end = gpuDynInst->memStatusVector.end();
1342
1343 while (iter != end) {
1344 assert(iter->second.empty());
1345 ++iter;
1346 }
1347
1348 // Calculate the difference between the arrival of the first cache
1349 // block and the last cache block to arrive if we have the time
1350 // for the first cache block.
1351 if (compute_unit->headTailMap.count(gpuDynInst)) {
1352 Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1353 compute_unit->headTailLatency.sample(curTick() - headTick);
1354 compute_unit->headTailMap.erase(gpuDynInst);
1355 }
1356
1357 gpuDynInst->memStatusVector.clear();
1358
1359 // note: only handle read response here; for write, the response
1360 // is separately handled when writeComplete callback is received
1361 if (pkt->isRead()) {
1362 gpuDynInst->
1363 profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1364 compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1365
1366 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1367 compute_unit->cu_id, gpuDynInst->simdId,
1368 gpuDynInst->wfSlotId);
1369 }
1370 } else {
1371 if (pkt->isRead()) {
1372 if (!compute_unit->headTailMap.count(gpuDynInst)) {
1373 compute_unit->headTailMap
1374 .insert(std::make_pair(gpuDynInst, curTick()));
1375 }
1376 }
1377 }
1378
1379 delete pkt->senderState;
1380 delete pkt;
1381 }
1382
1383 ComputeUnit*
1384 ComputeUnitParams::create()
1385 {
1386 return new ComputeUnit(this);
1387 }
1388
1389 bool
1390 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1391 {
1392 Addr line = pkt->req->getPaddr();
1393
1394 DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1395 pkt->req->getVaddr(), line);
1396
1397 assert(pkt->senderState);
1398 computeUnit->tlbCycles += curTick();
1399
1400 // pop off the TLB translation state
1401 TheISA::GpuTLB::TranslationState *translation_state =
1402 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1403
1404 // no PageFaults are permitted for data accesses
1405 if (!translation_state->tlbEntry) {
1406 DTLBPort::SenderState *sender_state =
1407 safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1408
1409 Wavefront *w M5_VAR_USED =
1410 computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1411 [sender_state->_gpuDynInst->wfSlotId];
1412
1413 DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1414 pkt->req->getVaddr());
1415 }
1416
1417 // update the hitLevel distribution
1418 int hit_level = translation_state->hitLevel;
1419 computeUnit->hitsPerTLBLevel[hit_level]++;
1420
1421 delete translation_state->tlbEntry;
1422 assert(!translation_state->ports.size());
1423 pkt->senderState = translation_state->saved;
1424
1425 // for prefetch pkt
1426 BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1427
1428 delete translation_state;
1429
1430 // use the original sender state to know how to close this transaction
1431 DTLBPort::SenderState *sender_state =
1432 safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1433
1434 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1435 PortID mp_index = sender_state->portIndex;
1436 Addr vaddr = pkt->req->getVaddr();
1437 gpuDynInst->memStatusVector[line].push_back(mp_index);
1438 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1439
1440 MemCmd requestCmd;
1441
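// convert the TLB response command back into the matching memory
// request command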
1442 if (pkt->cmd == MemCmd::ReadResp) {
1443 requestCmd = MemCmd::ReadReq;
1444 } else if (pkt->cmd == MemCmd::WriteResp) {
1445 requestCmd = MemCmd::WriteReq;
1446 } else if (pkt->cmd == MemCmd::SwapResp) {
1447 requestCmd = MemCmd::SwapReq;
1448 } else {
1449 panic("unsupported response to request conversion %s\n",
1450 pkt->cmd.toString());
1451 }
1452
1453 if (computeUnit->prefetchDepth) {
1454 int simdId = gpuDynInst->simdId;
1455 int wfSlotId = gpuDynInst->wfSlotId;
1456 Addr last = 0;
1457
1458 switch(computeUnit->prefetchType) {
1459 case Enums::PF_CU:
1460 last = computeUnit->lastVaddrCU[mp_index];
1461 break;
1462 case Enums::PF_PHASE:
1463 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1464 break;
1465 case Enums::PF_WF:
1466 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1467 default:
1468 break;
1469 }
1470
1471 DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1472 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1473
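// stride, in pages, between the current vaddr and the last one recorded
// for the selected prefetch granularity; zero if no prior address has
// been seen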
1474 int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
1475 roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
1476 : 0;
1477
1478 DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1479
1480 computeUnit->lastVaddrCU[mp_index] = vaddr;
1481 computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1482 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1483
1484 stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1485 computeUnit->prefetchStride: stride;
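// for PF_STRIDE the configured prefetchStride overrides the stride
// observed above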
1486
1487 DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1488 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1489
1490 DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1491
1492 // Prefetch Next few pages atomically
1493 for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1494 DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1495 vaddr+stride*pf*TheISA::PageBytes);
1496
1497 if (!stride)
1498 break;
1499
1500 RequestPtr prefetch_req = std::make_shared<Request>(
1501 vaddr + stride * pf * TheISA::PageBytes,
1502 sizeof(uint8_t), 0,
1503 computeUnit->requestorId(),
1504 0, 0, nullptr);
1505
1506 PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1507 uint8_t foo = 0;
1508 prefetch_pkt->dataStatic(&foo);
1509
1510 // Because it's atomic operation, only need TLB translation state
1511 prefetch_pkt->senderState =
1512 new TheISA::GpuTLB::TranslationState(TLB_mode,
1513 computeUnit->shader->gpuTc, true);
1514
1515 // Currently prefetches are zero-latency, hence the sendFunctional
1516 sendFunctional(prefetch_pkt);
1517
1518 /* safe_cast the senderState */
1519 TheISA::GpuTLB::TranslationState *tlb_state =
1520 safe_cast<TheISA::GpuTLB::TranslationState*>(
1521 prefetch_pkt->senderState);
1522
1523
1524 delete tlb_state->tlbEntry;
1525 delete tlb_state;
1526 delete prefetch_pkt;
1527 }
1528 }
1529
1530 // First we must convert the response cmd back to a request cmd so that
1531 // the request can be sent through the cu's request port
1532 PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1533 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1534 delete pkt->senderState;
1535 delete pkt;
1536
1537 // New SenderState for the memory access
1538 new_pkt->senderState =
1539 new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1540 nullptr);
1541
1542 // translation is done. Schedule the mem_req_event at the appropriate
1543 // cycle to send the timing memory request to ruby
1544 EventFunctionWrapper *mem_req_event =
1545 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1546
1547 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1548 computeUnit->cu_id, gpuDynInst->simdId,
1549 gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1550
1551 computeUnit->schedule(mem_req_event, curTick() +
1552 computeUnit->req_tick_latency);
1553
1554 return true;
1555 }
1556
1557 EventFunctionWrapper*
1558 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1559 {
1560 return new EventFunctionWrapper(
1561 [this, pkt]{ processMemReqEvent(pkt); },
1562 "ComputeUnit memory request event", true);
1563 }
1564
1565 EventFunctionWrapper*
1566 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1567 {
1568 return new EventFunctionWrapper(
1569 [this, pkt]{ processMemRespEvent(pkt); },
1570 "ComputeUnit memory response event", true);
1571 }
1572
1573 void
1574 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1575 {
1576 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1577 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1578 ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
1579
1580 if (!(sendTimingReq(pkt))) {
1581 retries.push_back(std::make_pair(pkt, gpuDynInst));
1582
1583 DPRINTF(GPUPort,
1584 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1585 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1586 id, pkt->req->getPaddr());
1587 } else {
1588 DPRINTF(GPUPort,
1589 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1590 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1591 gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1592 pkt->req->getPaddr());
1593 }
1594 }
1595
1596 const char*
1597 ComputeUnit::ScalarDataPort::MemReqEvent::description() const
1598 {
1599 return "ComputeUnit scalar memory request event";
1600 }
1601
1602 void
1603 ComputeUnit::ScalarDataPort::MemReqEvent::process()
1604 {
1605 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1606 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1607 ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort.computeUnit;
1608
1609 if (!(scalarDataPort.sendTimingReq(pkt))) {
1610 scalarDataPort.retries.push_back(pkt);
1611
1612 DPRINTF(GPUPort,
1613 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1614 compute_unit->cu_id, gpuDynInst->simdId,
1615 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1616 } else {
1617 DPRINTF(GPUPort,
1618 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1619 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1620 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1621 pkt->req->getPaddr());
1622 }
1623 }
1624
1625 /*
1626  * The initial translation request could have been rejected, if
1627  * the <retries> queue is not empty. Retry sending the translation
1628  * request. sendRetry() is called from the peer port whenever
1629 * a translation completes.
1630 */
1631 void
1632 ComputeUnit::DTLBPort::recvReqRetry()
1633 {
1634 int len = retries.size();
1635
1636 DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1637 computeUnit->cu_id, len);
1638
1639 assert(len > 0);
1640 assert(isStalled());
1641 // recvReqRetry is an indication that the resource on which this
1642 // port was stalling on is freed. So, remove the stall first
1643 unstallPort();
1644
1645 for (int i = 0; i < len; ++i) {
1646 PacketPtr pkt = retries.front();
1647 Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1648 DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1649
1650 if (!sendTimingReq(pkt)) {
1651 // Stall port
1652 stallPort();
1653 DPRINTF(GPUTLB, ": failed again\n");
1654 break;
1655 } else {
1656 DPRINTF(GPUTLB, ": successful\n");
1657 retries.pop_front();
1658 }
1659 }
1660 }
1661
1662 bool
1663 ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1664 {
1665 assert(pkt->senderState);
1666
1667 TheISA::GpuTLB::TranslationState *translation_state =
1668 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1669
1670 // Page faults are not allowed
1671 fatal_if(!translation_state->tlbEntry,
1672 "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1673
1674 delete translation_state->tlbEntry;
1675 assert(!translation_state->ports.size());
1676
1677 pkt->senderState = translation_state->saved;
1678 delete translation_state;
1679
1680 ScalarDTLBPort::SenderState *sender_state =
1681 safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1682
1683 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1684 delete pkt->senderState;
1685
1686 Wavefront *w M5_VAR_USED = gpuDynInst->wavefront();
1687
1688 DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1689 "translation: VA %#x -> PA %#x\n", computeUnit->cu_id, w->simdId,
1690 w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1691
1692 MemCmd mem_cmd;
1693
1694 if (pkt->cmd == MemCmd::ReadResp) {
1695 mem_cmd = MemCmd::ReadReq;
1696 } else if (pkt->cmd == MemCmd::WriteResp) {
1697 mem_cmd = MemCmd::WriteReq;
1698 } else {
1699 fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1700 pkt->cmd.toString());
1701 }
1702
1703 PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1704 req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1705 delete pkt;
1706
1707 req_pkt->senderState =
1708 new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1709
1710 if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
1711 computeUnit->scalarDataPort.retries.push_back(req_pkt);
1712 DPRINTF(GPUMem, "send scalar req failed for: %s\n",
1713 gpuDynInst->disassemble());
1714 } else {
1715 DPRINTF(GPUMem, "send scalar req for: %s\n",
1716 gpuDynInst->disassemble());
1717 }
1718
1719 return true;
1720 }
1721
1722 bool
1723 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1724 {
1725 Addr line M5_VAR_USED = pkt->req->getPaddr();
1726 DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1727 computeUnit->cu_id, pkt->req->getVaddr(), line);
1728
1729 assert(pkt->senderState);
1730
1731 // pop off the TLB translation state
1732 TheISA::GpuTLB::TranslationState *translation_state
1733 = safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1734
1735 bool success = translation_state->tlbEntry != nullptr;
1736 delete translation_state->tlbEntry;
1737 assert(!translation_state->ports.size());
1738 pkt->senderState = translation_state->saved;
1739 delete translation_state;
1740
1741 // use the original sender state to know how to close this transaction
1742 ITLBPort::SenderState *sender_state =
1743 safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1744
1745 // get the wavefront associated with this translation request
1746 Wavefront *wavefront = sender_state->wavefront;
1747 delete pkt->senderState;
1748
1749 if (success) {
1750 // pkt is reused in fetch(), don't delete it here. However, we must
1751 // reset the command to be a request so that it can be sent through
1752 // the cu's request port
1753 assert(pkt->cmd == MemCmd::ReadResp);
1754 pkt->cmd = MemCmd::ReadReq;
1755
1756 computeUnit->fetchStage.fetch(pkt, wavefront);
1757 } else {
1758 if (wavefront->dropFetch) {
1759 assert(wavefront->instructionBuffer.empty());
1760 wavefront->dropFetch = false;
1761 }
1762
1763 wavefront->pendingFetch = 0;
1764 }
1765
1766 return true;
1767 }
1768
1769 /*
1770 * The initial translation request could have been rejected, if
1771 * <retries> queue is not empty. Retry sending the translation
1772 * request. sendRetry() is called from the peer port whenever
1773 * a translation completes.
1774 */
1775 void
1776 ComputeUnit::ITLBPort::recvReqRetry()
1777 {
1778 int len = retries.size();
1779 DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
1780 computeUnit->cu_id, len);
1781
1782 assert(len > 0);
1783 assert(isStalled());
1784
1785 // recvReqRetry is an indication that the resource on which this
1786 // port was stalling on is freed. So, remove the stall first
1787 unstallPort();
1788
1789 for (int i = 0; i < len; ++i) {
1790 PacketPtr pkt = retries.front();
1791 Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1792 DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1793
1794 if (!sendTimingReq(pkt)) {
1795 stallPort(); // Stall port
1796 DPRINTF(GPUTLB, ": failed again\n");
1797 break;
1798 } else {
1799 DPRINTF(GPUTLB, ": successful\n");
1800 retries.pop_front();
1801 }
1802 }
1803 }
1804
1805 void
1806 ComputeUnit::regStats()
1807 {
1808 ClockedObject::regStats();
1809
1810 vALUInsts
1811 .name(name() + ".valu_insts")
1812 .desc("Number of vector ALU insts issued.")
1813 ;
1814 vALUInstsPerWF
1815 .name(name() + ".valu_insts_per_wf")
1816 .desc("The avg. number of vector ALU insts issued per-wavefront.")
1817 ;
1818 sALUInsts
1819 .name(name() + ".salu_insts")
1820 .desc("Number of scalar ALU insts issued.")
1821 ;
1822 sALUInstsPerWF
1823 .name(name() + ".salu_insts_per_wf")
1824 .desc("The avg. number of scalar ALU insts issued per-wavefront.")
1825 ;
1826 instCyclesVALU
1827 .name(name() + ".inst_cycles_valu")
1828 .desc("Number of cycles needed to execute VALU insts.")
1829 ;
1830 instCyclesSALU
1831 .name(name() + ".inst_cycles_salu")
1832 .desc("Number of cycles needed to execute SALU insts.")
1833 ;
1834 threadCyclesVALU
1835 .name(name() + ".thread_cycles_valu")
1836 .desc("Number of thread cycles used to execute vector ALU ops. "
1837 "Similar to instCyclesVALU but multiplied by the number of "
1838 "active threads.")
1839 ;
1840 vALUUtilization
1841 .name(name() + ".valu_utilization")
1842 .desc("Percentage of active vector ALU threads in a wave.")
1843 ;
1844 ldsNoFlatInsts
1845 .name(name() + ".lds_no_flat_insts")
1846 .desc("Number of LDS insts issued, not including FLAT "
1847 "accesses that resolve to LDS.")
1848 ;
1849 ldsNoFlatInstsPerWF
1850 .name(name() + ".lds_no_flat_insts_per_wf")
1851 .desc("The avg. number of LDS insts (not including FLAT "
1852 "accesses that resolve to LDS) per-wavefront.")
1853 ;
1854 flatVMemInsts
1855 .name(name() + ".flat_vmem_insts")
1856 .desc("The number of FLAT insts that resolve to vmem issued.")
1857 ;
1858 flatVMemInstsPerWF
1859 .name(name() + ".flat_vmem_insts_per_wf")
1860 .desc("The average number of FLAT insts that resolve to vmem "
1861 "issued per-wavefront.")
1862 ;
1863 flatLDSInsts
1864 .name(name() + ".flat_lds_insts")
1865 .desc("The number of FLAT insts that resolve to LDS issued.")
1866 ;
1867 flatLDSInstsPerWF
1868 .name(name() + ".flat_lds_insts_per_wf")
1869 .desc("The average number of FLAT insts that resolve to LDS "
1870 "issued per-wavefront.")
1871 ;
1872 vectorMemWrites
1873 .name(name() + ".vector_mem_writes")
1874 .desc("Number of vector mem write insts (excluding FLAT insts).")
1875 ;
1876 vectorMemWritesPerWF
1877 .name(name() + ".vector_mem_writes_per_wf")
1878 .desc("The average number of vector mem write insts "
1879 "(excluding FLAT insts) per-wavefront.")
1880 ;
1881 vectorMemReads
1882 .name(name() + ".vector_mem_reads")
1883 .desc("Number of vector mem read insts (excluding FLAT insts).")
1884 ;
1885 vectorMemReadsPerWF
1886 .name(name() + ".vector_mem_reads_per_wf")
1887 .desc("The avg. number of vector mem read insts (excluding "
1888 "FLAT insts) per-wavefront.")
1889 ;
1890 scalarMemWrites
1891 .name(name() + ".scalar_mem_writes")
1892 .desc("Number of scalar mem write insts.")
1893 ;
1894 scalarMemWritesPerWF
1895 .name(name() + ".scalar_mem_writes_per_wf")
1896 .desc("The average number of scalar mem write insts per-wavefront.")
1897 ;
1898 scalarMemReads
1899 .name(name() + ".scalar_mem_reads")
1900 .desc("Number of scalar mem read insts.")
1901 ;
1902 scalarMemReadsPerWF
1903 .name(name() + ".scalar_mem_reads_per_wf")
1904 .desc("The average number of scalar mem read insts per-wavefront.")
1905 ;
1906
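// Derived (formula) stats computed from the counters registered above; the
// constant 64 in vALUUtilization presumably corresponds to the wavefront size.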
1907 vALUInstsPerWF = vALUInsts / completedWfs;
1908 sALUInstsPerWF = sALUInsts / completedWfs;
1909 vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
1910 ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
1911 flatVMemInstsPerWF = flatVMemInsts / completedWfs;
1912 flatLDSInstsPerWF = flatLDSInsts / completedWfs;
1913 vectorMemWritesPerWF = vectorMemWrites / completedWfs;
1914 vectorMemReadsPerWF = vectorMemReads / completedWfs;
1915 scalarMemWritesPerWF = scalarMemWrites / completedWfs;
1916 scalarMemReadsPerWF = scalarMemReads / completedWfs;
1917
1918 vectorMemReadsPerKiloInst
1919 .name(name() + ".vector_mem_reads_per_kilo_inst")
1920 .desc("Number of vector mem reads per kilo-instruction")
1921 ;
1922 vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
1923 vectorMemWritesPerKiloInst
1924 .name(name() + ".vector_mem_writes_per_kilo_inst")
1925 .desc("Number of vector mem writes per kilo-instruction")
1926 ;
1927 vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
1928 vectorMemInstsPerKiloInst
1929 .name(name() + ".vector_mem_insts_per_kilo_inst")
1930 .desc("Number of vector mem insts per kilo-instruction")
1931 ;
1932 vectorMemInstsPerKiloInst =
1933 ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
1934 scalarMemReadsPerKiloInst
1935 .name(name() + ".scalar_mem_reads_per_kilo_inst")
1936 .desc("Number of scalar mem reads per kilo-instruction")
1937 ;
1938 scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
1939 scalarMemWritesPerKiloInst
1940 .name(name() + ".scalar_mem_writes_per_kilo_inst")
1941 .desc("Number of scalar mem writes per kilo-instruction")
1942 ;
1943 scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
1944 scalarMemInstsPerKiloInst
1945 .name(name() + ".scalar_mem_insts_per_kilo_inst")
1946 .desc("Number of scalar mem insts per kilo-instruction")
1947 ;
1948 scalarMemInstsPerKiloInst =
1949 ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
1950
1951 instCyclesVMemPerSimd
1952 .init(numVectorALUs)
1953 .name(name() + ".inst_cycles_vector_memory")
1954 .desc("Number of cycles to send address, command, data from VRF to "
1955 "vector memory unit, per SIMD")
1956 ;
1957
1958 instCyclesScMemPerSimd
1959 .init(numVectorALUs)
1960 .name(name() + ".inst_cycles_scalar_memory")
1961 .desc("Number of cycles to send address, command, data from SRF to "
1962 "scalar memory unit, per SIMD")
1963 ;
1964
1965 instCyclesLdsPerSimd
1966 .init(numVectorALUs)
1967 .name(name() + ".inst_cycles_lds")
1968 .desc("Number of cycles to send address, command, data from VRF to "
1969 "LDS unit, per SIMD")
1970 ;
1971
1972 globalReads
1973 .name(name() + ".global_mem_reads")
1974 .desc("Number of reads to the global segment")
1975 ;
1976 globalWrites
1977 .name(name() + ".global_mem_writes")
1978 .desc("Number of writes to the global segment")
1979 ;
1980 globalMemInsts
1981 .name(name() + ".global_mem_insts")
1982 .desc("Number of memory instructions sent to the global segment")
1983 ;
1984 globalMemInsts = globalReads + globalWrites;
1985 argReads
1986 .name(name() + ".arg_reads")
1987 .desc("Number of reads to the arg segment")
1988 ;
1989 argWrites
1990 .name(name() + ".arg_writes")
1991 .desc("Number of writes to the arg segment")
1992 ;
1993 argMemInsts
1994 .name(name() + ".arg_mem_insts")
1995 .desc("Number of memory instructions sent to the arg segment")
1996 ;
1997 argMemInsts = argReads + argWrites;
1998 spillReads
1999 .name(name() + ".spill_reads")
2000 .desc("Number of reads to the spill segment")
2001 ;
2002 spillWrites
2003 .name(name() + ".spill_writes")
2004 .desc("Number of writes to the spill segment")
2005 ;
2006 spillMemInsts
2007 .name(name() + ".spill_mem_insts")
2008 .desc("Number of memory instructions sent to the spill segment")
2009 ;
2010 spillMemInsts = spillReads + spillWrites;
2011 groupReads
2012 .name(name() + ".group_reads")
2013 .desc("Number of reads to the group segment")
2014 ;
2015 groupWrites
2016 .name(name() + ".group_writes")
2017 .desc("Number of writes to the group segment")
2018 ;
2019 groupMemInsts
2020 .name(name() + ".group_mem_insts")
2021 .desc("Number of memory instructions sent to the group segment")
2022 ;
2023 groupMemInsts = groupReads + groupWrites;
2024 privReads
2025 .name(name() + ".private_reads")
2026 .desc("Number of reads to the private segment")
2027 ;
2028 privWrites
2029 .name(name() + ".private_writes")
2030 .desc("Number of writes to the private segment")
2031 ;
2032 privMemInsts
2033 .name(name() + ".private_mem_insts")
2034 .desc("Number of memory instructions sent to the private segment")
2035 ;
2036 privMemInsts = privReads + privWrites;
2037 readonlyReads
2038 .name(name() + ".readonly_reads")
2039 .desc("Number of reads to the readonly segment")
2040 ;
2041 readonlyWrites
2042 .name(name() + ".readonly_writes")
2043 .desc("Number of writes to the readonly segment")
2044 ;
2045 readonlyMemInsts
2046 .name(name() + ".readonly_mem_insts")
2047 .desc("Number of memory instructions sent to the readonly segment")
2048 ;
2049 readonlyMemInsts = readonlyReads + readonlyWrites;
2050 kernargReads
2051 .name(name() + ".kernarg_reads")
2052 .desc("Number of reads to the kernarg segment")
2053 ;
2054 kernargWrites
2055 .name(name() + ".kernarg_writes")
2056 .desc("Number of writes to the kernarg segment")
2057 ;
2058 kernargMemInsts
2059 .name(name() + ".kernarg_mem_insts")
2060 .desc("Number of memory instructions sent to the kernarg segment")
2061 ;
2062 kernargMemInsts = kernargReads + kernargWrites;
2063
2064 tlbCycles
2065 .name(name() + ".tlb_cycles")
2066 .desc("total number of cycles for all uncoalesced requests")
2067 ;
2068
2069 tlbRequests
2070 .name(name() + ".tlb_requests")
2071 .desc("number of uncoalesced requests")
2072 ;
2073
2074 tlbLatency
2075 .name(name() + ".avg_translation_latency")
2076 .desc("Avg. translation latency for data translations")
2077 ;
2078
2079 tlbLatency = tlbCycles / tlbRequests;
2080
2081 hitsPerTLBLevel
2082 .init(4)
2083 .name(name() + ".TLB_hits_distribution")
2084 .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
2085 ;
2086
2087 // fixed number of TLB levels
2088 for (int i = 0; i < 4; ++i) {
2089 if (!i)
2090 hitsPerTLBLevel.subname(i,"page_table");
2091 else
2092 hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2093 }
2094
2095 execRateDist
2096 .init(0, 10, 2)
2097 .name(name() + ".inst_exec_rate")
2098 .desc("Instruction Execution Rate: Number of executed vector "
2099 "instructions per cycle")
2100 ;
2101
2102 ldsBankConflictDist
2103 .init(0, wfSize(), 2)
2104 .name(name() + ".lds_bank_conflicts")
2105 .desc("Number of bank conflicts per LDS memory packet")
2106 ;
2107
2108 ldsBankAccesses
2109 .name(name() + ".lds_bank_access_cnt")
2110 .desc("Total number of LDS bank accesses")
2111 ;
2112
2113 pageDivergenceDist
2114 // A wavefront can touch up to N pages per memory instruction where
2115 // N is equal to the wavefront size
2116 // The number of pages per bin can be configured (here it's 4).
2117 .init(1, wfSize(), 4)
2118 .name(name() + ".page_divergence_dist")
2119 .desc("pages touched per wf (over all mem. instr.)")
2120 ;
2121
2122 controlFlowDivergenceDist
2123 .init(1, wfSize(), 4)
2124 .name(name() + ".warp_execution_dist")
2125 .desc("number of lanes active per instruction (over all instructions)")
2126 ;
2127
2128 activeLanesPerGMemInstrDist
2129 .init(1, wfSize(), 4)
2130 .name(name() + ".gmem_lanes_execution_dist")
2131 .desc("number of active lanes per global memory instruction")
2132 ;
2133
2134 activeLanesPerLMemInstrDist
2135 .init(1, wfSize(), 4)
2136 .name(name() + ".lmem_lanes_execution_dist")
2137 .desc("number of active lanes per local memory instruction")
2138 ;
2139
2140 numInstrExecuted
2141 .name(name() + ".num_instr_executed")
2142 .desc("number of instructions executed")
2143 ;
2144
2145 numVecOpsExecuted
2146 .name(name() + ".num_vec_ops_executed")
2147 .desc("number of vec ops executed (e.g. WF size/inst)")
2148 ;
2149
2150 numVecOpsExecutedF16
2151 .name(name() + ".num_vec_ops_f16_executed")
2152 .desc("number of f16 vec ops executed (e.g. WF size/inst)")
2153 ;
2154
2155 numVecOpsExecutedF32
2156 .name(name() + ".num_vec_ops_f32_executed")
2157 .desc("number of f32 vec ops executed (e.g. WF size/inst)")
2158 ;
2159
2160 numVecOpsExecutedF64
2161 .name(name() + ".num_vec_ops_f64_executed")
2162 .desc("number of f64 vec ops executed (e.g. WF size/inst)")
2163 ;
2164
2165 numVecOpsExecutedFMA16
2166 .name(name() + ".num_vec_ops_fma16_executed")
2167 .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
2168 ;
2169
2170 numVecOpsExecutedFMA32
2171 .name(name() + ".num_vec_ops_fma32_executed")
2172 .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
2173 ;
2174
2175 numVecOpsExecutedFMA64
2176 .name(name() + ".num_vec_ops_fma64_executed")
2177 .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
2178 ;
2179
2180 numVecOpsExecutedMAD16
2181 .name(name() + ".num_vec_ops_mad16_executed")
2182 .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
2183 ;
2184
2185 numVecOpsExecutedMAD32
2186 .name(name() + ".num_vec_ops_mad32_executed")
2187 .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
2188 ;
2189
2190 numVecOpsExecutedMAD64
2191 .name(name() + ".num_vec_ops_mad64_executed")
2192 .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
2193 ;
2194
2195 numVecOpsExecutedMAC16
2196 .name(name() + ".num_vec_ops_mac16_executed")
2197 .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
2198 ;
2199
2200 numVecOpsExecutedMAC32
2201 .name(name() + ".num_vec_ops_mac32_executed")
2202 .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
2203 ;
2204
2205 numVecOpsExecutedMAC64
2206 .name(name() + ".num_vec_ops_mac64_executed")
2207 .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
2208 ;
2209
2210 numVecOpsExecutedTwoOpFP
2211 .name(name() + ".num_vec_ops_two_op_fp_executed")
2212 .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
2213 ;
2214
2215 totalCycles
2216 .name(name() + ".num_total_cycles")
2217 .desc("number of cycles the CU ran for")
2218 ;
2219
2220 ipc
2221 .name(name() + ".ipc")
2222 .desc("Instructions per cycle (this CU only)")
2223 ;
2224
2225 vpc
2226 .name(name() + ".vpc")
2227 .desc("Vector Operations per cycle (this CU only)")
2228 ;
2229
2230 vpc_f16
2231 .name(name() + ".vpc_f16")
2232 .desc("F16 Vector Operations per cycle (this CU only)")
2233 ;
2234
2235 vpc_f32
2236 .name(name() + ".vpc_f32")
2237 .desc("F32 Vector Operations per cycle (this CU only)")
2238 ;
2239
2240 vpc_f64
2241 .name(name() + ".vpc_f64")
2242 .desc("F64 Vector Operations per cycle (this CU only)")
2243 ;
2244
2245 numALUInstsExecuted
2246 .name(name() + ".num_alu_insts_executed")
2247 .desc("Number of dynamic non-memory (ALU) insts executed")
2248 ;
2249
2250 wgBlockedDueBarrierAllocation
2251 .name(name() + ".wg_blocked_due_barrier_alloc")
2252 .desc("WG dispatch was blocked due to lack of barrier resources")
2253 ;
2254
2255 wgBlockedDueLdsAllocation
2256 .name(name() + ".wg_blocked_due_lds_alloc")
2257 .desc("Workgroup blocked due to LDS capacity")
2258 ;
2259
2260 ipc = numInstrExecuted / totalCycles;
2261 vpc = numVecOpsExecuted / totalCycles;
2262 vpc_f16 = numVecOpsExecutedF16 / totalCycles;
2263 vpc_f32 = numVecOpsExecutedF32 / totalCycles;
2264 vpc_f64 = numVecOpsExecutedF64 / totalCycles;
2265
2266 numTimesWgBlockedDueVgprAlloc
2267 .name(name() + ".times_wg_blocked_due_vgpr_alloc")
2268 .desc("Number of times WGs are blocked due to VGPR allocation per "
2269 "SIMD")
2270 ;
2271
2272 numTimesWgBlockedDueSgprAlloc
2273 .name(name() + ".times_wg_blocked_due_sgpr_alloc")
2274 .desc("Number of times WGs are blocked due to SGPR allocation per "
2275 "SIMD")
2276 ;
2277
2278 dynamicGMemInstrCnt
2279 .name(name() + ".global_mem_instr_cnt")
2280 .desc("dynamic non-flat global memory instruction count")
2281 ;
2282
2283 dynamicFlatMemInstrCnt
2284 .name(name() + ".flat_global_mem_instr_cnt")
2285 .desc("dynamic flat global memory instruction count")
2286 ;
2287
2288 dynamicLMemInstrCnt
2289 .name(name() + ".local_mem_instr_cnt")
2290 .desc("dynamic local memory instruction count")
2291 ;
2292
2293 numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
2294 dynamicLMemInstrCnt;
2295
2296 completedWfs
2297 .name(name() + ".num_completed_wfs")
2298 .desc("number of completed wavefronts")
2299 ;
2300
2301 completedWGs
2302 .name(name() + ".num_completed_wgs")
2303 .desc("number of completed workgroups")
2304 ;
2305
2306 numCASOps
2307 .name(name() + ".num_CAS_ops")
2308 .desc("number of compare and swap operations")
2309 ;
2310
2311 numFailedCASOps
2312 .name(name() + ".num_failed_CAS_ops")
2313 .desc("number of compare and swap operations that failed")
2314 ;
2315
2316 headTailLatency
2317 .init(0, 1000000, 10000)
2318 .name(name() + ".head_tail_latency")
2319 .desc("ticks between first and last cache block arrival at coalescer")
2320 .flags(Stats::pdf | Stats::oneline)
2321 ;
2322
2323 waveLevelParallelism
2324 .init(0, shader->n_wf * numVectorALUs, 1)
2325 .name(name() + ".wlp")
2326 .desc("wave level parallelism: count of active waves at wave launch")
2327 ;
2328
2329 instInterleave
2330 .init(numVectorALUs, 0, 20, 1)
2331 .name(name() + ".interleaving")
2332 .desc("Measure of instruction interleaving per SIMD")
2333 ;
2334
2335 // register stats of pipeline stages
2336 fetchStage.regStats();
2337 scoreboardCheckStage.regStats();
2338 scheduleStage.regStats();
2339 execStage.regStats();
2340
2341 // register stats of memory pipelines
2342 globalMemoryPipe.regStats();
2343 localMemoryPipe.regStats();
2344 scalarMemoryPipe.regStats();
2345
2346 registerManager->regStats();
2347 }
2348
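// Classify a dynamic instruction for stat collection: scalar vs. vector,
// ALU vs. memory, FLAT vs. LDS, and, for loads and stores, the memory
// segment the access executed in.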
2349 void
2350 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
2351 {
2352 if (gpuDynInst->isScalar()) {
2353 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
2354 sALUInsts++;
2355 instCyclesSALU++;
2356 } else if (gpuDynInst->isLoad()) {
2357 scalarMemReads++;
2358 } else if (gpuDynInst->isStore()) {
2359 scalarMemWrites++;
2360 }
2361 } else {
2362 if (gpuDynInst->isALU()) {
2363 shader->total_valu_insts++;
2364 if (shader->total_valu_insts == shader->max_valu_insts) {
2365 exitSimLoop("max vALU insts");
2366 }
2367 vALUInsts++;
2368 instCyclesVALU++;
2369 threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
2370 } else if (gpuDynInst->isFlat()) {
2371 if (gpuDynInst->isLocalMem()) {
2372 flatLDSInsts++;
2373 } else {
2374 flatVMemInsts++;
2375 }
2376 } else if (gpuDynInst->isLocalMem()) {
2377 ldsNoFlatInsts++;
2378 } else if (gpuDynInst->isLoad()) {
2379 vectorMemReads++;
2380 } else if (gpuDynInst->isStore()) {
2381 vectorMemWrites++;
2382 }
2383
2384 if (gpuDynInst->isLoad()) {
2385 switch (gpuDynInst->executedAs()) {
2386 case Enums::SC_SPILL:
2387 spillReads++;
2388 break;
2389 case Enums::SC_GLOBAL:
2390 globalReads++;
2391 break;
2392 case Enums::SC_GROUP:
2393 groupReads++;
2394 break;
2395 case Enums::SC_PRIVATE:
2396 privReads++;
2397 break;
2398 case Enums::SC_READONLY:
2399 readonlyReads++;
2400 break;
2401 case Enums::SC_KERNARG:
2402 kernargReads++;
2403 break;
2404 case Enums::SC_ARG:
2405 argReads++;
2406 break;
2407 case Enums::SC_NONE:
2408 /**
2409 * this case can occur for flat mem insts
2410 * who execute with EXEC = 0
2411 */
2412 break;
2413 default:
2414 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2415 break;
2416 }
2417 } else if (gpuDynInst->isStore()) {
2418 switch (gpuDynInst->executedAs()) {
2419 case Enums::SC_SPILL:
2420 spillWrites++;
2421 break;
2422 case Enums::SC_GLOBAL:
2423 globalWrites++;
2424 break;
2425 case Enums::SC_GROUP:
2426 groupWrites++;
2427 break;
2428 case Enums::SC_PRIVATE:
2429 privWrites++;
2430 break;
2431 case Enums::SC_READONLY:
2432 readonlyWrites++;
2433 break;
2434 case Enums::SC_KERNARG:
2435 kernargWrites++;
2436 break;
2437 case Enums::SC_ARG:
2438 argWrites++;
2439 break;
2440 case Enums::SC_NONE:
2441 /**
2442 * this case can occur for flat mem insts
2443 * who execute with EXEC = 0
2444 */
2445 break;
2446 default:
2447 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2448 break;
2449 }
2450 }
2451 }
2452 }
2453
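// Record a touch of the virtual page containing addr; the per-page counts
// collected here are presumably consumed when the page divergence
// distribution is sampled elsewhere.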
2454 void
2455 ComputeUnit::updatePageDivergenceDist(Addr addr)
2456 {
2457 Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
2458
2459 if (!pagesTouched.count(virt_page_addr))
2460 pagesTouched[virt_page_addr] = 1;
2461 else
2462 pagesTouched[virt_page_addr]++;
2463 }
2464
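// At simulation exit, if page-access counting is enabled, dump a CSV of
// per-page wavefront and work-item access counts to an output file named
// after this CU.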
2465 void
2466 ComputeUnit::exitCallback()
2467 {
2468 if (countPages) {
2469 std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2470
2471 *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2472 std::endl;
2473
2474 for (auto iter : pageAccesses) {
2475 *page_stat_file << std::hex << iter.first << ",";
2476 *page_stat_file << std::dec << iter.second.first << ",";
2477 *page_stat_file << std::dec << iter.second.second << std::endl;
2478 }
2479 }
2480 }
2481
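// The CU is considered done only when every vector ALU is idle and all
// memory pipelines and register buses can accept new work.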
2482 bool
2483 ComputeUnit::isDone() const
2484 {
2485 for (int i = 0; i < numVectorALUs; ++i) {
2486 if (!isVectorAluIdle(i)) {
2487 return false;
2488 }
2489 }
2490
2491 // TODO: FIXME if more than 1 of any memory pipe supported
2492 if (!srfToScalarMemPipeBus.rdy()) {
2493 return false;
2494 }
2495 if (!vrfToGlobalMemPipeBus.rdy()) {
2496 return false;
2497 }
2498 if (!vrfToLocalMemPipeBus.rdy()) {
2499 return false;
2500 }
2501
2502 if (!globalMemoryPipe.isGMReqFIFOWrRdy()
2503 || !localMemoryPipe.isLMReqFIFOWrRdy()
2504 || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
2505 !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
2506 return false;
2507 }
2508
2509 return true;
2510 }
2511
2512 int32_t
2513 ComputeUnit::getRefCounter(const uint32_t dispatchId,
2514 const uint32_t wgId) const
2515 {
2516 return lds.getRefCounter(dispatchId, wgId);
2517 }
2518
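// A vector ALU (SIMD unit) is idle when every wavefront slot assigned to it
// is in the stopped state.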
2519 bool
2520 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2521 {
2522 assert(simdId < numVectorALUs);
2523
2524 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2525 if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2526 return false;
2527 }
2528 }
2529
2530 return true;
2531 }
2532
2533 /**
2534 * Send a general request to the LDS.
2535 * Check the return value: the request may be NACK'd, and a false return
2536 * means the caller needs a fallback plan.
2537 */
2538 bool
2539 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2540 {
2541 // this is just a request to carry the GPUDynInstPtr
2542 // back and forth
2543 RequestPtr newRequest = std::make_shared<Request>();
2544 newRequest->setPaddr(0x0);
2545
2546 // ReadReq is not evaluated by the LDS, but the Packet ctor requires a command
2547 PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2548
2549 // This is the SenderState needed upon return
2550 newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2551
2552 return ldsPort.sendTimingReq(newPacket);
2553 }
2554
2555 /**
2556 * get the result of packets sent to the LDS when they return
2557 */
2558 bool
2559 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2560 {
2561 const ComputeUnit::LDSPort::SenderState *senderState =
2562 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2563
2564 fatal_if(!senderState, "did not get the right sort of sender state");
2565
2566 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2567
2568 delete packet->senderState;
2569 delete packet;
2570
2571 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2572 return true;
2573 }
2574
2575 /**
2576 * Attempt to send this packet. Either the port is already stalled, the
2577 * request is NACK'd and the port must stall, or the request goes through.
2578 * When a request cannot be sent, it is added to the retries queue.
2579 */
2580 bool
2581 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2582 {
2583 ComputeUnit::LDSPort::SenderState *sender_state =
2584 dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2585 fatal_if(!sender_state, "packet without a valid sender state");
2586
2587 GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
2588
2589 if (isStalled()) {
2590 fatal_if(retries.empty(), "must have retries waiting to be stalled");
2591
2592 retries.push(pkt);
2593
2594 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2595 computeUnit->cu_id, gpuDynInst->simdId,
2596 gpuDynInst->wfSlotId);
2597 return false;
2598 } else if (!RequestPort::sendTimingReq(pkt)) {
2599 // need to stall the LDS port until a recvReqRetry() is received
2600 // this indicates that there is more space
2601 stallPort();
2602 retries.push(pkt);
2603
2604 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2605 computeUnit->cu_id, gpuDynInst->simdId,
2606 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2607 return false;
2608 } else {
2609 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2610 computeUnit->cu_id, gpuDynInst->simdId,
2611 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2612 return true;
2613 }
2614 }
2615
2616 /**
2617 * The bus is telling the port that there is now space, so retrying stalled
2618 * requests should succeed. This lets a request be NACK'd and the receiver
2619 * signal when space becomes available, rather than the port retrying the
2620 * send every cycle.
2621 */
2622 void
2623 ComputeUnit::LDSPort::recvReqRetry()
2624 {
2625 auto queueSize = retries.size();
2626
2627 DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2628 computeUnit->cu_id, queueSize);
2629
2630 fatal_if(queueSize < 1,
2631 "why was there a recvReqRetry() with no pending reqs?");
2632 fatal_if(!isStalled(),
2633 "recvReqRetry() happened when the port was not stalled");
2634
2635 unstallPort();
2636
2637 while (!retries.empty()) {
2638 PacketPtr packet = retries.front();
2639
2640 DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2641
2642 if (!RequestPort::sendTimingReq(packet)) {
2643 // Stall port
2644 stallPort();
2645 DPRINTF(GPUPort, ": LDS send failed again\n");
2646 break;
2647 } else {
2648 DPRINTF(GPUPort, ": LDS send successful\n");
2649 retries.pop();
2650 }
2651 }
2652 }