/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/compute_unit.hh"

#include <limits>

#include "arch/x86/isa_traits.hh"
#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"
#include "sim/sim_exit.hh"

ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
    numVectorGlobalMemUnits(p->num_global_mem_pipes),
    numVectorSharedMemUnits(p->num_shared_mem_pipes),
    numScalarMemUnits(p->num_scalar_mem_pipes),
    numVectorALUs(p->num_SIMDs),
    numScalarALUs(p->num_scalar_cores),
    vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
    registerManager(p->register_manager),
    fetchStage(p, *this),
    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
    execStage(p, *this, scheduleToExecute),
    globalMemoryPipe(p, *this),
    localMemoryPipe(p, *this),
    scalarMemoryPipe(p, *this),
    tickEvent([this]{ exec(); }, "Compute unit tick event",
              false, Event::CPU_Tick_Pri),
    cu_id(p->cu_id),
    vrf(p->vector_register_file), srf(p->scalar_register_file),
    simdWidth(p->simd_width),
    spBypassPipeLength(p->spbypass_pipe_length),
    dpBypassPipeLength(p->dpbypass_pipe_length),
    scalarPipeStages(p->scalar_pipe_length),
    operandNetworkLength(p->operand_network_length),
    issuePeriod(p->issue_period),
    vrf_gm_bus_latency(p->vrf_gm_bus_latency),
    srf_scm_bus_latency(p->srf_scm_bus_latency),
    vrf_lm_bus_latency(p->vrf_lm_bus_latency),
    perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
    prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
    debugSegFault(p->debugSegFault),
    functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
    countPages(p->countPages),
    req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
    resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
    _requestorId(p->system->getRequestorId(this, "ComputeUnit")),
    lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    ldsPort(csprintf("%s-port", name()), this),
    scalarDataPort(csprintf("%s-port", name()), this),
    scalarDTLBPort(csprintf("%s-port", name()), this),
    sqcPort(csprintf("%s-port", name()), this),
    sqcTLBPort(csprintf("%s-port", name()), this),
    _cacheLineSize(p->system->cacheLineSize()),
    _numBarrierSlots(p->num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p->wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p)
{
    /**
     * This check is necessary because std::bitset only provides conversion
     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
     * There are a few places in the code where to_ullong() is used, however
     * if wavefrontSize is larger than a value the host can support then
     * bitset will throw a runtime exception. We should remove all use of
     * to_ulong() or to_ullong() so we can have wavefrontSize greater than
     * 64b, however until that is done this assert is required.
     */
    fatal_if(p->wf_size > std::numeric_limits<unsigned long long>::digits ||
             p->wf_size <= 0,
             "WF size is larger than the host can support");
    fatal_if(!isPowerOf2(wavefrontSize),
             "Wavefront size should be a power of 2");
    // calculate how many cycles a vector load or store will need to transfer
    // its data over the corresponding buses
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
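
    // A worked example, assuming a 64-lane wavefront and a 32-byte
    // vrf_to_coalescer_bus_width (illustrative values, not the defaults of
    // any particular config): 64 lanes * 4 bytes = 256 bytes per full-wave
    // access, so ceil(256 / 32) = 8 cycles per store transfer. The load
    // path divides the same 256 bytes by coalescerToVrfBusWidth.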

    // Initialization: all WF slots are assumed STOPPED
    idleWfs = p->n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p->num_barrier_slots, WFBarrier());

    for (int i = 0; i < p->num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    }

    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p->n_wf);

        for (int i = 0; i < p->n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }

    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());

    lds.setParent(this);

    if (p->execPolicy == "OLDEST-FIRST") {
        exec_policy = EXEC_POLICY::OLDEST;
    } else if (p->execPolicy == "ROUND-ROBIN") {
        exec_policy = EXEC_POLICY::RR;
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }

    for (int i = 0; i < p->port_memory_port_connection_count; ++i) {
        memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    for (int i = 0; i < p->port_translation_port_connection_count; ++i) {
        tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    // Setup tokens for response ports. The number of tokens in memPortTokens
    // is the total token count for the entire vector port (i.e., this CU).
    memPortTokens = new TokenManager(p->max_cu_tokens);

    registerExitCallback([this]() { exitCallback(); });

    lastExecCycle.resize(numVectorALUs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }
    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
    }
    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    activeWaves = 0;

    instExecPerSimd.resize(numVectorALUs, 0);

    // Calculate the number of bits to address a cache line
    panic_if(!isPowerOf2(_cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(_cacheLineSize);
}

ComputeUnit::~ComputeUnit()
{
    // Delete wavefront slots
    for (int j = 0; j < numVectorALUs; ++j) {
        for (int i = 0; i < shader->n_wf; ++i) {
            delete wfList[j][i];
        }
        lastVaddrSimd[j].clear();
    }
    lastVaddrCU.clear();
}

int
ComputeUnit::numExeUnits() const
{
    return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
        numVectorSharedMemUnits + numScalarMemUnits;
}

// index into readyList of the first memory unit
int
ComputeUnit::firstMemUnit() const
{
    return numVectorALUs + numScalarALUs;
}

// index into readyList of the last memory unit
int
ComputeUnit::lastMemUnit() const
{
    return numExeUnits() - 1;
}

// index into scalarALUs vector of SALU used by the wavefront
int
ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
{
    if (numScalarALUs == 1) {
        return 0;
    } else {
        return w->simdId % numScalarALUs;
    }
}

// index into readyList of Scalar ALU unit used by wavefront
int
ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
{
    return numVectorALUs + mapWaveToScalarAlu(w);
}

// index into readyList of Global Memory unit used by wavefront
int
ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 GM pipe supported
    return numVectorALUs + numScalarALUs;
}

// index into readyList of Local Memory unit used by wavefront
int
ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 LM pipe supported
    return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
}

// index into readyList of Scalar Memory unit used by wavefront
int
ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 ScM pipe supported
    return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
        numVectorSharedMemUnits;
}
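
// A summary of the execution-unit indexing implied by the mapping
// functions above (inferred from the offsets they return):
//   [0, numVectorALUs)                        vector ALUs, one per SIMD
//   [numVectorALUs, firstMemUnit())           scalar ALUs
//   firstMemUnit()                            global memory pipeline
//   firstMemUnit() + numVectorGlobalMemUnits  local (LDS) memory pipeline
//   lastMemUnit()                             scalar memory pipeline
// With the single-pipe-per-class restriction enforced in init(), the three
// memory units occupy consecutive slots ending at numExeUnits() - 1.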

void
ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
{
    w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
    w->gridSz[0] = task->gridSize(0);
    w->gridSz[1] = task->gridSize(1);
    w->gridSz[2] = task->gridSize(2);
    w->computeActualWgSz(task);
}

void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                            HSAQueueEntry *task, int bar_id, bool fetchContext)
{
    static int _n_wave = 0;

    VectorMask init_mask;
    init_mask.reset();

    for (int k = 0; k < wfSize(); ++k) {
        if (k + waveId * wfSize() < w->actualWgSzTotal)
            init_mask[k] = 1;
    }

    w->execMask() = init_mask;

    w->kernId = task->dispatchId();
    w->wfId = waveId;
    w->initMask = init_mask.to_ullong();

    if (bar_id > WFBarrier::InvalidID) {
        w->barrierId(bar_id);
    } else {
        assert(!w->hasBarrier());
    }

    for (int k = 0; k < wfSize(); ++k) {
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
            w->actualWgSz[1];
        w->workItemId[2][k] = (k + waveId * wfSize()) /
            (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
            w->workItemId[0][k];
    }
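
    // A worked example of the decomposition above, assuming an actual WG
    // size of 16x8x2 with 64-lane waves: lane k of wave waveId covers the
    // flattened work-item n = k + waveId * 64, whose coordinates are
    // x = n % 16, y = (n / 16) % 8, z = n / (16 * 8); workItemFlatId then
    // recombines them as z * (16 * 8) + y * 16 + x == n.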

    w->wgId = task->globalWgId();
    w->dispatchId = task->dispatchId();
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));

    // set the wavefront context to have a pointer to this section of the LDS
    w->ldsChunk = ldsChunk;

    int32_t refCount M5_VAR_USED =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();

    if (w->pendingFetch)
        w->dropFetch = true;

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);
    w->start(_n_wave++, task->codeAddr());

    waveLevelParallelism.sample(activeWaves);
    activeWaves++;
}

/**
 * trigger invalidate operation in the CU
 *
 * req: request initialized in shader, carrying the invalidate flags
 */
void
ComputeUnit::doInvalidate(RequestPtr req, int kernId){
    GPUDynInstPtr gpuDynInst
        = std::make_shared<GPUDynInst>(this, nullptr,
            new KernelLaunchStaticInst(), getAndIncSeqNum());

    // kern_id will be used in inv responses
    gpuDynInst->kern_id = kernId;
    // update contextId field
    req->setContext(gpuDynInst->wfDynId);

    injectGlobalMemFence(gpuDynInst, true, req);
}

/**
 * trigger flush operation in the CU
 *
 * gpuDynInst: inst passed to the request
 */
void
ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
    injectGlobalMemFence(gpuDynInst, true);
}

void
ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
{
    // If we aren't ticking, start it up!
    if (!tickEvent.scheduled()) {
        DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
        schedule(tickEvent, nextCycle());
    }

    // the kernel's invalidate must have finished before any wg dispatch
    assert(task->isInvDone());

    // reserve the LDS capacity allocated to the work group
    // disambiguated by the dispatch ID and workgroup ID, which should be
    // globally unique
    LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
                                          task->globalWgId(),
                                          task->ldsSize());

    panic_if(!ldsChunk, "was not able to reserve space for this WG");

    // calculate the number of 32-bit vector registers required
    // by each work item
    int vregDemand = task->numVectorRegs();
    int sregDemand = task->numScalarRegs();
    int wave_id = 0;

    int barrier_id = WFBarrier::InvalidID;

    /**
     * If this WG only has one WF it will not consume any barrier
     * resources because it has no need of them.
     */
    if (num_wfs_in_wg > 1) {
        /**
         * Find a free barrier slot for this WG. Each WF in the WG will
         * receive the same barrier ID.
         */
        barrier_id = getFreeBarrierId();
        auto &wf_barrier = barrierSlot(barrier_id);
        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,
                num_wfs_in_wg);
    }

    // Assign WFs according to numWfsToSched vector, which is computed by
    // hasDispResources()
    for (int j = 0; j < shader->n_wf; ++j) {
        for (int i = 0; i < numVectorALUs; ++i) {
            Wavefront *w = wfList[i][j];
            // Check if this wavefront slot is available and there are WFs
            // remaining to be dispatched to current SIMD:
            // WF slot must be stopped and not waiting
            // for a release to complete S_RETURNING
            if (w->getStatus() == Wavefront::S_STOPPED &&
                numWfsToSched[i] > 0) {
                // decrement number of WFs awaiting dispatch to current SIMD
                numWfsToSched[i] -= 1;

                fillKernelState(w, task);

                DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
                        "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
                        vregDemand, sregDemand);

                registerManager->allocateRegisters(w, vregDemand, sregDemand);

                startWavefront(w, wave_id, ldsChunk, task, barrier_id);
                ++wave_id;
            }
        }
    }
}

void
ComputeUnit::insertInPipeMap(Wavefront *w)
{
    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);
    GPUDynInstPtr ii = w->instructionBuffer.front();
    pipeMap.emplace(ii->seqNum());
}

void
ComputeUnit::deleteFromPipeMap(Wavefront *w)
{
    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);
    GPUDynInstPtr ii = w->instructionBuffer.front();
    // delete the dynamic instruction from the pipeline map
    auto it = pipeMap.find(ii->seqNum());
    panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
    pipeMap.erase(it);
}

bool
ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
{
    // compute true size of workgroup (after clamping to grid size)
    int trueWgSize[HSAQueueEntry::MAX_DIM];
    int trueWgSizeTotal = 1;

    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
                                 task->wgId(d) * task->wgSize(d));

        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    // calculate the number of WFs in this WG
    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;
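
    // The expression above is a ceiling division: e.g., a clamped WG of
    // 200 work items with a 64-wide wavefront needs (200 + 63) / 64 = 4
    // WFs, the last of which runs partially masked.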

    bool barrier_avail = true;

    if (numWfs > 1 && !freeBarrierIds.size()) {
        barrier_avail = false;
    }

    // calculate the number of 32-bit vector registers required by each
    // work item of the work group
    int vregDemandPerWI = task->numVectorRegs();
    // calculate the number of 32-bit scalar registers required by each
    // work item of the work group
    int sregDemandPerWI = task->numScalarRegs();

    // check if the total number of VGPRs and SGPRs required by all WFs
    // of the WG fit in the VRFs of all SIMD units and the CU's SRF
    panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
             "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
             "that has %d VGPRs\n",
             numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
    panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
             "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
             "with %d SGPRs\n",
             numWfs, sregDemandPerWI, numScalarRegsPerSimd);

    // number of WF slots that are not occupied
    int freeWfSlots = 0;
    // number of Wfs from WG that were successfully mapped to a SIMD
    int numMappedWfs = 0;
    numWfsToSched.clear();
    numWfsToSched.resize(numVectorALUs, 0);

    // attempt to map WFs to the SIMDs, based on WF slot availability
    // and register file availability
    for (int j = 0; j < shader->n_wf; ++j) {
        for (int i = 0; i < numVectorALUs; ++i) {
            if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
                ++freeWfSlots;
                // check if current WF will fit onto current SIMD/VRF
                // if all WFs have not yet been mapped to the SIMDs
                if (numMappedWfs < numWfs &&
                    registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
                                                      sregDemandPerWI) &&
                    registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
                                                      vregDemandPerWI)) {
                    numWfsToSched[i]++;
                    numMappedWfs++;
                }
            }
        }
    }

    // check that the number of mapped WFs is not greater
    // than the actual number of WFs
    assert(numMappedWfs <= numWfs);

    bool vregAvail = true;
    bool sregAvail = true;
    // if a WF to SIMD mapping was not found, find the limiting resource
    if (numMappedWfs < numWfs) {
        for (int j = 0; j < numVectorALUs; ++j) {
            // find if there are enough free VGPRs in the SIMD's VRF
            // to accommodate the WFs of the new WG that would be mapped
            // to this SIMD unit
            vregAvail &= registerManager->
                canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
            // find if there are enough free SGPRs in the SIMD's SRF
            // to accommodate the WFs of the new WG that would be mapped
            // to this SIMD unit
            sregAvail &= registerManager->
                canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
        }
    }

    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
            VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);

    if (!vregAvail) {
        ++numTimesWgBlockedDueVgprAlloc;
    }

    if (!sregAvail) {
        ++numTimesWgBlockedDueSgprAlloc;
    }

    // Return true if enough WF slots to submit workgroup and if there are
    // enough VGPRs to schedule all WFs to their SIMD units
    bool ldsAvail = lds.canReserve(task->ldsSize());
    if (!ldsAvail) {
        wgBlockedDueLdsAllocation++;
    }

    if (!barrier_avail) {
        wgBlockedDueBarrierAllocation++;
    }

    // Return true if the following are all true:
    // (a) all WFs of the WG were mapped to free WF slots
    // (b) there are enough VGPRs to schedule all WFs to their SIMD units
    // (c) there are enough SGPRs on the CU to schedule all WFs
    // (d) there is enough space in LDS to allocate for all WFs
    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
    return can_dispatch;
}

int
ComputeUnit::numYetToReachBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.numYetToReachBarrier();
}

bool
ComputeUnit::allAtBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.allAtBarrier();
}

void
ComputeUnit::incNumAtBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.incNumAtBarrier();
}

int
ComputeUnit::numAtBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.numAtBarrier();
}

int
ComputeUnit::maxBarrierCnt(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    return wf_barrier.maxBarrierCnt();
}

void
ComputeUnit::resetBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.reset();
}

void
ComputeUnit::decMaxBarrierCnt(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.decMaxBarrierCnt();
}

void
ComputeUnit::releaseBarrier(int bar_id)
{
    auto &wf_barrier = barrierSlot(bar_id);
    wf_barrier.release();
    freeBarrierIds.insert(bar_id);
}

void
ComputeUnit::releaseWFsFromBarrier(int bar_id)
{
    for (int i = 0; i < numVectorALUs; ++i) {
        for (int j = 0; j < shader->n_wf; ++j) {
            Wavefront *wf = wfList[i][j];
            if (wf->barrierId() == bar_id) {
                assert(wf->getStatus() == Wavefront::S_BARRIER);
                wf->setStatus(Wavefront::S_RUNNING);
            }
        }
    }
}
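
// A sketch of the barrier-slot lifecycle implied by the helpers above:
// dispWorkgroup() claims a free barrier ID for any multi-WF workgroup and
// sets the expected wave count via setMaxBarrierCnt(); incNumAtBarrier()
// is invoked as each WF arrives; once allAtBarrier() holds,
// releaseWFsFromBarrier() returns the waves to S_RUNNING; and
// releaseBarrier() recycles the ID into freeBarrierIds when the workgroup
// finishes.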

// Execute one clock worth of work on the ComputeUnit.
void
ComputeUnit::exec()
{
    // process reads and writes in the RFs
    for (auto &vecRegFile : vrf) {
        vecRegFile->exec();
    }

    for (auto &scRegFile : srf) {
        scRegFile->exec();
    }

    // Execute pipeline stages in reverse order to simulate
    // the pipeline latency
    scalarMemoryPipe.exec();
    globalMemoryPipe.exec();
    localMemoryPipe.exec();
    execStage.exec();
    scheduleStage.exec();
    scoreboardCheckStage.exec();
    fetchStage.exec();

    totalCycles++;

    // Put this CU to sleep if there is no more work to be done.
    if (!isDone()) {
        schedule(tickEvent, nextCycle());
    } else {
        shader->notifyCuSleep();
        DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
    }
}
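
// Note on the ordering in exec(): running the stages back-to-front within
// a single tick lets each stage consume the state its upstream neighbor
// produced on the previous cycle, modeling one cycle of latency per stage
// without explicit pipeline registers between every pair of stages.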

void
ComputeUnit::init()
{
    // Initialize CU Bus models and execution resources

    // Vector ALUs
    vectorALUs.clear();
    for (int i = 0; i < numVectorALUs; i++) {
        vectorALUs.emplace_back(this, clockPeriod());
    }

    // Scalar ALUs
    scalarALUs.clear();
    for (int i = 0; i < numScalarALUs; i++) {
        scalarALUs.emplace_back(this, clockPeriod());
    }

    // Vector Global Memory
    fatal_if(numVectorGlobalMemUnits > 1,
             "No support for multiple Global Memory Pipelines exists!!!");
    vectorGlobalMemUnit.init(this, clockPeriod());
    vrfToGlobalMemPipeBus.init(this, clockPeriod());
    glbMemToVrfBus.init(this, clockPeriod());

    // Vector Local/Shared Memory
    fatal_if(numVectorSharedMemUnits > 1,
             "No support for multiple Local Memory Pipelines exists!!!");
    vectorSharedMemUnit.init(this, clockPeriod());
    vrfToLocalMemPipeBus.init(this, clockPeriod());
    locMemToVrfBus.init(this, clockPeriod());

    // Scalar Memory
    fatal_if(numScalarMemUnits > 1,
             "No support for multiple Scalar Memory Pipelines exists!!!");
    scalarMemUnit.init(this, clockPeriod());
    srfToScalarMemPipeBus.init(this, clockPeriod());
    scalarMemToSrfBus.init(this, clockPeriod());

    vectorRegsReserved.resize(numVectorALUs, 0);
    scalarRegsReserved.resize(numVectorALUs, 0);

    fetchStage.init();
    scheduleStage.init();
    execStage.init();
    globalMemoryPipe.init();

    gmTokenPort.setTokenManager(memPortTokens);
}

bool
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
{
    // Ruby has completed the memory op. Schedule the mem_resp_event at the
    // appropriate cycle to process the timing memory response
    // This delay represents the pipeline delay
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    PortID index = sender_state->port_index;
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();

    // MemSyncResp + WriteAckResp are handled completely here and we don't
    // schedule a MemRespEvent to process the responses further
    if (pkt->cmd == MemCmd::MemSyncResp) {
        // This response is for 1 of the following request types:
        //  - kernel launch
        //  - kernel end
        //  - non-kernel mem sync

        // Kernel Launch
        // wavefront was nullptr when launching kernel, so it is meaningless
        // here (simdId=-1, wfSlotId=-1)
        if (gpuDynInst->isKernelLaunch()) {
            // for kernel launch, the original request must be both
            // kernel-type and acquire
            assert(pkt->req->isKernel());
            assert(pkt->req->isAcquire());

            // one D-Cache inv is done, decrement counter
            dispatcher.updateInvCounter(gpuDynInst->kern_id);

            delete pkt->senderState;
            delete pkt;
            return true;
        }

        // retrieve wavefront from inst
        Wavefront *w = gpuDynInst->wavefront();

        // Check if we are waiting on Kernel End Release
        if (w->getStatus() == Wavefront::S_RETURNING
            && gpuDynInst->isEndOfKernel()) {
            // for kernel end, the original request must be both kernel-type
            // and release
            assert(pkt->req->isKernel());
            assert(pkt->req->isRelease());

            // one wb done, decrement counter, and return whether all wbs are
            // done for the kernel
            bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);

            // not all wbs are done for the kernel, just release pkt
            // resources
            if (!isWbDone) {
                delete pkt->senderState;
                delete pkt;
                return true;
            }

            // all wbs are completed for the kernel, do retirement work
            // for the workgroup
            DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
                    computeUnit->cu_id, w->simdId, w->wfSlotId,
                    w->wfDynId, w->wgId);

            dispatcher.notifyWgCompl(w);
            w->setStatus(Wavefront::S_STOPPED);
        }

        if (!pkt->req->isKernel()) {
            w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
            DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                    "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                    gpuDynInst->disassemble(), w->outstandingReqs,
                    w->outstandingReqs - 1);
            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
        }

        delete pkt->senderState;
        delete pkt;
        return true;
    } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
        // this is for writeComplete callback
        // we simply decrement write-related wait counters
        assert(gpuDynInst);
        Wavefront *w M5_VAR_USED =
            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
        assert(w);
        DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);
        if (gpuDynInst->allLanesZero()) {
            // ask gm pipe to decrement request counters, instead of directly
            // performing here, to avoid asynchronous counter update and
            // instruction retirement (which may hurt waitcnt effects)
            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
                    computeUnit->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);
        }

        delete pkt->senderState;
        delete pkt;

        return true;
    }

    EventFunctionWrapper *mem_resp_event =
        computeUnit->memPort[index].createMemRespEvent(pkt);

    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            gpuDynInst->seqNum(), index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);

    return true;
}

bool
ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
{
    assert(!pkt->req->isKernel());

    // retrieve sender state
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;

    assert(pkt->isRead() || pkt->isWrite());
    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;

    /**
     * for each returned scalar request we decrement the
     * numScalarReqs counter that is associated with this
     * gpuDynInst, which should have been set to correspond
     * to the number of packets sent for the memory op.
     * once all packets return, the memory op is finished
     * and we can push it into the response queue.
     */
    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
                gpuDynInst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
                gpuDynInst);
        }
    }

    delete pkt->senderState;
    delete pkt;

    return true;
}

void
ComputeUnit::ScalarDataPort::recvReqRetry()
{
    // Pop each pending packet as it is successfully re-sent; stop at the
    // first packet the downstream port still cannot accept. (A while loop
    // is used here because popping from a deque invalidates the iterators
    // a range-based for loop would rely on.)
    while (!retries.empty()) {
        PacketPtr pkt = retries.front();
        if (!sendTimingReq(pkt)) {
            break;
        }
        retries.pop_front();
    }
}

void
ComputeUnit::DataPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        /** Currently Ruby can return false due to conflicts for the
         *  particular cache block or address. Thus other requests should be
         *  allowed to pass and the data port should expect multiple retries. */
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
}

bool
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
    computeUnit->fetchStage.processFetchReturn(pkt);
    return true;
}

void
ComputeUnit::SQCPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        Wavefront *wavefront M5_VAR_USED = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
}

void
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
{
    // There must be a way around this check to do the globalMemStart...
    Addr tmp_vaddr = pkt->req->getVaddr();

    updatePageDivergenceDist(tmp_vaddr);

    // set PC in request
    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    // figure out the type of the request to set read/write
    BaseTLB::Mode TLB_mode;
    assert(pkt->isRead() || pkt->isWrite());

    // only do some things if actually accessing data
    bool isDataAccess = pkt->isWrite() || pkt->isRead();

    // Check write before read for atomic operations
    // since atomic operations should use BaseTLB::Write
    if (pkt->isWrite()) {
        TLB_mode = BaseTLB::Write;
    } else if (pkt->isRead()) {
        TLB_mode = BaseTLB::Read;
    } else {
        fatal("pkt is not a read nor a write\n");
    }

    tlbCycles -= curTick();
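    // This subtraction pairs with the "tlbCycles += curTick()" in
    // DTLBPort::recvTimingResp: together the two add the translation's
    // round-trip time (response tick minus request tick) to tlbCycles.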
    ++tlbRequests;

    PortID tlbPort_index = perLaneTLB ? index : 0;

    if (shader->timingSim) {
        if (debugSegFault) {
            Process *p = shader->gpuTc->getProcessPtr();
            Addr vaddr = pkt->req->getVaddr();
            unsigned size = pkt->getSize();

            if ((vaddr + size - 1) % 64 < vaddr % 64) {
                panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
            }

            Addr paddr;

            if (!p->pTable->translate(vaddr, paddr)) {
                if (!p->fixupFault(vaddr)) {
                    panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                          vaddr);
                }
            }
        }

        // This is the SenderState needed upon return
        pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);

        // This is the senderState needed by the TLB hierarchy to function
        TheISA::GpuTLB::TranslationState *translation_state =
            new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc,
                                                 false, pkt->senderState);

        pkt->senderState = translation_state;

        if (functionalTLB) {
            tlbPort[tlbPort_index].sendFunctional(pkt);

            // update the hitLevel distribution
            int hit_level = translation_state->hitLevel;
            assert(hit_level != -1);
            hitsPerTLBLevel[hit_level]++;

            // New SenderState for the memory access
            X86ISA::GpuTLB::TranslationState *sender_state =
                safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

            delete sender_state->tlbEntry;
            delete sender_state->saved;
            delete sender_state;

            assert(pkt->req->hasPaddr());
            assert(pkt->req->hasSize());

            // this is necessary because the GPU TLB receives packets instead
            // of requests. when the translation is complete, all relevant
            // fields in the request will be populated, but not in the packet.
            // here we create the new packet so we can set the size, addr,
            // and proper flags.
            PacketPtr oldPkt = pkt;
            pkt = new Packet(oldPkt->req, oldPkt->cmd);

            uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
            pkt->dataStatic(tmpData);
            delete oldPkt;

            // New SenderState for the memory access
            pkt->senderState =
                new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
                                                       nullptr);

            gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
            gpuDynInst->tlbHitLevel[index] = hit_level;

            // translation is done. Schedule the mem_req_event at the
            // appropriate cycle to send the timing memory request to ruby
            EventFunctionWrapper *mem_req_event =
                memPort[index].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                    "scheduled\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, index, pkt->req->getPaddr());

            schedule(mem_req_event, curTick() + req_tick_latency);
        } else if (tlbPort[tlbPort_index].isStalled()) {
            assert(tlbPort[tlbPort_index].retries.size() > 0);

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index].retries.push_back(pkt);
        } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
            // Stall the data port;
            // No more packet will be issued till
            // ruby indicates resources are freed by
            // a recvReqRetry() call back on this port.
            tlbPort[tlbPort_index].stallPort();

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index].retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB,
                    "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                    tmp_vaddr);
        }
    } else {
        if (pkt->cmd == MemCmd::MemSyncReq) {
            gpuDynInst->resetEntireStatusVector();
        } else {
            gpuDynInst->decrementStatusVector(index);
        }

        // New SenderState for the memory access
        delete pkt->senderState;

        // Because it's atomic operation, only need TLB translation state
        pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
                                                                shader->gpuTc);

        tlbPort[tlbPort_index].sendFunctional(pkt);

        // the addr of the packet is not modified, so we need to create a new
        // packet, or otherwise the memory access will have the old virtual
        // address sent in the translation packet, instead of the physical
        // address returned by the translation.
        PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
        new_pkt->dataStatic(pkt->getPtr<uint8_t>());

        // Translation is done. It is safe to send the packet to memory.
        memPort[0].sendFunctional(new_pkt);

        DPRINTF(GPUMem, "Functional sendRequest\n");
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
                gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
                new_pkt->req->getPaddr());

        // safe_cast the senderState
        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete new_pkt;
        delete pkt->senderState;
        delete pkt;
    }
}

void
ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
{
    assert(pkt->isWrite() || pkt->isRead());

    BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write;

    pkt->senderState =
        new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);

    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
                                             pkt->senderState);

    if (scalarDTLBPort.isStalled()) {
        assert(scalarDTLBPort.retries.size());
        scalarDTLBPort.retries.push_back(pkt);
    } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
        scalarDTLBPort.stallPort();
        scalarDTLBPort.retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
                tlb_mode == BaseTLB::Read ? "read" : "write",
                pkt->req->getVaddr());
    }
}

void
ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                                  bool kernelMemSync,
                                  RequestPtr req)
{
    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == Enums::SC_GLOBAL);

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
    }

    // all mem sync requests have Paddr == 0
    req->setPaddr(0);

    PacketPtr pkt = nullptr;

    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {
            req->setCacheCoherenceFlags(Request::ACQUIRE);
            req->setReqInstSeqNum(gpuDynInst->seqNum());
            req->setFlags(Request::KERNEL);
            pkt = new Packet(req, MemCmd::MemSyncReq);
            pkt->pushSenderState(
                new ComputeUnit::DataPort::SenderState(gpuDynInst, 0,
                                                       nullptr));

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());

            schedule(mem_req_event, curTick() + req_tick_latency);
        } else {
            // kernel end release must be enabled
            assert(shader->impl_kern_end_rel);
            assert(gpuDynInst->isEndOfKernel());

            req->setCacheCoherenceFlags(Request::WB_L2);
            req->setReqInstSeqNum(gpuDynInst->seqNum());
            req->setFlags(Request::KERNEL);
            pkt = new Packet(req, MemCmd::MemSyncReq);
            pkt->pushSenderState(
                new ComputeUnit::DataPort::SenderState(gpuDynInst, 0,
                                                       nullptr));

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());

            schedule(mem_req_event, curTick() + req_tick_latency);
        }
    } else {
        gpuDynInst->setRequestFlags(req);

        req->setReqInstSeqNum(gpuDynInst->seqNum());

        pkt = new Packet(req, MemCmd::MemSyncReq);
        pkt->pushSenderState(
            new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));

        EventFunctionWrapper *mem_req_event =
            memPort[0].createMemReqEvent(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());

        schedule(mem_req_event, curTick() + req_tick_latency);
    }
}
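
// The three paths above differ mainly in coherence flags: the
// kernel-launch fence is an acquire (it triggers the D-cache invalidates
// counted by updateInvCounter), the kernel-end fence requests an L2
// writeback (Request::WB_L2), and the non-kernel path uses whatever flags
// the instruction itself carries. All of them travel as MemSyncReq packets
// with Paddr == 0 through vector memory port 0.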

void
ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
{
    DataPort::SenderState *sender_state =
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit = computeUnit;

    assert(gpuDynInst);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();

    // mem sync resp and write-complete callback must be handled already in
    // DataPort::recvTimingResp
    assert(pkt->cmd != MemCmd::MemSyncResp);
    assert(pkt->cmd != MemCmd::WriteCompleteResp);

    // this is for read, write and atomic
    int index = gpuDynInst->memStatusVector[paddr].back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            pkt->req->getPaddr(), id);

    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

    gpuDynInst->decrementStatusVector(index);
    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());

    if (gpuDynInst->allLanesZero()) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

        while (iter != end) {
            assert(iter->second.empty());
            ++iter;
        }

        // Calculate the difference between the arrival of the first cache
        // block and the last cache block to arrive if we have the time
        // for the first cache block.
        if (compute_unit->headTailMap.count(gpuDynInst)) {
            Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
            compute_unit->headTailLatency.sample(curTick() - headTick);
            compute_unit->headTailMap.erase(gpuDynInst);
        }

        gpuDynInst->memStatusVector.clear();

        // note: only handle read response here; for write, the response
        // is separately handled when writeComplete callback is received
        if (pkt->isRead()) {
            gpuDynInst->
                profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
            compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                    compute_unit->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);
        }
    } else {
        if (pkt->isRead()) {
            if (!compute_unit->headTailMap.count(gpuDynInst)) {
                compute_unit->headTailMap
                    .insert(std::make_pair(gpuDynInst, curTick()));
            }
        }
    }

    delete pkt->senderState;
    delete pkt;
}

ComputeUnit*
ComputeUnitParams::create()
{
    return new ComputeUnit(this);
}

bool
ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    assert(pkt->senderState);
    computeUnit->tlbCycles += curTick();

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // no PageFaults are permitted for data accesses
    if (!translation_state->tlbEntry) {
        DTLBPort::SenderState *sender_state =
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

        Wavefront *w M5_VAR_USED =
            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
            [sender_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    }

    // update the hitLevel distribution
    int hit_level = translation_state->hitLevel;
    computeUnit->hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;

    // for prefetch pkt
    BaseTLB::Mode TLB_mode = translation_state->tlbMode;

    delete translation_state;

    // use the original sender state to know how to close this transaction
    DTLBPort::SenderState *sender_state =
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    PortID mp_index = sender_state->portIndex;
    Addr vaddr = pkt->req->getVaddr();
    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

    MemCmd requestCmd;

    if (pkt->cmd == MemCmd::ReadResp) {
        requestCmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        requestCmd = MemCmd::WriteReq;
    } else if (pkt->cmd == MemCmd::SwapResp) {
        requestCmd = MemCmd::SwapReq;
    } else {
        panic("unsupported response to request conversion %s\n",
              pkt->cmd.toString());
    }

    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;
        Addr last = 0;

        switch(computeUnit->prefetchType) {
          case Enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case Enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
          default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
                             roundDown(last, TheISA::PageBytes))
                            >> TheISA::PageShift
                          : 0;

        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
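
        // Worked example: with 4 KiB pages, a previous access to 0x10000
        // followed by one to 0x13000 yields a stride of
        // (0x13000 - 0x10000) >> 12 = 3 pages, so a prefetch depth of N
        // probes vaddr + 3 * pf pages for pf = 1..N.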

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);

        // Prefetch Next few pages atomically
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
                    vaddr + stride * pf * TheISA::PageBytes);

            if (!stride)
                break;

            RequestPtr prefetch_req = std::make_shared<Request>(
                vaddr + stride * pf * TheISA::PageBytes,
                sizeof(uint8_t), 0,
                computeUnit->requestorId(),
                0, 0, nullptr);

            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
            uint8_t foo = 0;
            prefetch_pkt->dataStatic(&foo);

            // Because it's atomic operation, only need TLB translation state
            prefetch_pkt->senderState =
                new TheISA::GpuTLB::TranslationState(TLB_mode,
                    computeUnit->shader->gpuTc, true);

            // Currently prefetches are zero-latency, hence the sendFunctional
            sendFunctional(prefetch_pkt);

            /* safe_cast the senderState */
            TheISA::GpuTLB::TranslationState *tlb_state =
                safe_cast<TheISA::GpuTLB::TranslationState*>(
                        prefetch_pkt->senderState);

            delete tlb_state->tlbEntry;
            delete tlb_state;
            delete prefetch_pkt;
        }
    }

    // First we must convert the response cmd back to a request cmd so that
    // the request can be sent through the cu's request port
    PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
    new_pkt->dataStatic(pkt->getPtr<uint8_t>());
    delete pkt->senderState;
    delete pkt;

    // New SenderState for the memory access
    new_pkt->senderState =
        new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
                                               nullptr);

    // translation is done. Schedule the mem_req_event at the appropriate
    // cycle to send the timing memory request to ruby
    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);

    return true;
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
}

void
ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
{
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit M5_VAR_USED = computeUnit;

    if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                id, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
                pkt->req->getPaddr());
    }
}

const char*
ComputeUnit::ScalarDataPort::MemReqEvent::description() const
{
    return "ComputeUnit scalar memory request event";
}

void
ComputeUnit::ScalarDataPort::MemReqEvent::process()
{
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort.computeUnit;

    if (!(scalarDataPort.sendTimingReq(pkt))) {
        scalarDataPort.retries.push_back(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
                pkt->req->getPaddr());
    }
}

/*
 * The initial translation request could have been rejected, if
 * <retries> queue is not empty. Retry sending the translation
 * request. sendRetry() is called from the peer port whenever
 * a translation completes.
 */
void
ComputeUnit::DTLBPort::recvReqRetry()
{
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());
    // recvReqRetry is an indication that the resource on which this
    // port was stalling on is freed. So, remove the stall first
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // Stall port
            stallPort();
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}

bool
ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
{
    assert(pkt->senderState);

    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // Page faults are not allowed
    fatal_if(!translation_state->tlbEntry,
             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    pkt->senderState = translation_state->saved;
    delete translation_state;

    ScalarDTLBPort::SenderState *sender_state =
        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    delete pkt->senderState;

    Wavefront *w M5_VAR_USED = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(),
            pkt->req->getPaddr());

    MemCmd mem_cmd;

    if (pkt->cmd == MemCmd::ReadResp) {
        mem_cmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        mem_cmd = MemCmd::WriteReq;
    } else {
        fatal("Scalar DTLB received unexpected MemCmd response %s\n",
              pkt->cmd.toString());
    }

    PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
    req_pkt->dataStatic(pkt->getPtr<uint8_t>());
    delete pkt;

    req_pkt->senderState =
        new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);

    if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
        computeUnit->scalarDataPort.retries.push_back(req_pkt);
        DPRINTF(GPUMem, "send scalar req failed for: %s\n",
                gpuDynInst->disassemble());
    } else {
        DPRINTF(GPUMem, "send scalar req for: %s\n",
                gpuDynInst->disassemble());
    }

    return true;
}

bool
ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line M5_VAR_USED = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

    assert(pkt->senderState);

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state
        = safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;
    delete translation_state;

    // use the original sender state to know how to close this transaction
    ITLBPort::SenderState *sender_state =
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);

    // get the wavefront associated with this translation request
    Wavefront *wavefront = sender_state->wavefront;
    delete pkt->senderState;

    if (success) {
        // pkt is reused in fetch(), don't delete it here. However, we must
        // reset the command to be a request so that it can be sent through
        // the cu's request port
        assert(pkt->cmd == MemCmd::ReadResp);
        pkt->cmd = MemCmd::ReadReq;

        computeUnit->fetchStage.fetch(pkt, wavefront);
    } else {
        if (wavefront->dropFetch) {
            assert(wavefront->instructionBuffer.empty());
            wavefront->dropFetch = false;
        }

        wavefront->pendingFetch = 0;
    }

    return true;
}

/*
 * The initial translation request could have been rejected, if
 * <retries> queue is not empty. Retry sending the translation
 * request. sendRetry() is called from the peer port whenever
 * a translation completes.
 */
void
ComputeUnit::ITLBPort::recvReqRetry()
{
    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());

    // recvReqRetry is an indication that the resource on which this
    // port was stalling on is freed. So, remove the stall first
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            stallPort(); // Stall port
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}

void
ComputeUnit::regStats()
{
    ClockedObject::regStats();

    vALUInsts
        .name(name() + ".valu_insts")
        .desc("Number of vector ALU insts issued.")
        ;
    vALUInstsPerWF
        .name(name() + ".valu_insts_per_wf")
        .desc("The avg. number of vector ALU insts issued per-wavefront.")
        ;
    sALUInsts
        .name(name() + ".salu_insts")
        .desc("Number of scalar ALU insts issued.")
        ;
    sALUInstsPerWF
        .name(name() + ".salu_insts_per_wf")
        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
        ;
    instCyclesVALU
        .name(name() + ".inst_cycles_valu")
        .desc("Number of cycles needed to execute VALU insts.")
        ;
    instCyclesSALU
        .name(name() + ".inst_cycles_salu")
        .desc("Number of cycles needed to execute SALU insts.")
        ;
    threadCyclesVALU
        .name(name() + ".thread_cycles_valu")
        .desc("Number of thread cycles used to execute vector ALU ops. "
              "Similar to instCyclesVALU but multiplied by the number of "
              "active threads.")
        ;
    vALUUtilization
        .name(name() + ".valu_utilization")
        .desc("Percentage of active vector ALU threads in a wave.")
        ;
    ldsNoFlatInsts
        .name(name() + ".lds_no_flat_insts")
        .desc("Number of LDS insts issued, not including FLAT "
              "accesses that resolve to LDS.")
        ;
    ldsNoFlatInstsPerWF
        .name(name() + ".lds_no_flat_insts_per_wf")
        .desc("The avg. number of LDS insts (not including FLAT "
              "accesses that resolve to LDS) per-wavefront.")
        ;
    flatVMemInsts
        .name(name() + ".flat_vmem_insts")
        .desc("The number of FLAT insts that resolve to vmem issued.")
        ;
    flatVMemInstsPerWF
        .name(name() + ".flat_vmem_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to vmem "
              "issued per-wavefront.")
        ;
    flatLDSInsts
        .name(name() + ".flat_lds_insts")
        .desc("The number of FLAT insts that resolve to LDS issued.")
        ;
    flatLDSInstsPerWF
        .name(name() + ".flat_lds_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to LDS "
              "issued per-wavefront.")
        ;
    vectorMemWrites
        .name(name() + ".vector_mem_writes")
        .desc("Number of vector mem write insts (excluding FLAT insts).")
        ;
    vectorMemWritesPerWF
        .name(name() + ".vector_mem_writes_per_wf")
        .desc("The average number of vector mem write insts "
              "(excluding FLAT insts) per-wavefront.")
        ;
    vectorMemReads
        .name(name() + ".vector_mem_reads")
        .desc("Number of vector mem read insts (excluding FLAT insts).")
        ;
    vectorMemReadsPerWF
        .name(name() + ".vector_mem_reads_per_wf")
        .desc("The avg. number of vector mem read insts (excluding "
              "FLAT insts) per-wavefront.")
        ;
    scalarMemWrites
        .name(name() + ".scalar_mem_writes")
        .desc("Number of scalar mem write insts.")
        ;
    scalarMemWritesPerWF
        .name(name() + ".scalar_mem_writes_per_wf")
        .desc("The average number of scalar mem write insts per-wavefront.")
        ;
    scalarMemReads
        .name(name() + ".scalar_mem_reads")
        .desc("Number of scalar mem read insts.")
        ;
    scalarMemReadsPerWF
        .name(name() + ".scalar_mem_reads_per_wf")
        .desc("The average number of scalar mem read insts per-wavefront.")
        ;
    vALUInstsPerWF = vALUInsts / completedWfs;
    sALUInstsPerWF = sALUInsts / completedWfs;
    vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
    vectorMemReadsPerWF = vectorMemReads / completedWfs;
    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
    scalarMemReadsPerWF = scalarMemReads / completedWfs;
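    // Worked example (illustrative numbers, not from the source): with
    // vALUInsts = 2000 and completedWfs = 100, valu_insts_per_wf = 20.
    // Note the utilization formula hard-codes a 64-lane wavefront: with
    // instCyclesVALU = 500 and threadCyclesVALU = 16000, valu_utilization
    // = (16000 / (64 * 500)) * 100 = 50, i.e. half the lanes were active
    // on average.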
    vectorMemReadsPerKiloInst
        .name(name() + ".vector_mem_reads_per_kilo_inst")
        .desc("Number of vector mem reads per kilo-instruction")
        ;
    vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
    vectorMemWritesPerKiloInst
        .name(name() + ".vector_mem_writes_per_kilo_inst")
        .desc("Number of vector mem writes per kilo-instruction")
        ;
    vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
    vectorMemInstsPerKiloInst
        .name(name() + ".vector_mem_insts_per_kilo_inst")
        .desc("Number of vector mem insts per kilo-instruction")
        ;
    vectorMemInstsPerKiloInst =
        ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
    scalarMemReadsPerKiloInst
        .name(name() + ".scalar_mem_reads_per_kilo_inst")
        .desc("Number of scalar mem reads per kilo-instruction")
        ;
    scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
    scalarMemWritesPerKiloInst
        .name(name() + ".scalar_mem_writes_per_kilo_inst")
        .desc("Number of scalar mem writes per kilo-instruction")
        ;
    scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
    scalarMemInstsPerKiloInst
        .name(name() + ".scalar_mem_insts_per_kilo_inst")
        .desc("Number of scalar mem insts per kilo-instruction")
        ;
    scalarMemInstsPerKiloInst =
        ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
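    // Worked example (illustrative numbers): with vectorMemReads = 150
    // and numInstrExecuted = 10000, vector_mem_reads_per_kilo_inst =
    // (150 / 10000) * 1000 = 15, i.e. 15 vector loads per thousand
    // executed instructions.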
    instCyclesVMemPerSimd
        .init(numVectorALUs)
        .name(name() + ".inst_cycles_vector_memory")
        .desc("Number of cycles to send address, command, data from VRF to "
              "vector memory unit, per SIMD")
        ;

    instCyclesScMemPerSimd
        .init(numVectorALUs)
        .name(name() + ".inst_cycles_scalar_memory")
        .desc("Number of cycles to send address, command, data from SRF to "
              "scalar memory unit, per SIMD")
        ;

    instCyclesLdsPerSimd
        .init(numVectorALUs)
        .name(name() + ".inst_cycles_lds")
        .desc("Number of cycles to send address, command, data from VRF to "
              "LDS unit, per SIMD")
        ;

    globalReads
        .name(name() + ".global_mem_reads")
        .desc("Number of reads to the global segment")
        ;
    globalWrites
        .name(name() + ".global_mem_writes")
        .desc("Number of writes to the global segment")
        ;
    globalMemInsts
        .name(name() + ".global_mem_insts")
        .desc("Number of memory instructions sent to the global segment")
        ;
    globalMemInsts = globalReads + globalWrites;
    argReads
        .name(name() + ".arg_reads")
        .desc("Number of reads to the arg segment")
        ;
    argWrites
        .name(name() + ".arg_writes")
        .desc("Number of writes to the arg segment")
        ;
    argMemInsts
        .name(name() + ".arg_mem_insts")
        .desc("Number of memory instructions sent to the arg segment")
        ;
    argMemInsts = argReads + argWrites;
    spillReads
        .name(name() + ".spill_reads")
        .desc("Number of reads to the spill segment")
        ;
    spillWrites
        .name(name() + ".spill_writes")
        .desc("Number of writes to the spill segment")
        ;
    spillMemInsts
        .name(name() + ".spill_mem_insts")
        .desc("Number of memory instructions sent to the spill segment")
        ;
    spillMemInsts = spillReads + spillWrites;
    groupReads
        .name(name() + ".group_reads")
        .desc("Number of reads to the group segment")
        ;
    groupWrites
        .name(name() + ".group_writes")
        .desc("Number of writes to the group segment")
        ;
    groupMemInsts
        .name(name() + ".group_mem_insts")
        .desc("Number of memory instructions sent to the group segment")
        ;
    groupMemInsts = groupReads + groupWrites;
    privReads
        .name(name() + ".private_reads")
        .desc("Number of reads to the private segment")
        ;
    privWrites
        .name(name() + ".private_writes")
        .desc("Number of writes to the private segment")
        ;
    privMemInsts
        .name(name() + ".private_mem_insts")
        .desc("Number of memory instructions sent to the private segment")
        ;
    privMemInsts = privReads + privWrites;
    readonlyReads
        .name(name() + ".readonly_reads")
        .desc("Number of reads to the readonly segment")
        ;
    readonlyWrites
        .name(name() + ".readonly_writes")
        .desc("Number of writes to the readonly segment")
        ;
    readonlyMemInsts
        .name(name() + ".readonly_mem_insts")
        .desc("Number of memory instructions sent to the readonly segment")
        ;
    readonlyMemInsts = readonlyReads + readonlyWrites;
    kernargReads
        .name(name() + ".kernarg_reads")
        .desc("Number of reads sent to the kernarg segment")
        ;
    kernargWrites
        .name(name() + ".kernarg_writes")
        .desc("Number of writes sent to the kernarg segment")
        ;
    kernargMemInsts
        .name(name() + ".kernarg_mem_insts")
        .desc("Number of memory instructions sent to the kernarg segment")
        ;
    kernargMemInsts = kernargReads + kernargWrites;

    tlbCycles
        .name(name() + ".tlb_cycles")
        .desc("total number of cycles for all uncoalesced requests")
        ;

    tlbRequests
        .name(name() + ".tlb_requests")
        .desc("number of uncoalesced requests")
        ;

    tlbLatency
        .name(name() + ".avg_translation_latency")
        .desc("Avg. translation latency for data translations")
        ;

    tlbLatency = tlbCycles / tlbRequests;
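    // Since tlb_cycles accumulates the full latency of every uncoalesced
    // request, this quotient is the mean per-request translation latency:
    // e.g. 8000 cycles over 400 requests averages to 20 cycles
    // (illustrative numbers).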
    hitsPerTLBLevel
        .init(4)
        .name(name() + ".TLB_hits_distribution")
        .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
        ;

    // fixed number of TLB levels
    for (int i = 0; i < 4; ++i) {
        if (!i)
            hitsPerTLBLevel.subname(i, "page_table");
        else
            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
    }
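    // The vector is indexed by where the translation was resolved, so the
    // stats output reads <cu>.TLB_hits_distribution::page_table for full
    // page table walks and ::L1_TLB, ::L2_TLB, ::L3_TLB for hits at each
    // TLB level.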
    execRateDist
        .init(0, 10, 2)
        .name(name() + ".inst_exec_rate")
        .desc("Instruction Execution Rate: Number of executed vector "
              "instructions per cycle")
        ;

    ldsBankConflictDist
        .init(0, wfSize(), 2)
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;

    ldsBankAccesses
        .name(name() + ".lds_bank_access_cnt")
        .desc("Total number of LDS bank accesses")
        ;

    pageDivergenceDist
        // A wavefront can touch up to N pages per memory instruction,
        // where N is equal to the wavefront size.
        // The number of pages per bin can be configured (here it's 4).
        .init(1, wfSize(), 4)
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;

    controlFlowDivergenceDist
        .init(1, wfSize(), 4)
        .name(name() + ".warp_execution_dist")
        .desc("number of lanes active per instruction (over all instructions)")
        ;

    activeLanesPerGMemInstrDist
        .init(1, wfSize(), 4)
        .name(name() + ".gmem_lanes_execution_dist")
        .desc("number of active lanes per global memory instruction")
        ;

    activeLanesPerLMemInstrDist
        .init(1, wfSize(), 4)
        .name(name() + ".lmem_lanes_execution_dist")
        .desc("number of active lanes per local memory instruction")
        ;

    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed")
        ;

    numVecOpsExecuted
        .name(name() + ".num_vec_ops_executed")
        .desc("number of vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedF16
        .name(name() + ".num_vec_ops_f16_executed")
        .desc("number of f16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedF32
        .name(name() + ".num_vec_ops_f32_executed")
        .desc("number of f32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedF64
        .name(name() + ".num_vec_ops_f64_executed")
        .desc("number of f64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedFMA16
        .name(name() + ".num_vec_ops_fma16_executed")
        .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedFMA32
        .name(name() + ".num_vec_ops_fma32_executed")
        .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedFMA64
        .name(name() + ".num_vec_ops_fma64_executed")
        .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAD16
        .name(name() + ".num_vec_ops_mad16_executed")
        .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAD32
        .name(name() + ".num_vec_ops_mad32_executed")
        .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAD64
        .name(name() + ".num_vec_ops_mad64_executed")
        .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAC16
        .name(name() + ".num_vec_ops_mac16_executed")
        .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAC32
        .name(name() + ".num_vec_ops_mac32_executed")
        .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedMAC64
        .name(name() + ".num_vec_ops_mac64_executed")
        .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
        ;

    numVecOpsExecutedTwoOpFP
        .name(name() + ".num_vec_ops_two_op_fp_executed")
        .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
        ;

    totalCycles
        .name(name() + ".num_total_cycles")
        .desc("number of cycles the CU ran for")
        ;

    ipc
        .name(name() + ".ipc")
        .desc("Instructions per cycle (this CU only)")
        ;

    vpc
        .name(name() + ".vpc")
        .desc("Vector Operations per cycle (this CU only)")
        ;

    vpc_f16
        .name(name() + ".vpc_f16")
        .desc("F16 Vector Operations per cycle (this CU only)")
        ;

    vpc_f32
        .name(name() + ".vpc_f32")
        .desc("F32 Vector Operations per cycle (this CU only)")
        ;

    vpc_f64
        .name(name() + ".vpc_f64")
        .desc("F64 Vector Operations per cycle (this CU only)")
        ;

    numALUInstsExecuted
        .name(name() + ".num_alu_insts_executed")
        .desc("Number of dynamic non-GM memory insts executed")
        ;

    wgBlockedDueBarrierAllocation
        .name(name() + ".wg_blocked_due_barrier_alloc")
        .desc("WG dispatch was blocked due to lack of barrier resources")
        ;

    wgBlockedDueLdsAllocation
        .name(name() + ".wg_blocked_due_lds_alloc")
        .desc("Workgroup blocked due to LDS capacity")
        ;

    ipc = numInstrExecuted / totalCycles;
    vpc = numVecOpsExecuted / totalCycles;
    vpc_f16 = numVecOpsExecutedF16 / totalCycles;
    vpc_f32 = numVecOpsExecutedF32 / totalCycles;
    vpc_f64 = numVecOpsExecutedF64 / totalCycles;
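    // Note the relation between the two rates: vpc counts per-lane vector
    // ops, so vpc = ipc * (average vec ops per instruction). Illustrative
    // numbers: 1000 instructions averaging 48 active lanes over 4000
    // cycles give ipc = 0.25 and vpc = 12.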
    numTimesWgBlockedDueVgprAlloc
        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
        .desc("Number of times WGs are blocked due to VGPR allocation per "
              "SIMD")
        ;

    numTimesWgBlockedDueSgprAlloc
        .name(name() + ".times_wg_blocked_due_sgpr_alloc")
        .desc("Number of times WGs are blocked due to SGPR allocation per "
              "SIMD")
        ;

    dynamicGMemInstrCnt
        .name(name() + ".global_mem_instr_cnt")
        .desc("dynamic non-flat global memory instruction count")
        ;

    dynamicFlatMemInstrCnt
        .name(name() + ".flat_global_mem_instr_cnt")
        .desc("dynamic flat global memory instruction count")
        ;

    dynamicLMemInstrCnt
        .name(name() + ".local_mem_instr_cnt")
        .desc("dynamic local memory instruction count")
        ;

    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
        dynamicLMemInstrCnt;
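    // numALUInstsExecuted is derived rather than counted directly: every
    // executed instruction that is neither a global- nor a local-memory
    // instruction is classified as an ALU instruction.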
    completedWfs
        .name(name() + ".num_completed_wfs")
        .desc("number of completed wavefronts")
        ;

    completedWGs
        .name(name() + ".num_completed_wgs")
        .desc("number of completed workgroups")
        ;

    numCASOps
        .name(name() + ".num_CAS_ops")
        .desc("number of compare and swap operations")
        ;

    numFailedCASOps
        .name(name() + ".num_failed_CAS_ops")
        .desc("number of compare and swap operations that failed")
        ;

    headTailLatency
        .init(0, 1000000, 10000)
        .name(name() + ".head_tail_latency")
        .desc("ticks between first and last cache block arrival at coalescer")
        .flags(Stats::pdf | Stats::oneline)
        ;

    waveLevelParallelism
        .init(0, shader->n_wf * numVectorALUs, 1)
        .name(name() + ".wlp")
        .desc("wave level parallelism: count of active waves at wave launch")
        ;

    instInterleave
        .init(numVectorALUs, 0, 20, 1)
        .name(name() + ".interleaving")
        .desc("Measure of instruction interleaving per SIMD")
        ;

    // register stats of pipeline stages
    fetchStage.regStats();
    scoreboardCheckStage.regStats();
    scheduleStage.regStats();
    execStage.regStats();

    // register stats of memory pipelines
    globalMemoryPipe.regStats();
    localMemoryPipe.regStats();
    scalarMemoryPipe.regStats();

    registerManager->regStats();
}
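/**
 * Classify a dynamic instruction into the stat counters registered in
 * regStats(). Note the ordering of the tests below: FLAT instructions
 * are checked before plain local-memory instructions, so a FLAT access
 * that resolves to LDS counts toward flat_lds_insts rather than
 * lds_no_flat_insts, matching the stat descriptions above.
 */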
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            sALUInsts++;
            instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            shader->total_valu_insts++;
            if (shader->total_valu_insts == shader->max_valu_insts) {
                exitSimLoop("max vALU insts");
            }
            vALUInsts++;
            instCyclesVALU++;
            threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                flatLDSInsts++;
            } else {
                flatVMemInsts++;
            }
        } else if (gpuDynInst->isLocalMem()) {
            ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            vectorMemWrites++;
        }

        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
                spillReads++;
                break;
              case Enums::SC_GLOBAL:
                globalReads++;
                break;
              case Enums::SC_GROUP:
                groupReads++;
                break;
              case Enums::SC_PRIVATE:
                privReads++;
                break;
              case Enums::SC_READONLY:
                readonlyReads++;
                break;
              case Enums::SC_KERNARG:
                kernargReads++;
                break;
              case Enums::SC_ARG:
                argReads++;
                break;
              case Enums::SC_NONE:
                /**
                 * this case can occur for flat mem insts
                 * that execute with EXEC = 0
                 */
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
                break;
            }
        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
                spillWrites++;
                break;
              case Enums::SC_GLOBAL:
                globalWrites++;
                break;
              case Enums::SC_GROUP:
                groupWrites++;
                break;
              case Enums::SC_PRIVATE:
                privWrites++;
                break;
              case Enums::SC_READONLY:
                readonlyWrites++;
                break;
              case Enums::SC_KERNARG:
                kernargWrites++;
                break;
              case Enums::SC_ARG:
                argWrites++;
                break;
              case Enums::SC_NONE:
                /**
                 * this case can occur for flat mem insts
                 * that execute with EXEC = 0
                 */
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
                break;
            }
        }
    }
}
void
ComputeUnit::updatePageDivergenceDist(Addr addr)
{
    Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);

    if (!pagesTouched.count(virt_page_addr))
        pagesTouched[virt_page_addr] = 1;
    else
        pagesTouched[virt_page_addr]++;
}
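// Illustrative arithmetic for updatePageDivergenceDist() above: with
// 4 KiB x86 pages, roundDown(0x12345, TheISA::PageBytes) yields 0x12000,
// so every access falling in that page increments the same pagesTouched
// entry.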
void
ComputeUnit::exitCallback()
{
    if (countPages) {
        std::ostream *page_stat_file = simout.create(name().c_str())->stream();

        *page_stat_file << "page, wavefront accesses, workitem accesses" <<
            std::endl;

        for (auto iter : pageAccesses) {
            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << iter.second.first << ",";
            *page_stat_file << std::dec << iter.second.second << std::endl;
        }
    }
}
bool
ComputeUnit::isDone() const
{
    for (int i = 0; i < numVectorALUs; ++i) {
        if (!isVectorAluIdle(i)) {
            return false;
        }
    }

    // TODO: FIXME if more than 1 of any memory pipe supported
    if (!srfToScalarMemPipeBus.rdy()) {
        return false;
    }
    if (!vrfToGlobalMemPipeBus.rdy()) {
        return false;
    }
    if (!vrfToLocalMemPipeBus.rdy()) {
        return false;
    }

    if (!globalMemoryPipe.isGMReqFIFOWrRdy()
        || !localMemoryPipe.isLMReqFIFOWrRdy()
        || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
        !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
        return false;
    }

    return true;
}
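// Editorial note: isDone() above is deliberately conservative. The CU
// reports done only when every SIMD unit is idle and every memory pipe,
// FIFO, and bus is ready, i.e. nothing is still in flight anywhere in
// the unit.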
int32_t
ComputeUnit::getRefCounter(const uint32_t dispatchId,
                           const uint32_t wgId) const
{
    return lds.getRefCounter(dispatchId, wgId);
}
bool
ComputeUnit::isVectorAluIdle(uint32_t simdId) const
{
    assert(simdId < numVectorALUs);

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
        if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
            return false;
        }
    }

    return true;
}
/**
 * Send a general request to the LDS. Make sure to look at the return
 * value here, as your request might be NACK'd; a return of false means
 * you need a backup plan.
 */
bool
ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
{
    // this is just a request to carry the GPUDynInstPtr
    // back and forth
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);

    // ReadReq is not evaluated by the LDS but the Packet ctor requires this
    PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);

    // This is the SenderState needed upon return
    newPacket->senderState = new LDSPort::SenderState(gpuDynInst);

    return ldsPort.sendTimingReq(newPacket);
}
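// Hypothetical caller sketch (illustrative only; `issueLdsRequest` is
// not part of this file):
//
//     bool issueLdsRequest(ComputeUnit &cu, GPUDynInstPtr inst)
//     {
//         if (!cu.sendToLds(inst)) {
//             // NACK'd: the LDS port is stalled. The caller must hold
//             // on to `inst` and reissue after the port unstalls.
//             return false;
//         }
//         return true;
//     }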
/**
 * Get the result of packets sent to the LDS when they return.
 */
bool
ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
{
    const ComputeUnit::LDSPort::SenderState *senderState =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);

    fatal_if(!senderState, "did not get the right sort of sender state");

    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

    delete packet->senderState;
    delete packet;

    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
    return true;
}
/**
 * Attempt to send this packet: either the port is already stalled, the
 * request is NACK'd and must stall, or the request goes through. When a
 * request cannot be sent, add it to the retries queue.
 */
bool
ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
{
    ComputeUnit::LDSPort::SenderState *sender_state =
        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
    fatal_if(!sender_state, "packet without a valid sender state");

    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();

    if (isStalled()) {
        fatal_if(retries.empty(), "must have retries waiting to be stalled");

        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
        return false;
    } else if (!RequestPort::sendTimingReq(pkt)) {
        // need to stall the LDS port until a recvReqRetry() is received
        // this indicates that there is more space
        stallPort();
        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return false;
    } else {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return true;
    }
}
/**
 * The bus is telling the port that there is now space, so retrying
 * stalled requests should work now. This allows the port to have a
 * request be NACK'd and then have the receiver say when there is space,
 * rather than simply retrying the send every cycle.
 */
void
ComputeUnit::LDSPort::recvReqRetry()
{
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");

    unstallPort();

    while (!retries.empty()) {
        PacketPtr packet = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!RequestPort::sendTimingReq(packet)) {
            // Stall port
            stallPort();
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUPort, ": LDS send successful\n");
            retries.pop();
        }
    }
}