misc: Delete the now unnecessary create methods.
[gem5.git] / src / gpu-compute / wavefront.cc
/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/wavefront.hh"

#include "base/bitfield.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      barId(WFBarrier::InvalidID)
{
    lastTrace = 0;
    execUnitId = -1;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;
    outstandingReqs = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;
    scalarRdGmReqsInPipe = 0;
    scalarWrGmReqsInPipe = 0;
    scalarOutstandingReqsRdGm = 0;
    scalarOutstandingReqsWrGm = 0;
    lastNonIdleTick = 0;
    ldsChunk = nullptr;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p.wf_size);

    pendingFetch = false;
    dropFetch = false;
    maxVgprs = 0;
    maxSgprs = 0;

    lastAddr.resize(p.wf_size);
    workItemFlatId.resize(p.wf_size);
    oldDgpr.resize(p.wf_size);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p.wf_size);
    }

    _execMask.set();
    rawDist.clear();
    lastInstExec = 0;
    vecReads.clear();
}

void
Wavefront::regStats()
{
    SimObject::regStats();

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed by this WF slot")
        ;

    schCycles
        .name(name() + ".sch_cycles")
        .desc("number of cycles spent in schedule stage")
        ;

    schStalls
        .name(name() + ".sch_stalls")
        .desc("number of cycles WF is stalled in SCH stage")
        ;

    schRfAccessStalls
        .name(name() + ".sch_rf_access_stalls")
        .desc("number of cycles wave selected in SCH but RF denied adding "
              "instruction")
        ;

    schResourceStalls
        .name(name() + ".sch_resource_stalls")
        .desc("number of cycles stalled in SCH due to unavailable resources")
        ;

    schOpdNrdyStalls
        .name(name() + ".sch_opd_nrdy_stalls")
        .desc("number of cycles stalled in SCH waiting for RF reads to "
              "complete")
        ;

    schLdsArbStalls
        .name(name() + ".sch_lds_arb_stalls")
        .desc("number of cycles wave stalled due to LDS-VRF arbitration")
        ;

    vecRawDistance
        .init(0, 20, 1)
        .name(name() + ".vec_raw_distance")
        .desc("Count of RAW distance in dynamic instructions for this WF")
        ;

    readsPerWrite
        .init(0, 4, 1)
        .name(name() + ".vec_reads_per_write")
        .desc("Count of vector reads per write for this WF")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;

    scalarAlu = computeUnit->mapWaveToScalarAlu(this);
    scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
    globalMem = computeUnit->mapWaveToGlobalMem(this);
    localMem = computeUnit->mapWaveToLocalMem(this);
    scalarMem = computeUnit->mapWaveToScalarMem(this);
}

void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    int regInitIdx = 0;

    // iterate over all the init fields and check which
    // bits are enabled
    for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {

        if (task->sgprBitEnabled(en_bit)) {
            int physSgprIdx = 0;
            uint32_t wiCount = 0;
            uint32_t firstWave = 0;
            int orderedAppendTerm = 0;
            int numWfsInWg = 0;
            uint32_t finalValue = 0;
            Addr host_disp_pkt_addr = task->hostDispPktAddr();
            Addr kernarg_addr = task->kernargAddr();
            Addr hidden_priv_base(0);

            switch (en_bit) {
              case PrivateSegBuf:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                break;
              case DispatchPtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));

                ++regInitIdx;
                break;
              case QueuePtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));

                ++regInitIdx;
                break;
              case KernargSegPtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 63, 32));

                ++regInitIdx;
                break;
              case FlatScratchInit:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                    (TheGpuISA::ScalarRegU32)(task->amdQueue
                        .scratch_backing_memory_location & 0xffffffff));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch Addr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value must be DWORD (4-byte) aligned.
                computeUnit->srf[simdId]->write(physSgprIdx,
                    task->amdQueue.scratch_workitem_byte_size);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);
                /**
                 * Since flat scratch init is needed for this kernel, this
                 * kernel is going to have flat memory instructions and we
                 * need to initialize the hidden private base for this queue.
                 * scratch_resource_descriptor[0] has this queue's scratch
                 * base address. scratch_backing_memory_location has the
                 * offset to this queue's scratch base address from the
                 * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
                 * queue's scratch base address for address calculation
                 * (stored in scratch_resource_descriptor[0]). But that
                 * address calculation should be done by first finding the
                 * queue's scratch base address using the calculation
                 * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
                 * SH_HIDDEN_PRIVATE_BASE_VMID.
                 *
                 * For more details see:
                 * http://rocm-documentation.readthedocs.io/en/latest/
                 * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
                 *
                 * https://github.com/ROCm-Developer-Tools/
                 * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
                 * #flat-addressing
                 */
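                // Assemble the 48-bit scratch base address: bits [31:0]
                // come from word 0 of the scratch resource descriptor,
                // and bits [47:32] from the low 16 bits of word 1.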
                hidden_priv_base =
                    (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
                    (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
                    & 0x000000000000ffff) << 32);
                computeUnit->shader->initShHiddenPrivateBase(
                        hidden_priv_base,
                        task->amdQueue.scratch_backing_memory_location);
                break;
              case GridWorkgroupCountX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
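                // the SGPR receives the workgroup count in this dimension,
                // i.e. ceil(gridSize / wgSize); the expression below
                // performs the ceiling division in integer arithmetic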
                wiCount = ((task->gridSize(0) +
                            task->wgSize(0) - 1) /
                           task->wgSize(0));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(1) +
                            task->wgSize(1) - 1) /
                           task->wgSize(1));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(2) +
                            task->wgSize(2) - 1) /
                           task->wgSize(2));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case WorkgroupIdX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[0]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
                break;
              case WorkgroupIdY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[1]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
                break;
              case WorkgroupIdZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[2]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
              case PrivSegWaveByteOffset:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                /**
                 * the compute_tmpring_size_wavesize specifies the number of
                 * kB allocated per wavefront, hence the multiplication by
                 * 1024.
                 *
                 * to get the per wavefront offset into the scratch
                 * memory, we also multiply this by the wfId. the wfId stored
                 * in the Wavefront class, however, is the wave ID within the
                 * WG, whereas here we need the global WFID because the
                 * scratch space will be divided amongst all waves in the
                 * kernel. to get the global ID we multiply the WGID by
                 * the number of waves in a WG (wgSz / 64), then add the
                 * WFID of the wave within its WG.
                 */
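                // for example (assuming 64-lane waves), a 256-work-item WG
                // has wgSz / 64 = 4 waves; the wave with wfId 2 in WG 1 has
                // global wave ID 1 * 4 + 2 = 6, so its byte offset is
                // 6 * 1024 * compute_tmpring_size_wavesize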
                computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
                    (wgId * (wgSz / 64) + wfId) *
                    task->amdQueue.compute_tmpring_size_wavesize);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting Private Seg Offset: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        1024 * (wgId * (wgSz / 64) + wfId) *
                        task->amdQueue.compute_tmpring_size_wavesize);
                break;
              case WorkgroupInfo:
                firstWave = (wfId == 0) ? 1 : 0;
                numWfsInWg = divCeil(wgSizeInWorkItems,
                                     computeUnit->wfSize());
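                // pack the workgroup info word: bit 31 flags the first
                // wave of the WG, the ordered append term starts at bit 6,
                // and the low bits hold the number of WFs in the WG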
                finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
                finalValue |= (orderedAppendTerm << 6);
                finalValue |= numWfsInWg;
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->
                    write(physSgprIdx, finalValue);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG Info: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, finalValue);
                break;
              default:
                fatal("SGPR enable bit %i not supported\n", en_bit);
                break;
            }
        }
    }

    regInitIdx = 0;

    // iterate over all the init fields and check which
    // bits are enabled
    for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
        if (task->vgprBitEnabled(en_bit)) {
            uint32_t physVgprIdx = 0;
            TheGpuISA::VecRegContainerU32 raw_vgpr;

            switch (en_bit) {
              case WorkitemIdX:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecRegU32 vgpr_x
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
                        vgpr_x[lane] = workItemId[0][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdY:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecRegU32 vgpr_y
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                        vgpr_y[lane] = workItemId[1][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdZ:
                {
                    physVgprIdx = computeUnit->registerManager->
                        mapVgpr(this, regInitIdx);
                    TheGpuISA::VecRegU32 vgpr_z
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                        vgpr_z[lane] = workItemId[2][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
            }
        }
    }
}

void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
    maxVgprs = num_vregs;
    maxSgprs = num_sregs;
}

Wavefront::~Wavefront()
{
}

void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT || status == S_BARRIER) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, computeUnit->idleCUTimeout,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}

void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
    wfDynId = _wf_dyn_id;
    _pc = init_pc;

    status = S_RUNNING;

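    // one read counter per architected VGPR; consumed by the
    // readsPerWrite statistic in exec()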
    vecReads.resize(maxVgprs, 0);
}

bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) {
        return true;
    }

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstWaitcnt()
{
    if (instructionBuffer.empty())
        return false;

    GPUDynInstPtr ii = instructionBuffer.front();

    if (ii->isWaitcnt()) {
        // waitcnt is a scalar
        assert(ii->isScalar());
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstScalarALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
        || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
        (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstVectorALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
        || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstScalarMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

bool
Wavefront::stopFetch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;
        if (ii->isReturn() || ii->isBranch() ||
            ii->isEndOfKernel()) {
            return true;
        }
    }

    return false;
}

void
Wavefront::freeResources()
{
    execUnitId = -1;
}

void
Wavefront::validateRequestCounters()
{
    panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
             wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
             outstandingReqs < 0,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}

void
Wavefront::reserveGmResource(GPUDynInstPtr ii)
{
    if (!ii->isScalar()) {
        if (ii->isLoad()) {
            rdGmReqsInPipe++;
        } else if (ii->isStore()) {
            wrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            rdGmReqsInPipe++;
            wrGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = globalMem;
    } else {
        if (ii->isLoad()) {
            scalarRdGmReqsInPipe++;
        } else if (ii->isStore()) {
            scalarWrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            scalarWrGmReqsInPipe++;
            scalarRdGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = scalarMem;
    }
}

void
Wavefront::reserveLmResource(GPUDynInstPtr ii)
{
    fatal_if(ii->isScalar(),
             "Scalar instructions cannot access shared (LDS) memory!");
    if (ii->isLoad()) {
        rdLmReqsInPipe++;
    } else if (ii->isStore()) {
        wrLmReqsInPipe++;
    } else if (ii->isAtomic() || ii->isMemSync()) {
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
    } else {
        panic("Invalid memory operation!\n");
    }
    execUnitId = localMem;
}

std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to the schedule stage;
    // this return value is only used for debugging and an assertion
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAluGlobalIdx;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        reserveLmResource(ii);
        // add the execUnitId reserved by reserveLmResource to the list
        // before it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions cannot access private memory!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}

void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        status == S_STALLED || instructionBuffer.empty()) {
        return;
    }

    if (status == S_WAITCNT) {
        /**
         * if this wave is in S_WAITCNT state, then
         * it should enter exec() precisely one time
         * before the waitcnts are satisfied, in order
         * to execute the waitcnt instruction itself
         * thus we assert that the waitcnt is the
         * oldest instruction. if we enter exec() with
         * active waitcnts, and we're not executing
         * the waitcnt instruction, something must be
         * wrong
         */
        assert(isOldestInstWaitcnt());
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const Addr old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
            wfDynId, ii->disassemble(), old_pc, ii->seqNum());

    ii->execute(ii);
    // delete the dynamic instruction from the pipeline map
    computeUnit->deleteFromPipeMap(this);
    // update the instruction stats in the CU
    computeUnit->updateInstStats(ii);

    // inform VRF of instruction execution to schedule write-back
    // and scoreboard ready for registers
    if (!ii->isScalar()) {
        computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
    }
    computeUnit->srf[simdId]->waveExecuteInst(this, ii);

    computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
    computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
    computeUnit->numInstrExecuted++;
    numInstrExecuted++;
    computeUnit->instExecPerSimd[simdId]++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();

    if (lastInstExec) {
        computeUnit->instInterleave[simdId].
            sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
    }
    lastInstExec = computeUnit->instExecPerSimd[simdId];

    // want to track:
    // number of reads that occur per value written

    // vector RAW dependency tracking
    for (int i = 0; i < ii->getNumOperands(); i++) {
        if (ii->isVectorRegister(i)) {
            int vgpr = ii->getRegisterIndex(i, ii);
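            // operands wider than 4 bytes occupy multiple consecutive
            // VGPRs, one register per DWORD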
            int nReg = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int n = 0; n < nReg; n++) {
                if (ii->isSrcOperand(i)) {
                    // This check should never fail, but to be safe we check
                    if (rawDist.find(vgpr+n) != rawDist.end()) {
                        vecRawDistance.
                            sample(numInstrExecuted.value() - rawDist[vgpr+n]);
                    }
                    // increment number of reads to this register
                    vecReads[vgpr+n]++;
                } else if (ii->isDstOperand(i)) {
                    // rawDist is set on writes, but will not be set
                    // for the first write to each physical register
                    if (rawDist.find(vgpr+n) != rawDist.end()) {
                        // sample the number of reads that were performed
                        readsPerWrite.sample(vecReads[vgpr+n]);
                    }
                    // on a write, reset count of reads to 0
                    vecReads[vgpr+n] = 0;

                    rawDist[vgpr+n] = numInstrExecuted.value();
                }
            }
        }
    }

    if (pc() == old_pc) {
        // PC not modified by instruction, proceed to next
        _gpuISA.advancePC(ii);
        instructionBuffer.pop_front();
    } else {
        DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                ii->disassemble());
        discardFetch();
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;

        if (ii->isF16() && ii->isALU()) {
            if (ii->isF32() || ii->isF64()) {
                fatal("Instruction is tagged as both (1) F16, and (2) "
                      "either F32 or F64.");
            }
            computeUnit->numVecOpsExecutedF16 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAC()) {
                computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAD()) {
                computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            }
        }
        if (ii->isF32() && ii->isALU()) {
            if (ii->isF16() || ii->isF64()) {
                fatal("Instruction is tagged as both (1) F32, and (2) "
                      "either F16 or F64.");
            }
            computeUnit->numVecOpsExecutedF32 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAC()) {
                computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAD()) {
                computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            }
        }
        if (ii->isF64() && ii->isALU()) {
            if (ii->isF16() || ii->isF32()) {
                fatal("Instruction is tagged as both (1) F64, and (2) "
                      "either F16 or F32.");
            }
            computeUnit->numVecOpsExecutedF64 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAC()) {
                computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAD()) {
                computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            }
        }
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    /**
     * we return here to avoid spurious errors related to flat insts
     * and their address segment resolution.
     */
    if (execMask().none() && ii->isFlat()) {
        computeUnit->getTokenManager()->recvTokens(1);
        return;
    }

    // Update Vector ALU pipeline and other resources
    bool flat_as_gm = false;
    bool flat_as_lm = false;
    if (ii->isFlat()) {
        flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) ||
                     (ii->executedAs() == Enums::SC_PRIVATE);
        flat_as_lm = (ii->executedAs() == Enums::SC_GROUP);
    }

    // Single precision ALU or Branch or Return or Special instruction
    // Note: we use the same timing regardless of SP or DP ALU operation.
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        if (!ii->isScalar()) {
            computeUnit->vectorALUs[simdId].set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
        } else {
            computeUnit->scalarALUs[scalarAlu].set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
        }
        // Barrier on Scalar ALU
    } else if (ii->isBarrier()) {
        computeUnit->scalarALUs[scalarAlu].set(computeUnit->
            cyclesToTicks(computeUnit->issuePeriod));
        // GM or Flat as GM Load
    } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(
                computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesVMemPerSimd[simdId] +=
                computeUnit->vrf_gm_bus_latency;
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(computeUnit->srf_scm_bus_latency));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesScMemPerSimd[simdId] +=
                computeUnit->srf_scm_bus_latency;
        }
        // GM or Flat as GM Store
    } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
        // GM or Flat as GM, Atomic or MemFence
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
        // LM or Flat as LM Load
    } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(computeUnit->vrf_lm_bus_latency));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->instCyclesLdsPerSimd[simdId] +=
            computeUnit->vrf_lm_bus_latency;
        // LM or Flat as LM Store
    } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
        // LM or Flat as LM, Atomic or MemFence
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
    } else {
        panic("Bad instruction type!\n");
    }
}

GPUDynInstPtr
Wavefront::nextInstr()
{
    // Read next instruction from instruction buffer
    GPUDynInstPtr ii = instructionBuffer.front();
    // if the WF has been dispatched in the schedule stage then
    // check the next oldest instruction for readiness
    if (computeUnit->pipeMap.find(ii->seqNum()) !=
        computeUnit->pipeMap.end()) {
        if (instructionBuffer.size() > 1) {
            auto it = instructionBuffer.begin() + 1;
            return *it;
        } else { // No new instructions to check
            return nullptr;
        }
    }
    return ii;
}

void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
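    // if a fetch request is already in flight, flag it so its returned
    // instructions are dropped rather than buffered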

    /**
     * clear the fetch buffer for this wave in order to
     * remove any stale inst data
     */
    computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}

bool
Wavefront::waitCntsSatisfied()
{
    // All counts uninitialized (-1) means the waitcnt instruction has
    // been dispatched but not executed yet: the next instruction should
    // be blocked until the waitcnt executes and sets the counts.
    if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
        return false;
    }

    /**
     * If we reach here, that means an s_waitcnt instruction was executed
     * and the waitcnts are set by the execute method. Check if waitcnts
     * are satisfied.
     */
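    // e.g., s_waitcnt vmcnt(0) sets vmWaitCnt to 0, so the wave stays
    // blocked until every issued vector memory instruction has completed
    // and vmemInstsIssued has drained back to 0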
    if (vmWaitCnt != -1) {
        if (vmemInstsIssued > vmWaitCnt) {
            // vmWaitCnt not satisfied
            return false;
        }
    }

    if (expWaitCnt != -1) {
        if (expInstsIssued > expWaitCnt) {
            // expWaitCnt not satisfied
            return false;
        }
    }

    if (lgkmWaitCnt != -1) {
        if (lgkmInstsIssued > lgkmWaitCnt) {
            // lgkmWaitCnt not satisfied
            return false;
        }
    }

    // if we get here all outstanding waitcnts must
    // be satisfied, so we resume normal operation
    clearWaitCnts();

    return true;
}

void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // the scoreboard should have set the status
    // to S_WAITCNT once a waitcnt instruction
    // was marked as ready
    assert(status == S_WAITCNT);

    // waitcnt instruction shouldn't be sending
    // negative counts
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // the encoded count fields are narrow: vm_cnt has 4 bits, exp_cnt
    // 3 bits, and lgkm_cnt 5 bits, hence the upper bounds below
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);

    /**
     * prior waitcnts should be satisfied,
     * at which time the WF resets them
     * back to -1, indicating they are no
     * longer active
     */
    assert(vmWaitCnt == -1);
    assert(expWaitCnt == -1);
    assert(lgkmWaitCnt == -1);

    /**
     * a count equal to its field's maximum encoding (0xf, 0x7, or
     * 0x1f, respectively) means that counter is not used by this
     * waitcnt and is left inactive
     */
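    // e.g., an s_waitcnt vmcnt(0) lgkmcnt(0) that leaves the exp count
    // at its maximum encoding would (assuming the decoder passes the raw
    // fields through) arrive here as setWaitCnts(0, 0x7, 0): expWaitCnt
    // stays inactive while vmWaitCnt and lgkmWaitCnt are armed with 0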
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
}

void
Wavefront::clearWaitCnts()
{
    // reset the waitcnts back to
    // -1, indicating they are no
    // longer valid
    vmWaitCnt = -1;
    expWaitCnt = -1;
    lgkmWaitCnt = -1;

    // resume running normally
    status = S_RUNNING;
}

void
Wavefront::incVMemInstsIssued()
{
    ++vmemInstsIssued;
}

void
Wavefront::incExpInstsIssued()
{
    ++expInstsIssued;
}

void
Wavefront::incLGKMInstsIssued()
{
    ++lgkmInstsIssued;
}

void
Wavefront::decVMemInstsIssued()
{
    --vmemInstsIssued;
}

void
Wavefront::decExpInstsIssued()
{
    --expInstsIssued;
}

void
Wavefront::decLGKMInstsIssued()
{
    --lgkmInstsIssued;
}

Addr
Wavefront::pc() const
{
    return _pc;
}

void
Wavefront::pc(Addr new_pc)
{
    _pc = new_pc;
}

VectorMask&
Wavefront::execMask()
{
    return _execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return _execMask[lane];
}

void
Wavefront::freeRegisterFile()
{
    /* clear busy registers */
    for (int i = 0; i < maxVgprs; i++) {
        int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
        computeUnit->vrf[simdId]->markReg(vgprIdx, false);
    }

    /* Free registers used by this wavefront */
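    // the VRF pool is allocated circularly, so the reserved region may
    // wrap around the end of the register file, hence the modulo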
    uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
        computeUnit->vrf[simdId]->numRegs();
    computeUnit->registerManager->vrfPoolMgrs[simdId]->
        freeRegion(startVgprIndex, endIndex);
}

void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
    actualWgSzTotal = 1;
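    // workgroups on the grid boundary may be partial in any dimension,
    // so clamp each dimension to whatever remains of the grid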
    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
                                 - task->wgId(d) * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}

void
Wavefront::barrierId(int bar_id)
{
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());
    barId = bar_id;
}

int
Wavefront::barrierId() const
{
    return barId;
}

bool
Wavefront::hasBarrier() const
{
    return barId > WFBarrier::InvalidID;
}

void
Wavefront::releaseBarrier()
{
    barId = WFBarrier::InvalidID;
}