/*
 * Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
34 #include "gpu-compute/fetch_unit.hh"
36 #include "debug/GPUFetch.hh"
37 #include "debug/GPUPort.hh"
38 #include "debug/GPUTLB.hh"
39 #include "gpu-compute/compute_unit.hh"
40 #include "gpu-compute/gpu_dyn_inst.hh"
41 #include "gpu-compute/gpu_static_inst.hh"
42 #include "gpu-compute/shader.hh"
43 #include "gpu-compute/wavefront.hh"
44 #include "mem/ruby/system/RubySystem.hh"
// Definition of the static member declared in fetch_unit.hh.
uint32_t FetchUnit::globalFetchUnitID;
48 FetchUnit::FetchUnit(const ComputeUnitParams
*p
, ComputeUnit
&cu
)
49 : timingSim(true), computeUnit(cu
), fetchScheduler(p
),
50 waveList(nullptr), fetchDepth(p
->fetch_depth
)
54 FetchUnit::~FetchUnit()
57 fetchStatusQueue
.clear();
63 timingSim
= computeUnit
.shader
->timingSim
;
65 fetchStatusQueue
.resize(computeUnit
.shader
->n_wf
);
66 fetchBuf
.resize(computeUnit
.shader
->n_wf
, FetchBufDesc());
68 for (int i
= 0; i
< computeUnit
.shader
->n_wf
; ++i
) {
69 Wavefront
*wf
= waveList
->at(i
);
70 assert(wf
->wfSlotId
== i
);
71 fetchStatusQueue
[i
] = std::make_pair(wf
, false);
72 fetchBuf
[i
].allocateBuf(fetchDepth
, computeUnit
.cacheLineSize(), wf
);
73 fetchBuf
[i
].decoder(&decoder
);
76 fetchScheduler
.bindList(&fetchQueue
);
83 * now we check if any of the fetch buffers have
84 * buffered instruction data that can be decoded
85 * and sent to its wavefront's instruction buffer.
86 * then we check if any of the fetch buffer entries
87 * can be released. we only check if we can
90 for (auto &fetch_buf
: fetchBuf
) {
91 if (!fetch_buf
.hasFreeSpace()) {
92 fetch_buf
.checkWaveReleaseBuf();
94 if (fetch_buf
.hasFetchDataToProcess()) {
95 fetch_buf
.decodeInsts();
99 // re-evaluate waves which are marked as not ready for fetch
100 for (int j
= 0; j
< computeUnit
.shader
->n_wf
; ++j
) {
101 // Following code assumes 64-bit opertaion and all insts are
102 // represented by 64-bit pointers to inst objects.
103 Wavefront
*curWave
= fetchStatusQueue
[j
].first
;
106 // The wavefront has to be active, the IB occupancy has to be
107 // 4 or less instructions and it can not have any branches to
108 // prevent speculative instruction fetches
109 if (!fetchStatusQueue
[j
].second
) {
110 if ((curWave
->getStatus() == Wavefront::S_RUNNING
||
111 curWave
->getStatus() == Wavefront::S_WAITCNT
) &&
112 fetchBuf
[j
].hasFreeSpace() &&
113 !curWave
->stopFetch() &&
114 !curWave
->pendingFetch
) {
115 fetchQueue
.push_back(curWave
);
116 fetchStatusQueue
[j
].second
= true;
121 // Fetch only if there is some wave ready to be fetched
122 // An empty fetchQueue will cause the schedular to panic
123 if (fetchQueue
.size()) {
124 Wavefront
*waveToBeFetched
= fetchScheduler
.chooseWave();
125 waveToBeFetched
->pendingFetch
= true;
126 fetchStatusQueue
[waveToBeFetched
->wfSlotId
].second
= false;
127 initiateFetch(waveToBeFetched
);
132 FetchUnit::initiateFetch(Wavefront
*wavefront
)
134 assert(fetchBuf
.at(wavefront
->wfSlotId
).hasFreeSpace());
137 * calculate the virtual address to fetch from the SQC. the fetch
138 * buffer holds a configurable number of cache lines. we start
139 * fetching at the address of the cache line immediately following
140 * the buffered line(s).
142 Addr vaddr
= fetchBuf
.at(wavefront
->wfSlotId
).nextFetchAddr();
144 // this should already be aligned to a cache line
145 assert(vaddr
== makeLineAddress(vaddr
,
146 computeUnit
.getCacheLineBits()));
148 // shouldn't be fetching a line that is already buffered
149 assert(!fetchBuf
.at(wavefront
->wfSlotId
).pcBuffered(vaddr
));
151 fetchBuf
.at(wavefront
->wfSlotId
).reserveBuf(vaddr
);
153 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
154 "from pc: %d %#x\n", computeUnit
.cu_id
, wavefront
->simdId
,
155 wavefront
->wfSlotId
, wavefront
->wfDynId
, wavefront
->pc(), vaddr
);
157 DPRINTF(GPUTLB
, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
158 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
, vaddr
);
160 // set up virtual request
161 RequestPtr req
= std::make_shared
<Request
>(
162 vaddr
, computeUnit
.cacheLineSize(), Request::INST_FETCH
,
163 computeUnit
.masterId(), 0, 0, nullptr);
165 PacketPtr pkt
= new Packet(req
, MemCmd::ReadReq
);
168 // SenderState needed on Return
169 pkt
->senderState
= new ComputeUnit::ITLBPort::SenderState(wavefront
);
171 // Sender State needed by TLB hierarchy
173 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute
,
174 computeUnit
.shader
->gpuTc
,
175 false, pkt
->senderState
);
177 if (computeUnit
.sqcTLBPort
.isStalled()) {
178 assert(computeUnit
.sqcTLBPort
.retries
.size() > 0);
180 DPRINTF(GPUTLB
, "Failed to send TLB req for FETCH addr %#x\n",
183 computeUnit
.sqcTLBPort
.retries
.push_back(pkt
);
184 } else if (!computeUnit
.sqcTLBPort
.sendTimingReq(pkt
)) {
185 // Stall the data port;
186 // No more packet is issued till
187 // ruby indicates resources are freed by
188 // a recvReqRetry() call back on this port.
189 computeUnit
.sqcTLBPort
.stallPort();
191 DPRINTF(GPUTLB
, "Failed to send TLB req for FETCH addr %#x\n",
194 computeUnit
.sqcTLBPort
.retries
.push_back(pkt
);
196 DPRINTF(GPUTLB
, "sent FETCH translation request for %#x\n", vaddr
);
200 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute
,
201 computeUnit
.shader
->gpuTc
);
203 computeUnit
.sqcTLBPort
.sendFunctional(pkt
);
205 TheISA::GpuTLB::TranslationState
*sender_state
=
206 safe_cast
<TheISA::GpuTLB::TranslationState
*>(pkt
->senderState
);
208 delete sender_state
->tlbEntry
;
210 // fetch the instructions from the SQC when we operate in
211 // functional mode only
212 fetch(pkt
, wavefront
);
217 FetchUnit::fetch(PacketPtr pkt
, Wavefront
*wavefront
)
219 assert(pkt
->req
->hasPaddr());
220 assert(pkt
->req
->hasSize());
222 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
223 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
224 pkt
->req
->getPaddr());
227 * this is necessary because the GPU TLB receives packets instead of
228 * requests. when the translation is complete, all relevent fields in
229 * the request will be populated, but not in the packet. here we create
230 * the new packet so we can set the size, addr, and proper flags.
232 PacketPtr oldPkt
= pkt
;
233 pkt
= new Packet(oldPkt
->req
, oldPkt
->cmd
);
237 * if we have not reserved an entry in the fetch buffer,
238 * stop fetching. this can happen due to a branch instruction
239 * flushing the fetch buffer while an ITLB or I-cache request is still
240 * pending, in the same cycle another instruction is trying to fetch.
242 if (!fetchBuf
.at(wavefront
->wfSlotId
).isReserved(pkt
->req
->getVaddr())) {
247 * we should have reserved an entry in the fetch buffer
248 * for this cache line. here we get the pointer to the
249 * entry used to buffer this request's line data.
251 pkt
->dataStatic(fetchBuf
.at(wavefront
->wfSlotId
)
252 .reservedBuf(pkt
->req
->getVaddr()));
254 // New SenderState for the memory access
255 pkt
->senderState
= new ComputeUnit::SQCPort::SenderState(wavefront
);
258 // translation is done. Send the appropriate timing memory request.
260 if (!computeUnit
.sqcPort
.sendTimingReq(pkt
)) {
261 computeUnit
.sqcPort
.retries
.push_back(std::make_pair(pkt
,
264 DPRINTF(GPUPort
, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
265 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
266 pkt
->req
->getPaddr());
268 DPRINTF(GPUPort
, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
269 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
270 pkt
->req
->getPaddr());
273 computeUnit
.sqcPort
.sendFunctional(pkt
);
274 processFetchReturn(pkt
);
279 FetchUnit::processFetchReturn(PacketPtr pkt
)
281 ComputeUnit::SQCPort::SenderState
*sender_state
=
282 safe_cast
<ComputeUnit::SQCPort::SenderState
*>(pkt
->senderState
);
284 Wavefront
*wavefront
= sender_state
->wavefront
;
286 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
287 "%d bytes!\n", computeUnit
.cu_id
, wavefront
->simdId
,
288 wavefront
->wfSlotId
, pkt
->req
->getPaddr(), pkt
->req
->getSize());
290 if (wavefront
->dropFetch
) {
291 assert(wavefront
->instructionBuffer
.empty());
292 assert(!fetchBuf
.at(wavefront
->wfSlotId
).hasFetchDataToProcess());
293 wavefront
->dropFetch
= false;
295 fetchBuf
.at(wavefront
->wfSlotId
).fetchDone(pkt
->req
->getVaddr());
298 wavefront
->pendingFetch
= false;
300 delete pkt
->senderState
;
305 FetchUnit::flushBuf(int wfSlotId
)
307 fetchBuf
.at(wfSlotId
).flushBuf();
311 FetchUnit::bindWaveList(std::vector
<Wavefront
*> *wave_list
)
313 waveList
= wave_list
;
318 FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth
, int cache_line_size
,
322 fetchDepth
= fetch_depth
;
323 maxIbSize
= wavefront
->maxIbSize
;
324 cacheLineSize
= cache_line_size
;
325 maxFbSize
= cacheLineSize
* fetchDepth
;
327 // Calculate the number of bits to address a cache line
328 panic_if(!isPowerOf2(cacheLineSize
),
329 "Cache line size should be a power of two.");
330 cacheLineBits
= floorLog2(cacheLineSize
);
332 bufStart
= new uint8_t[maxFbSize
];
334 bufEnd
= bufStart
+ maxFbSize
;
336 for (int i
= 0; i
< fetchDepth
; ++i
) {
337 freeList
.emplace_back(readPtr
+ i
* cacheLineSize
);
342 FetchUnit::FetchBufDesc::flushBuf()
344 restartFromBranch
= true;
346 * free list may have some entries
347 * so we clear it here to avoid duplicates
354 for (int i
= 0; i
< fetchDepth
; ++i
) {
355 freeList
.push_back(bufStart
+ i
* cacheLineSize
);
358 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
359 "buffer\n", wavefront
->simdId
, wavefront
->wfSlotId
,
364 FetchUnit::FetchBufDesc::nextFetchAddr()
368 if (bufferedAndReservedLines()) {
369 Addr last_line_fetched
= 0;
370 if (!reservedLines()) {
372 * get the PC of the most recently fetched cache line,
373 * then return the address of the next line.
375 last_line_fetched
= bufferedPCs
.rbegin()->first
;
377 last_line_fetched
= reservedPCs
.rbegin()->first
;
380 next_line
= last_line_fetched
+ cacheLineSize
;
383 * should not be trying to fetch a line that has already
386 assert(bufferedPCs
.find(next_line
) == bufferedPCs
.end());
387 assert(reservedPCs
.find(next_line
) == reservedPCs
.end());
390 * we do not have any buffered cache lines yet, so we
391 * assume this is the initial fetch, or the first fetch
392 * after a branch, and get the PC directly from the WF.
393 * in the case of a branch, we may not start at the
394 * beginning of a cache line, so we adjust the readPtr by
395 * the current PC's offset from the start of the line.
397 next_line
= makeLineAddress(wavefront
->pc(), cacheLineBits
);
401 * if we are here we have no buffered lines. in the case we flushed
402 * the buffer due to a branch, we may need to start fetching from
403 * some offset from the start of the fetch buffer, so we adjust for
406 if (restartFromBranch
) {
407 restartFromBranch
= false;
409 = wavefront
->pc() - makeLineAddress(wavefront
->pc(),
411 readPtr
+= byte_offset
;
419 FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr
)
421 // we should have free buffer space, and the line
422 // at vaddr should not already be cached.
423 assert(hasFreeSpace());
424 assert(bufferedPCs
.find(vaddr
) == bufferedPCs
.end());
425 assert(reservedPCs
.find(vaddr
) == reservedPCs
.end());
426 assert(bufferedAndReservedLines() < fetchDepth
);
428 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d reserved fetch buffer entry "
429 "for PC = %#x\n", wavefront
->simdId
, wavefront
->wfSlotId
,
430 wavefront
->wfDynId
, vaddr
);
433 * we reserve buffer space, by moving it out of the
434 * free list, however we do not mark the buffered
435 * line as valid until the fetch unit for this buffer
436 * has receieved the response from the memory system.
438 uint8_t *inst_buf
= freeList
.front();
439 reservedPCs
.emplace(vaddr
, inst_buf
);
440 freeList
.pop_front();
444 FetchUnit::FetchBufDesc::fetchDone(Addr vaddr
)
446 assert(bufferedPCs
.find(vaddr
) == bufferedPCs
.end());
447 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
448 wavefront
->simdId
, wavefront
->wfSlotId
,
449 wavefront
->wfDynId
, vaddr
);
452 * this address should have an entry reserved in the
453 * fetch buffer already, however it should be invalid
454 * until the fetch completes.
456 auto reserved_pc
= reservedPCs
.find(vaddr
);
457 assert(reserved_pc
!= reservedPCs
.end());
458 bufferedPCs
.emplace(vaddr
, reserved_pc
->second
);
460 if (readPtr
== bufEnd
) {
464 reserved_pc
->second
= nullptr;
465 reservedPCs
.erase(reserved_pc
);
469 FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
471 return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst
);
475 FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
477 Addr cur_wave_pc
= roundDown(wavefront
->pc(),
478 wavefront
->computeUnit
->cacheLineSize());
479 if (reservedPCs
.find(cur_wave_pc
) != reservedPCs
.end()) {
480 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d current wave PC(%#x) still "
481 "being fetched.\n", wavefront
->simdId
, wavefront
->wfSlotId
,
482 wavefront
->wfDynId
, cur_wave_pc
);
484 // should be reserved, but not buffered yet
485 assert(bufferedPCs
.find(cur_wave_pc
) == bufferedPCs
.end());
490 auto current_buffered_pc
= bufferedPCs
.find(cur_wave_pc
);
491 auto oldest_buffered_pc
= bufferedPCs
.begin();
493 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
494 "(PC = %#x) can be released.\n", wavefront
->simdId
,
495 wavefront
->wfSlotId
, wavefront
->wfDynId
, cur_wave_pc
,
500 for (const auto &buf_pc
: bufferedPCs
) {
501 DPRINTF(GPUFetch
, "PC[%d] = %#x\n", idx
, buf_pc
.first
);
506 // if we haven't buffered data for this PC, we shouldn't
507 // be fetching from it.
508 assert(current_buffered_pc
!= bufferedPCs
.end());
511 * we're using a std::map so the addresses are sorted. if this
512 * PC is not the oldest one in the map, we must be fetching from
513 * a newer block, and we can release the oldest PC's fetch buffer
514 * entry back to the free list.
516 if (current_buffered_pc
!= oldest_buffered_pc
) {
517 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
518 "removing it from the fetch buffer.\n", wavefront
->simdId
,
519 wavefront
->wfSlotId
, wavefront
->wfDynId
,
520 oldest_buffered_pc
->first
);
522 freeList
.emplace_back(oldest_buffered_pc
->second
);
523 oldest_buffered_pc
->second
= nullptr;
524 bufferedPCs
.erase(oldest_buffered_pc
);
525 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d has %d lines buffered.\n",
526 wavefront
->simdId
, wavefront
->wfSlotId
, wavefront
->wfDynId
,
532 FetchUnit::FetchBufDesc::decodeInsts()
540 while (wavefront
->instructionBuffer
.size() < maxIbSize
541 && hasFetchDataToProcess()) {
545 TheGpuISA::MachInst mach_inst
546 = reinterpret_cast<TheGpuISA::MachInst
>(readPtr
);
547 GPUStaticInst
*gpu_static_inst
= _decoder
->decode(mach_inst
);
548 readPtr
+= gpu_static_inst
->instSize();
550 assert(readPtr
<= bufEnd
);
552 GPUDynInstPtr gpu_dyn_inst
553 = std::make_shared
<GPUDynInst
>(wavefront
->computeUnit
,
554 wavefront
, gpu_static_inst
,
555 wavefront
->computeUnit
->
557 wavefront
->instructionBuffer
.push_back(gpu_dyn_inst
);
559 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
560 "%d bytes remain.\n", wavefront
->simdId
,
561 wavefront
->wfSlotId
, wavefront
->wfDynId
,
562 gpu_static_inst
->disassemble(),
563 gpu_static_inst
->instSize(),
564 fetchBytesRemaining());
570 FetchUnit::FetchBufDesc::decodeSplitInst()
572 TheGpuISA::RawMachInst split_inst
= 0;
573 int dword_size
= sizeof(uint32_t);
574 int num_dwords
= sizeof(TheGpuISA::RawMachInst
) / dword_size
;
576 for (int i
= 0; i
< num_dwords
; ++i
) {
577 ((uint32_t*)(&split_inst
))[i
] = *reinterpret_cast<uint32_t*>(readPtr
);
578 if (readPtr
+ dword_size
>= bufEnd
) {
583 assert(readPtr
== bufStart
);
585 TheGpuISA::MachInst mach_inst
586 = reinterpret_cast<TheGpuISA::MachInst
>(&split_inst
);
587 GPUStaticInst
*gpu_static_inst
= _decoder
->decode(mach_inst
);
588 readPtr
+= (gpu_static_inst
->instSize() - dword_size
);
589 assert(readPtr
< bufEnd
);
591 GPUDynInstPtr gpu_dyn_inst
592 = std::make_shared
<GPUDynInst
>(wavefront
->computeUnit
,
593 wavefront
, gpu_static_inst
,
594 wavefront
->computeUnit
->
596 wavefront
->instructionBuffer
.push_back(gpu_dyn_inst
);
598 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
599 "(%d bytes). %d bytes remain in %d buffered lines.\n",
600 wavefront
->simdId
, wavefront
->wfSlotId
, wavefront
->wfDynId
,
601 gpu_static_inst
->disassemble(), split_inst
,
602 gpu_static_inst
->instSize(), fetchBytesRemaining(),
607 FetchUnit::FetchBufDesc::splitDecode() const
610 * if a read of a raw instruction would go beyond the end
611 * of the fetch buffer, then we must perform a split decode.
613 bool is_split
= (readPtr
+ sizeof(TheGpuISA::RawMachInst
)) > bufEnd
;
619 FetchUnit::FetchBufDesc::fetchBytesRemaining() const
621 int bytes_remaining
= 0;
623 if (bufferedLines() && readPtr
!= bufEnd
) {
624 auto last_buf_pc
= bufferedPCs
.rbegin();
625 uint8_t *end_ptr
= last_buf_pc
->second
+ cacheLineSize
;
626 int byte_diff
= end_ptr
- readPtr
;
628 if (end_ptr
> readPtr
) {
629 bytes_remaining
= byte_diff
;
630 } else if (end_ptr
< readPtr
) {
631 bytes_remaining
= bufferedBytes() + byte_diff
;
635 assert(bytes_remaining
<= bufferedBytes());
636 return bytes_remaining
;