src/gpu-compute/fetch_unit.cc

   1 /*
   2  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its contributors
  18  * may be used to endorse or promote products derived from this software
  19  * without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Author: Brad Beckmann, Sooraj Puthoor
  34  */
  35
  36 #include "gpu-compute/fetch_unit.hh"
  37
  38 #include "debug/GPUFetch.hh"
  39 #include "debug/GPUPort.hh"
  40 #include "debug/GPUTLB.hh"
  41 #include "gpu-compute/compute_unit.hh"
  42 #include "gpu-compute/gpu_dyn_inst.hh"
  43 #include "gpu-compute/gpu_static_inst.hh"
  44 #include "gpu-compute/shader.hh"
  45 #include "gpu-compute/wavefront.hh"
  46 #include "mem/ruby/system/RubySystem.hh"
  47
  48 uint32_t FetchUnit::globalFetchUnitID;
  49
  50 FetchUnit::FetchUnit(const ComputeUnitParams* params) :
  51     timingSim(true),
  52     computeUnit(nullptr),
  53     fetchScheduler(params),
  54     waveList(nullptr)
  55 {
  56 }
  57
  58 FetchUnit::~FetchUnit()
  59 {
  60     fetchQueue.clear();
  61     fetchStatusQueue.clear();
  62 }
  63
  64 void
  65 FetchUnit::init(ComputeUnit *cu)
  66 {
  67     computeUnit = cu;
  68     timingSim = computeUnit->shader->timingSim;
  69     fetchQueue.clear();
  70     fetchStatusQueue.resize(computeUnit->shader->n_wf);
  71
  72     for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
  73         fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
  74     }
  75
  76     fetchScheduler.bindList(&fetchQueue);
  77 }
  78
  79 void
  80 FetchUnit::exec()
  81 {
  82     // re-evaluate waves which are marked as not ready for fetch
  83     for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
  84         // Following code assumes 64-bit opertaion and all insts are
  85         // represented by 64-bit pointers to inst objects.
  86         Wavefront *curWave = fetchStatusQueue[j].first;
  87         assert (curWave);
  88
  89         // The wavefront has to be active, the IB occupancy has to be
  90         // 4 or less instructions and it can not have any branches to
  91         // prevent speculative instruction fetches
  92         if (!fetchStatusQueue[j].second) {
  93             if (curWave->status == Wavefront::S_RUNNING &&
  94                 curWave->instructionBuffer.size() <= 4 &&
  95                 !curWave->instructionBufferHasBranch() &&
  96                 !curWave->pendingFetch) {
  97                 fetchQueue.push_back(curWave);
  98                 fetchStatusQueue[j].second = true;
  99             }
 100         }
 101     }
 102
 103     // Fetch only if there is some wave ready to be fetched
 104     // An empty fetchQueue will cause the schedular to panic
 105     if (fetchQueue.size()) {
 106         Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
 107         waveToBeFetched->pendingFetch = true;
 108         fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
 109         initiateFetch(waveToBeFetched);
 110     }
 111 }
 112
 113 void
 114 FetchUnit::initiateFetch(Wavefront *wavefront)
 115 {
 116     // calculate the virtual address to fetch from the SQC
 117     Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
 118     vaddr = wavefront->base_ptr +  vaddr * sizeof(GPUStaticInst*);
 119
 120     DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
 121             computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
 122
 123     // Since this is an instruction prefetch, if you're split then just finish
 124     // out the current line.
 125     unsigned block_size = RubySystem::getBlockSizeBytes();
 126     // check for split accesses
 127     Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
 128     unsigned size = block_size;
 129
 130     if (split_addr > vaddr) {
 131         // misaligned access, just grab the rest of the line
 132         size = split_addr - vaddr;
 133     }
 134
 135     // set up virtual request
 136     Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
 137                                computeUnit->masterId(), 0, 0, 0);
 138
 139     PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
 140     // This fetchBlock is kind of faux right now - because the translations so
 141     // far don't actually return Data
 142     uint64_t fetchBlock;
 143     pkt->dataStatic(&fetchBlock);
 144
 145     if (timingSim) {
 146         // SenderState needed on Return
 147         pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);
 148
 149         // Sender State needed by TLB hierarchy
 150         pkt->senderState =
 151             new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
 152                                                  computeUnit->shader->gpuTc,
 153                                                  false, pkt->senderState);
 154
 155         if (computeUnit->sqcTLBPort->isStalled()) {
 156             assert(computeUnit->sqcTLBPort->retries.size() > 0);
 157
 158             DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
 159                     vaddr);
 160
 161             computeUnit->sqcTLBPort->retries.push_back(pkt);
 162         } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
 163             // Stall the data port;
 164             // No more packet is issued till
 165             // ruby indicates resources are freed by
 166             // a recvReqRetry() call back on this port.
 167             computeUnit->sqcTLBPort->stallPort();
 168
 169             DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
 170                     vaddr);
 171
 172             computeUnit->sqcTLBPort->retries.push_back(pkt);
 173         } else {
 174             DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
 175         }
 176     } else {
 177         pkt->senderState =
 178             new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
 179                                                  computeUnit->shader->gpuTc);
 180
 181         computeUnit->sqcTLBPort->sendFunctional(pkt);
 182
 183         TheISA::GpuTLB::TranslationState *sender_state =
 184              safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
 185
 186         delete sender_state->tlbEntry;
 187         delete sender_state;
 188         // fetch the instructions from the SQC when we operate in
 189         // functional mode only
 190         fetch(pkt, wavefront);
 191     }
 192 }
 193
 194 void
 195 FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
 196 {
 197     assert(pkt->req->hasPaddr());
 198     assert(pkt->req->hasSize());
 199
 200     DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
 201             computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
 202             pkt->req->getPaddr());
 203
 204     // this is necessary because the GPU TLB receives packets instead of
 205     // requests. when the translation is complete, all relevent fields in the
 206     // request will be populated, but not in the packet. here we create the
 207     // new packet so we can set the size, addr, and proper flags.
 208     PacketPtr oldPkt = pkt;
 209     pkt = new Packet(oldPkt->req, oldPkt->cmd);
 210     delete oldPkt;
 211
 212     TheGpuISA::RawMachInst *data =
 213         new TheGpuISA::RawMachInst[pkt->req->getSize() /
 214         sizeof(TheGpuISA::RawMachInst)];
 215
 216     pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
 217
 218     // New SenderState for the memory access
 219     pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
 220
 221     if (timingSim) {
 222         // translation is done. Send the appropriate timing memory request.
 223
 224         if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
 225             computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
 226                                                                    wavefront));
 227
 228             DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
 229                     computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
 230                     pkt->req->getPaddr());
 231         } else {
 232             DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
 233                     computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
 234                     pkt->req->getPaddr());
 235         }
 236     } else {
 237         computeUnit->sqcPort->sendFunctional(pkt);
 238         processFetchReturn(pkt);
 239     }
 240 }
 241
 242 void
 243 FetchUnit::processFetchReturn(PacketPtr pkt)
 244 {
 245     ComputeUnit::SQCPort::SenderState *sender_state =
 246         safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
 247
 248     Wavefront *wavefront = sender_state->wavefront;
 249
 250     DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
 251             "%d bytes, %d instructions!\n", computeUnit->cu_id,
 252             wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
 253             pkt->req->getSize(), pkt->req->getSize() /
 254             sizeof(TheGpuISA::RawMachInst));
 255
 256     if (wavefront->dropFetch) {
 257         assert(wavefront->instructionBuffer.empty());
 258         wavefront->dropFetch = false;
 259     } else {
 260         TheGpuISA::RawMachInst *inst_index_ptr =
 261             (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
 262
 263         assert(wavefront->instructionBuffer.size() <= 4);
 264
 265         for (int i = 0; i < pkt->req->getSize() /
 266              sizeof(TheGpuISA::RawMachInst); ++i) {
 267             GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
 268
 269             assert(inst_ptr);
 270             DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
 271                     computeUnit->cu_id, wavefront->simdId,
 272                     wavefront->wfSlotId, inst_ptr->disassemble());
 273
 274             GPUDynInstPtr gpuDynInst =
 275                 std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
 276                                              computeUnit->getAndIncSeqNum());
 277
 278             wavefront->instructionBuffer.push_back(gpuDynInst);
 279         }
 280     }
 281
 282     wavefront->pendingFetch = false;
 283
 284     delete pkt->senderState;
 285     delete pkt->req;
 286     delete pkt;
 287 }
 288
 289 void
 290 FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
 291 {
 292     waveList = wave_list;
 293 }