arch-gcn3, gpu-compute: Implement out-of-range accesses
[gem5.git] / src / gpu-compute / global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define __STDC_FORMAT_MACROS
#include <cinttypes>

#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

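// The pipeline's limits come from the compute unit's parameters:
// gmQueueSize bounds the number of in-flight loads and stores, and
// maxWaveRequests caps the vector memory requests a single wavefront
// may have outstanding at once.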
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
    maxWaveRequests(p->max_wave_requests), inflightStores(0),
    inflightLoads(0)
{
}

void
GlobalMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    globalMemSize = computeUnit->shader->globalMemSize;
    _name = computeUnit->name() + ".GlobalMemPipeline";
}

bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}

void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}

bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}

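// exec() advances the pipeline by one step: it first tries to complete at
// most one returned memory operation (writing any load/atomic return data
// back to the VRF), then tries to issue at most one queued instruction from
// gmIssuedRequests to the memory system.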
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);
    }

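    // A response completes only when its modeled latency has elapsed, the
    // bus from global memory to the VRF is free, the VRF can accept the
    // register writes, and (unless returns may co-issue) the vector global
    // memory unit is available.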
    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit->shader->coissue_return ||
        computeUnit->vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit->shader->sampleStore(accessTime);
            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                             m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit->shader->sampleLoad(accessTime);
            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                             m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for the round-trip time of vector memory insts
        // going all the way to memory, and stats for the individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark the write bus busy for the appropriate amount of time
        computeUnit->glbMemToVrfBus.set(m->time);
        if (!computeUnit->shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // Issue the next queued global memory instruction: execute the
    // instruction's access and issue its memory packets to the DTLB.
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
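        // Throttle issue if the in-flight queue for this access type is
        // already full; otherwise claim a queue entry before issuing.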
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if ((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync()) {
            /**
             * If we are not in out-of-order data delivery mode
             * then we keep the responses sorted in program order.
             * In order to do so we must reserve an entry in the
             * resp buffer before we issue the request to the mem
             * system. Mem fence requests will not be stored here
             * because once they are issued from the GM pipeline,
             * they do not send any response back to it.
             */
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            /**
             * Memory access instructions that do not generate any memory
             * requests (such as out-of-bounds buffer accesses where all
             * lanes are out of bounds) will not trigger a callback to
             * complete the request, so we need to mark it as completed as
             * soon as it is issued. Note that this will still insert an
             * entry in the ordered return FIFO so that waitcnt is still
             * resolved correctly.
             */
            handleResponse(mp);
            computeUnit->getTokenManager()->recvTokens(1);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping mem_op\n",
                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
    }
}

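// Return the oldest in-flight memory instruction, but only if its response
// has already arrived (i.e., it has been marked done by handleResponse());
// otherwise return nullptr so that completion stays in program order.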
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}

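// Retire a completed instruction: release its slot in the in-flight
// load/store accounting and remove it from the ordered response buffer.
// Only the oldest, already-done entry may be retired.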
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}

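// Enqueue a new instruction for issue to the memory system. The access
// time is stamped here so the round-trip latency can be sampled when the
// instruction completes.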
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}

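// Mark the ordered-buffer entry for this instruction as done so that
// getNextReadyResp() can hand it back for completion. exec() also calls
// this directly for accesses that generate no memory requests.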
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

void
GlobalMemPipeline::regStats()
{
    loadVrfBankConflictCycles
        .name(name() + ".load_vrf_bank_conflict_cycles")
        .desc("total number of cycles GM data are delayed before updating "
              "the VRF")
        ;
}