src/gpu-compute/global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define __STDC_FORMAT_MACROS
#include <cinttypes>

#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
                                     ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p->global_mem_queue_size),
      maxWaveRequests(p->max_wave_requests), inflightStores(0),
      inflightLoads(0)
{
}

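// Late initialization: cache the shader's global memory size, which is
// presumably not yet available when this pipeline is constructed.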
void
GlobalMemPipeline::init()
{
    globalMemSize = computeUnit.shader->globalMemSize;
}

bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}

void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}

bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}

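// Advance the pipeline by one step: first retire the oldest completed
// response (writing any load/atomic return data back to the VRF), then
// issue the next queued request to the memory system, subject to the
// inflight load/store queue limits.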
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);

    }

    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit.shader->coissue_return ||
        computeUnit.vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);
        w->decVMemInstsIssued();

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleStore(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                            m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleLoad(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                            m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for round-trip time for vector memory insts
        // going all the way to memory and stats for individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark write bus busy for appropriate amount of time
        computeUnit.glbMemToVrfBus.set(m->time);
        if (!computeUnit.shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // If the pipeline has a global memory instruction waiting to issue,
    // check queue occupancy and then issue its memory packets to the
    // DTLB / memory system.
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
            /**
             * if we are not in out-of-order data delivery mode
             * then we keep the responses sorted in program order.
             * in order to do so we must reserve an entry in the
             * resp buffer before we issue the request to the mem
             * system. mem fence requests will not be stored here
             * because once they are issued from the GM pipeline,
             * they do not send any response back to it.
             */
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            /**
             * Memory access instructions that do not generate any memory
             * requests (such as out-of-bounds buffer accesses where all lanes
             * are out of bounds) will not trigger a callback to complete the
             * request, so we need to mark it as completed as soon as it is
             * issued. Note that this will still insert an entry in the
             * ordered return FIFO such that waitcnt is still resolved
             * correctly.
             */
            handleResponse(mp);
            computeUnit.getTokenManager()->recvTokens(1);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
    }
}

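// Return the oldest outstanding memory instruction, but only if its
// response has already arrived (its entry in gmOrderedRespBuffer is
// marked done); otherwise return nullptr. Because the buffer is ordered
// by sequence number, responses are consumed in program order.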
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}

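// Retire a completed instruction: free its inflight load/store slot and
// erase its entry from the head of the ordered response buffer.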
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}

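// Timestamp the instruction and enqueue it; exec() later pops it from
// gmIssuedRequests and initiates the actual memory access.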
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}

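// Mark the instruction's entry in the ordered response buffer as done so
// that exec() can retire it once it reaches the head of the buffer.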
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

void
GlobalMemPipeline::regStats()
{
    loadVrfBankConflictCycles
        .name(name() + ".load_vrf_bank_conflict_cycles")
        .desc("total number of cycles GM data are delayed before updating "
              "the VRF")
        ;
}