src/gpu-compute/global_memory_pipeline.cc

   1 /*
   2  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Authors: John Kalamatianos,
  34  *          Sooraj Puthoor
  35  */
  36
  37 #include "gpu-compute/global_memory_pipeline.hh"
  38
  39 #include "debug/GPUMem.hh"
  40 #include "debug/GPUReg.hh"
  41 #include "gpu-compute/compute_unit.hh"
  42 #include "gpu-compute/gpu_dyn_inst.hh"
  43 #include "gpu-compute/shader.hh"
  44 #include "gpu-compute/vector_register_file.hh"
  45 #include "gpu-compute/wavefront.hh"
  46
  47 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
  48     computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
  49     outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
  50     inflightLoads(0)
  51 {
  52 }
  53
  54 void
  55 GlobalMemPipeline::init(ComputeUnit *cu)
  56 {
  57     computeUnit = cu;
  58     globalMemSize = computeUnit->shader->globalMemSize;
  59     _name = computeUnit->name() + ".GlobalMemPipeline";
  60 }
  61
  62 void
  63 GlobalMemPipeline::exec()
  64 {
  65     // apply any returned global memory operations
  66     GPUDynInstPtr m = getNextReadyResp();
  67
  68     bool accessVrf = true;
  69     Wavefront *w = nullptr;
  70
  71     // check the VRF to see if the operands of a load (or load component
  72     // of an atomic) are accessible
  73     if ((m) && (m->isLoad() || m->isAtomicRet())) {
  74         w = m->wavefront();
  75
  76         accessVrf =
  77             w->computeUnit->vrf[w->simdId]->
  78                 vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
  79     }
  80
  81     if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
  82         accessVrf && m->statusBitVector == VectorMask(0) &&
  83         (computeUnit->shader->coissue_return ||
  84         computeUnit->wfWait.at(m->pipeId).rdy())) {
  85
  86         w = m->wavefront();
  87
  88         m->completeAcc(m);
  89
  90         completeRequest(m);
  91
  92         // Decrement outstanding register count
  93         computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
  94
  95         if (m->isStore() || m->isAtomic()) {
  96             computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
  97                                              m->time, -1);
  98         }
  99
 100         if (m->isLoad() || m->isAtomic()) {
 101             computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
 102                                              m->time, -1);
 103         }
 104
 105         // Mark write bus busy for appropriate amount of time
 106         computeUnit->glbMemToVrfBus.set(m->time);
 107         if (!computeUnit->shader->coissue_return)
 108             w->computeUnit->wfWait.at(m->pipeId).set(m->time);
 109     }
 110
 111     // If pipeline has executed a global memory instruction
 112     // execute global memory packets and issue global
 113     // memory packets to DTLB
 114     if (!gmIssuedRequests.empty()) {
 115         GPUDynInstPtr mp = gmIssuedRequests.front();
 116         if (mp->isLoad() || mp->isAtomic()) {
 117             if (inflightLoads >= gmQueueSize) {
 118                 return;
 119             } else {
 120                 ++inflightLoads;
 121             }
 122         } else if (mp->isStore()) {
 123             if (inflightStores >= gmQueueSize) {
 124                 return;
 125             } else {
 126                 ++inflightStores;
 127             }
 128         }
 129
 130         mp->initiateAcc(mp);
 131
 132         if (!outOfOrderDataDelivery && !mp->isMemFence()) {
 133             /**
 134              * if we are not in out-of-order data delivery mode
 135              * then we keep the responses sorted in program order.
 136              * in order to do so we must reserve an entry in the
 137              * resp buffer before we issue the request to the mem
 138              * system. mem fence requests will not be stored here
 139              * because once they are issued from the GM pipeline,
 140              * they do not send any response back to it.
 141              */
 142             gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
 143                 std::make_pair(mp, false)));
 144         }
 145
 146         gmIssuedRequests.pop();
 147
 148         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
 149                 computeUnit->cu_id, mp->simdId, mp->wfSlotId);
 150     }
 151 }
 152
 153 GPUDynInstPtr
 154 GlobalMemPipeline::getNextReadyResp()
 155 {
 156     if (outOfOrderDataDelivery) {
 157         if (!gmReturnedLoads.empty()) {
 158             return gmReturnedLoads.front();
 159         } else if (!gmReturnedStores.empty()) {
 160             return gmReturnedStores.front();
 161         }
 162     } else {
 163         if (!gmOrderedRespBuffer.empty()) {
 164             auto mem_req = gmOrderedRespBuffer.begin();
 165
 166             if (mem_req->second.second) {
 167                 return mem_req->second.first;
 168             }
 169         }
 170     }
 171
 172     return nullptr;
 173 }
 174
 175 void
 176 GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
 177 {
 178     if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 179         assert(inflightLoads > 0);
 180         --inflightLoads;
 181     } else if (gpuDynInst->isStore()) {
 182         assert(inflightStores > 0);
 183         --inflightStores;
 184     }
 185
 186     if (outOfOrderDataDelivery) {
 187         if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 188             assert(!gmReturnedLoads.empty());
 189             gmReturnedLoads.pop();
 190         } else if (gpuDynInst->isStore()) {
 191             assert(!gmReturnedStores.empty());
 192             gmReturnedStores.pop();
 193         }
 194     } else {
 195         // we should only pop the oldest requst, and it
 196         // should be marked as done if we are here
 197         assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
 198         assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
 199         assert(gmOrderedRespBuffer.begin()->second.second);
 200         // remove this instruction from the buffer by its
 201         // unique seq ID
 202         gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
 203     }
 204 }
 205
 206 void
 207 GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
 208 {
 209     gmIssuedRequests.push(gpuDynInst);
 210 }
 211
 212 void
 213 GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
 214 {
 215     if (outOfOrderDataDelivery) {
 216         if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 217             assert(isGMLdRespFIFOWrRdy());
 218             gmReturnedLoads.push(gpuDynInst);
 219         } else {
 220             assert(isGMStRespFIFOWrRdy());
 221             gmReturnedStores.push(gpuDynInst);
 222         }
 223     } else {
 224         auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
 225         // if we are getting a response for this mem request,
 226         // then it ought to already be in the ordered response
 227         // buffer
 228         assert(mem_req != gmOrderedRespBuffer.end());
 229         mem_req->second.second = true;
 230     }
 231 }
 232
 233 void
 234 GlobalMemPipeline::regStats()
 235 {
 236     loadVrfBankConflictCycles
 237         .name(name() + ".load_vrf_bank_conflict_cycles")
 238         .desc("total number of cycles GM data are delayed before updating "
 239               "the VRF")
 240         ;
 241 }