src/gpu-compute/global_memory_pipeline.cc

   1 /*
   2  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its contributors
  18  * may be used to endorse or promote products derived from this software
  19  * without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Author: John Kalamatianos, Sooraj Puthoor
  34  */
  35
  36 #include "gpu-compute/global_memory_pipeline.hh"
  37
  38 #include "debug/GPUMem.hh"
  39 #include "debug/GPUReg.hh"
  40 #include "gpu-compute/compute_unit.hh"
  41 #include "gpu-compute/gpu_dyn_inst.hh"
  42 #include "gpu-compute/shader.hh"
  43 #include "gpu-compute/vector_register_file.hh"
  44 #include "gpu-compute/wavefront.hh"
  45
  46 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
  47     computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
  48     outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
  49     inflightLoads(0)
  50 {
  51 }
  52
  53 void
  54 GlobalMemPipeline::init(ComputeUnit *cu)
  55 {
  56     computeUnit = cu;
  57     globalMemSize = computeUnit->shader->globalMemSize;
  58     _name = computeUnit->name() + ".GlobalMemPipeline";
  59 }
  60
  61 void
  62 GlobalMemPipeline::exec()
  63 {
  64     // apply any returned global memory operations
  65     GPUDynInstPtr m = getNextReadyResp();
  66
  67     bool accessVrf = true;
  68     Wavefront *w = nullptr;
  69
  70     // check the VRF to see if the operands of a load (or load component
  71     // of an atomic) are accessible
  72     if ((m) && (m->isLoad() || m->isAtomicRet())) {
  73         w = m->wavefront();
  74
  75         accessVrf =
  76             w->computeUnit->vrf[w->simdId]->
  77                 vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
  78     }
  79
  80     if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
  81         accessVrf && m->statusBitVector == VectorMask(0) &&
  82         (computeUnit->shader->coissue_return ||
  83         computeUnit->wfWait.at(m->pipeId).rdy())) {
  84
  85         w = m->wavefront();
  86
  87         m->completeAcc(m);
  88
  89         completeRequest(m);
  90
  91         // Decrement outstanding register count
  92         computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
  93
  94         if (m->isStore() || m->isAtomic()) {
  95             computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
  96                                              m->time, -1);
  97         }
  98
  99         if (m->isLoad() || m->isAtomic()) {
 100             computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
 101                                              m->time, -1);
 102         }
 103
 104         // Mark write bus busy for appropriate amount of time
 105         computeUnit->glbMemToVrfBus.set(m->time);
 106         if (!computeUnit->shader->coissue_return)
 107             w->computeUnit->wfWait.at(m->pipeId).set(m->time);
 108     }
 109
 110     // If pipeline has executed a global memory instruction
 111     // execute global memory packets and issue global
 112     // memory packets to DTLB
 113     if (!gmIssuedRequests.empty()) {
 114         GPUDynInstPtr mp = gmIssuedRequests.front();
 115         if (mp->isLoad() || mp->isAtomic()) {
 116             if (inflightLoads >= gmQueueSize) {
 117                 return;
 118             } else {
 119                 ++inflightLoads;
 120             }
 121         } else if (mp->isStore()) {
 122             if (inflightStores >= gmQueueSize) {
 123                 return;
 124             } else {
 125                 ++inflightStores;
 126             }
 127         }
 128
 129         mp->initiateAcc(mp);
 130
 131         if (!outOfOrderDataDelivery && !mp->isMemFence()) {
 132             /**
 133              * if we are not in out-of-order data delivery mode
 134              * then we keep the responses sorted in program order.
 135              * in order to do so we must reserve an entry in the
 136              * resp buffer before we issue the request to the mem
 137              * system. mem fence requests will not be stored here
 138              * because once they are issued from the GM pipeline,
 139              * they do not send any response back to it.
 140              */
 141             gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
 142                 std::make_pair(mp, false)));
 143         }
 144
 145         gmIssuedRequests.pop();
 146
 147         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
 148                 computeUnit->cu_id, mp->simdId, mp->wfSlotId);
 149     }
 150 }
 151
 152 GPUDynInstPtr
 153 GlobalMemPipeline::getNextReadyResp()
 154 {
 155     if (outOfOrderDataDelivery) {
 156         if (!gmReturnedLoads.empty()) {
 157             return gmReturnedLoads.front();
 158         } else if (!gmReturnedStores.empty()) {
 159             return gmReturnedStores.front();
 160         }
 161     } else {
 162         if (!gmOrderedRespBuffer.empty()) {
 163             auto mem_req = gmOrderedRespBuffer.begin();
 164
 165             if (mem_req->second.second) {
 166                 return mem_req->second.first;
 167             }
 168         }
 169     }
 170
 171     return nullptr;
 172 }
 173
 174 void
 175 GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
 176 {
 177     if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 178         assert(inflightLoads > 0);
 179         --inflightLoads;
 180     } else if (gpuDynInst->isStore()) {
 181         assert(inflightStores > 0);
 182         --inflightStores;
 183     }
 184
 185     if (outOfOrderDataDelivery) {
 186         if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 187             assert(!gmReturnedLoads.empty());
 188             gmReturnedLoads.pop();
 189         } else if (gpuDynInst->isStore()) {
 190             assert(!gmReturnedStores.empty());
 191             gmReturnedStores.pop();
 192         }
 193     } else {
 194         // we should only pop the oldest requst, and it
 195         // should be marked as done if we are here
 196         assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
 197         assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
 198         assert(gmOrderedRespBuffer.begin()->second.second);
 199         // remove this instruction from the buffer by its
 200         // unique seq ID
 201         gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
 202     }
 203 }
 204
 205 void
 206 GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
 207 {
 208     gmIssuedRequests.push(gpuDynInst);
 209 }
 210
 211 void
 212 GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
 213 {
 214     if (outOfOrderDataDelivery) {
 215         if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 216             assert(isGMLdRespFIFOWrRdy());
 217             gmReturnedLoads.push(gpuDynInst);
 218         } else {
 219             assert(isGMStRespFIFOWrRdy());
 220             gmReturnedStores.push(gpuDynInst);
 221         }
 222     } else {
 223         auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
 224         // if we are getting a response for this mem request,
 225         // then it ought to already be in the ordered response
 226         // buffer
 227         assert(mem_req != gmOrderedRespBuffer.end());
 228         mem_req->second.second = true;
 229     }
 230 }
 231
 232 void
 233 GlobalMemPipeline::regStats()
 234 {
 235     loadVrfBankConflictCycles
 236         .name(name() + ".load_vrf_bank_conflict_cycles")
 237         .desc("total number of cycles GM data are delayed before updating "
 238               "the VRF")
 239         ;
 240 }