help = 'fast forward using kvm until the m5_switchcpu'
' pseudo-op is encountered, then switch cpus. subsequent'
' m5_switchcpu pseudo-ops will toggle back and forth')
-
+parser.add_option('--outOfOrderDataDelivery', action='store_true',
+ default=False, help='enable OoO data delivery in the GM'
+ ' pipeline')
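+# e.g. (hypothetical invocation; binary/script names are illustrative):
+#   <gem5 binary> <this script> --outOfOrderDataDelivery [other options]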
Ruby.define_options(parser)
localDataStore = \
LdsState(banks = options.numLdsBanks,
bankConflictPenalty = \
- options.ldsBankConflictPenalty)))
+ options.ldsBankConflictPenalty),
+ out_of_order_data_delivery =
+ options.outOfOrderDataDelivery))
wavefronts = []
vrfs = []
for j in xrange(options.simds_per_cu):
gpuDynInst->useContinuation = false;
GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
- gmp->getGMReqFIFO().push(gpuDynInst);
+ gmp->issueRequest(gpuDynInst);
w->wrGmReqsInPipe--;
w->rdGmReqsInPipe--;
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
}
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
break;
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
break;
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
break;
m->latency.set(w->computeUnit->shader->ticks(64));
m->pipeId = GLBMEM_PIPE;
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
w->outstandingReqsRdGm++;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(64));
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
w->outstandingReqsRdGm++;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(64));
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
w->outstandingReqsRdGm++;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
w->outstandingReqs++;
vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
"file")
+ out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
+ " in the GM pipeline")
class Shader(ClockedObject):
type = 'Shader'
if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
gpuDynInst->statusVector.clear();
- if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
- assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
-
- compute_unit->globalMemoryPipe.getGMLdRespFIFO()
- .push(gpuDynInst);
- } else {
- assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
-
- compute_unit->globalMemoryPipe.getGMStRespFIFO()
- .push(gpuDynInst);
- }
+ compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
compute_unit->cu_id, gpuDynInst->simdId,
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
- inflightStores(0), inflightLoads(0)
+ outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
+ inflightLoads(0)
{
}
GlobalMemPipeline::exec()
{
// apply any returned global memory operations
- GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
- !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+ GPUDynInstPtr m = getNextReadyResp();
bool accessVrf = true;
Wavefront *w = nullptr;
accessVrf =
w->computeUnit->vrf[w->simdId]->
- vrfOperandAccessReady(m->seqNum(), w, m,
- VrfAccessType::WRITE);
+ vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
}
- if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
- m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+ if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
accessVrf && m->statusBitVector == VectorMask(0) &&
(computeUnit->shader->coissue_return ||
- computeUnit->wfWait.at(m->pipeId).rdy())) {
+ computeUnit->wfWait.at(m->pipeId).rdy())) {
w = m->wavefront();
m->completeAcc(m);
- if (m->isLoad() || m->isAtomic()) {
- gmReturnedLoads.pop();
- assert(inflightLoads > 0);
- --inflightLoads;
- } else {
- assert(m->isStore());
- gmReturnedStores.pop();
- assert(inflightStores > 0);
- --inflightStores;
- }
+ completeRequest(m);
// Decrement outstanding register count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
} else {
++inflightLoads;
}
- } else {
+ } else if (mp->isStore()) {
if (inflightStores >= gmQueueSize) {
return;
- } else if (mp->isStore()) {
+ } else {
++inflightStores;
}
}
mp->initiateAcc(mp);
+
+ if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+ /**
+ * if we are not in out-of-order data delivery mode
+ * then we keep the responses sorted in program order.
+ * in order to do so we must reserve an entry in the
+ * resp buffer before we issue the request to the mem
+ * system. mem fence requests will not be stored here
+ * because once they are issued from the GM pipeline,
+ * they do not send any response back to it.
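+ * in OoO mode, by contrast, responses are pushed to the
+ * load/store return FIFOs as they arrive (see
+ * handleResponse()).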
+ */
+ gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
+ std::make_pair(mp, false)));
+ }
+
gmIssuedRequests.pop();
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
}
}
+GPUDynInstPtr
+GlobalMemPipeline::getNextReadyResp()
+{
+ if (outOfOrderDataDelivery) {
+ if (!gmReturnedLoads.empty()) {
+ return gmReturnedLoads.front();
+ } else if (!gmReturnedStores.empty()) {
+ return gmReturnedStores.front();
+ }
+ } else {
+ if (!gmOrderedRespBuffer.empty()) {
+ auto mem_req = gmOrderedRespBuffer.begin();
+
+ if (mem_req->second.second) {
+ return mem_req->second.first;
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+void
+GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
+{
+ if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+ assert(inflightLoads > 0);
+ --inflightLoads;
+ } else if (gpuDynInst->isStore()) {
+ assert(inflightStores > 0);
+ --inflightStores;
+ }
+
+ if (outOfOrderDataDelivery) {
+ if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+ assert(!gmReturnedLoads.empty());
+ gmReturnedLoads.pop();
+ } else if (gpuDynInst->isStore()) {
+ assert(!gmReturnedStores.empty());
+ gmReturnedStores.pop();
+ }
+ } else {
+ // we should only pop the oldest request, and it
+ // should be marked as done if we are here
+ assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
+ assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
+ assert(gmOrderedRespBuffer.begin()->second.second);
+ // remove this instruction from the buffer by its
+ // unique seq ID
+ gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
+ }
+}
+
+void
+GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
+{
+ gmIssuedRequests.push(gpuDynInst);
+}
+
+void
+GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
+{
+ if (outOfOrderDataDelivery) {
+ if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+ assert(isGMLdRespFIFOWrRdy());
+ gmReturnedLoads.push(gpuDynInst);
+ } else {
+ assert(isGMStRespFIFOWrRdy());
+ gmReturnedStores.push(gpuDynInst);
+ }
+ } else {
+ auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
+ // if we are getting a response for this mem request,
+ // then it ought to already be in the ordered response
+ // buffer
+ assert(mem_req != gmOrderedRespBuffer.end());
+ mem_req->second.second = true;
+ }
+}
+
void
GlobalMemPipeline::regStats()
{
void init(ComputeUnit *cu);
void exec();
- std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
+ /**
+ * find the next ready response to service. in OoO mode we
+ * return the oldest response (based on when it was received)
+ * in the response FIFOs. in in-order mode we return the
+ * oldest (in program order) response, and only if it is
+ * marked as done.
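+ * e.g., if the oldest in-order entry has not yet been marked
+ * done, nullptr is returned even when younger responses have
+ * already completed.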
+ */
+ GPUDynInstPtr getNextReadyResp();
+
+ /**
+ * once a memory request is finished, we remove it from the
+ * response buffer. this method determines which response
+ * buffer is in use based on the mode (in-order vs. OoO).
+ */
+ void completeRequest(GPUDynInstPtr gpuDynInst);
+
+ /**
+ * issues a request to the pipeline, i.e., enqueues it
+ * in the request buffer.
+ */
+ void issueRequest(GPUDynInstPtr gpuDynInst);
+
+ /**
+ * this method handles responses sent to this GM pipeline by the
+ * CU. in the case of in-order delivery it simply marks the request
+ * as done in the ordered buffer to indicate that the request is
+ * finished. for out-of-order data delivery, the requests are enqueued
+ * (in the order in which they are received) in the response FIFOs.
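+ * the done flag set here is what getNextReadyResp() checks
+ * before delivering an in-order response.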
+ */
+ void handleResponse(GPUDynInstPtr gpuDynInst);
+
bool
isGMLdRespFIFOWrRdy() const
{
ComputeUnit *computeUnit;
std::string _name;
int gmQueueSize;
+ bool outOfOrderDataDelivery;
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
// The size of global memory.
int globalMemSize;
+ /*
+ * this buffer holds the memory responses when in-order data
+ * delivery is used. the responses are ordered by their unique
+ * sequence number, which is monotonically increasing. when a
+ * memory request returns, its "done" flag is set to true. during
+ * each tick the GM pipeline will check if the oldest request
+ * is finished, and if so it will be removed from the queue.
+ *
+ * key: memory instruction's sequence ID
+ *
+ * value: pair holding the instruction pointer and a bool that
+ * is used to indicate whether or not the request has
+ * completed
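+ *
+ * example (hypothetical seq IDs): requests 10, 11, and 12 are
+ * issued and reserved in that order. if 12 returns first, only
+ * its done flag is set; 10 must return and be marked done before
+ * getNextReadyResp() will deliver it and completeRequest() will
+ * erase it, so responses complete in program order.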
+ */
+ std::map<uint64_t, std::pair<GPUDynInstPtr, bool>> gmOrderedRespBuffer;
+
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;