* POSSIBILITY OF SUCH DAMAGE.
*/
-#include "gpu-compute/global_memory_pipeline.hh"
-
+#define __STDC_FORMAT_MACROS
+#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
- computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
- outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
- inflightLoads(0)
+GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
+ ComputeUnit &cu)
+ : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
+ gmQueueSize(p.global_mem_queue_size),
+ maxWaveRequests(p.max_wave_requests), inflightStores(0),
+ inflightLoads(0), stats(&cu)
{
}
void
-GlobalMemPipeline::init(ComputeUnit *cu)
+GlobalMemPipeline::init()
{
- computeUnit = cu;
- globalMemSize = computeUnit->shader->globalMemSize;
- _name = computeUnit->name() + ".GlobalMemPipeline";
+ globalMemSize = computeUnit.shader->globalMemSize;
}
bool
return true;
}
+void
+GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
+{
+ // We require one token from the coalescer's uncoalesced table to
+ // proceed
+ int token_count = 1;
+
+ DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
+ assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
+ mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
+}
+
+bool
+GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
+{
+ // Ensure we haven't exceeded the maximum number of vmem requests
+ // for this wavefront
+ if ((mp->wavefront()->outstandingReqsRdGm
+ + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
+ return false;
+ }
+
+ return true;
+}
+
void
GlobalMemPipeline::exec()
{
// check the VRF to see if the operands of a load (or load component
// of an atomic) are accessible
- if ((m) && (m->isLoad() || m->isAtomicRet())) {
+ if (m && (m->isLoad() || m->isAtomicRet())) {
w = m->wavefront();
- accessVrf =
- w->computeUnit->vrf[w->simdId]->
- vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
+ accessVrf = w->computeUnit->vrf[w->simdId]->
+ canScheduleWriteOperandsFromLoad(w, m);
+
}
- if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
- accessVrf && m->statusBitVector == VectorMask(0) &&
- (computeUnit->shader->coissue_return ||
- computeUnit->wfWait.at(m->pipeId).rdy())) {
+ if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
+ accessVrf && (computeUnit.shader->coissue_return ||
+ computeUnit.vectorGlobalMemUnit.rdy())) {
w = m->wavefront();
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
+ m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
+ if (m->isFlat()) {
+ w->decLGKMInstsIssued();
+ }
+ w->decVMemInstsIssued();
+
+ if (m->isLoad() || m->isAtomicRet()) {
+ w->computeUnit->vrf[w->simdId]->
+ scheduleWriteOperandsFromLoad(w, m);
+ }
completeRequest(m);
- // Decrement outstanding register count
- computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+ Tick accessTime = curTick() - m->getAccessTime();
- if (m->isStore() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+ // Decrement outstanding requests count
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+ if (m->isStore() || m->isAtomic() || m->isMemSync()) {
+ computeUnit.shader->sampleStore(accessTime);
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
m->time, -1);
}
- if (m->isLoad() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+ if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
+ computeUnit.shader->sampleLoad(accessTime);
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
m->time, -1);
}
+ w->validateRequestCounters();
+
+    // Generate stats for round-trip time for vector memory insts
+ // going all the way to memory and stats for individual cache
+ // blocks generated by the instruction.
+ m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
+ computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
+ computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
+
// Mark write bus busy for appropriate amount of time
- computeUnit->glbMemToVrfBus.set(m->time);
- if (!computeUnit->shader->coissue_return)
- w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+ computeUnit.glbMemToVrfBus.set(m->time);
+ if (!computeUnit.shader->coissue_return)
+ w->computeUnit->vectorGlobalMemUnit.set(m->time);
}
// If pipeline has executed a global memory instruction
DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
mp->disassemble(), mp->seqNum());
- // Memfences will not return tokens and must be issued so we should
- // not request one as this will deplete the token count until deadlock
- if (!mp->isMemFence()) {
- assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
- mp->computeUnit()->getTokenManager()->acquireTokens(1);
- }
mp->initiateAcc(mp);
- if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+ if (mp->isStore() && mp->isGlobalSeg()) {
+ mp->wavefront()->decExpInstsIssued();
+ }
+
+ if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
/**
* if we are not in out-of-order data delivery mode
* then we keep the responses sorted in program order.
std::make_pair(mp, false)));
}
+ if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
+ /**
+     * Memory access instructions that do not generate any memory
+     * requests (such as out-of-bounds buffer accesses where all lanes
+     * are out of bounds) will not trigger a callback to complete the
+     * request, so we need to mark it as completed as soon as it is
+     * issued. Note that this will still insert an entry in the
+     * ordered return FIFO such that waitcnt is still resolved
+     * correctly.
+ */
+ handleResponse(mp);
+ computeUnit.getTokenManager()->recvTokens(1);
+ }
+
gmIssuedRequests.pop();
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
- computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+ computeUnit.cu_id, mp->simdId, mp->wfSlotId);
}
}
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
- if (outOfOrderDataDelivery) {
- if (!gmReturnedLoads.empty()) {
- return gmReturnedLoads.front();
- } else if (!gmReturnedStores.empty()) {
- return gmReturnedStores.front();
- }
- } else {
- if (!gmOrderedRespBuffer.empty()) {
- auto mem_req = gmOrderedRespBuffer.begin();
+ if (!gmOrderedRespBuffer.empty()) {
+ auto mem_req = gmOrderedRespBuffer.begin();
- if (mem_req->second.second) {
- return mem_req->second.first;
- }
+ if (mem_req->second.second) {
+ return mem_req->second.first;
}
}
--inflightStores;
}
- if (outOfOrderDataDelivery) {
- if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
- assert(!gmReturnedLoads.empty());
- gmReturnedLoads.pop();
- } else if (gpuDynInst->isStore()) {
- assert(!gmReturnedStores.empty());
- gmReturnedStores.pop();
- }
- } else {
- // we should only pop the oldest requst, and it
- // should be marked as done if we are here
- assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
- assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
- assert(gmOrderedRespBuffer.begin()->second.second);
- // remove this instruction from the buffer by its
- // unique seq ID
- gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
- }
+    // we should only pop the oldest request, and it
+ // should be marked as done if we are here
+ assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
+ assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
+ assert(gmOrderedRespBuffer.begin()->second.second);
+ // remove this instruction from the buffer by its
+ // unique seq ID
+ gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
+ gpuDynInst->setAccessTime(curTick());
+ gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
gmIssuedRequests.push(gpuDynInst);
}
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
- if (outOfOrderDataDelivery) {
- if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
- assert(isGMLdRespFIFOWrRdy());
- gmReturnedLoads.push(gpuDynInst);
- } else {
- assert(isGMStRespFIFOWrRdy());
- gmReturnedStores.push(gpuDynInst);
- }
- } else {
- auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
- // if we are getting a response for this mem request,
- // then it ought to already be in the ordered response
- // buffer
- assert(mem_req != gmOrderedRespBuffer.end());
- mem_req->second.second = true;
- }
+ auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
+ // if we are getting a response for this mem request,
+ // then it ought to already be in the ordered response
+ // buffer
+ assert(mem_req != gmOrderedRespBuffer.end());
+ mem_req->second.second = true;
}
-void
-GlobalMemPipeline::regStats()
+GlobalMemPipeline::
+GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
+ : Stats::Group(parent, "GlobalMemPipeline"),
+ ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
+ "are delayed before updating the VRF")
{
- loadVrfBankConflictCycles
- .name(name() + ".load_vrf_bank_conflict_cycles")
- .desc("total number of cycles GM data are delayed before updating "
- "the VRF")
- ;
}