From: Tony Gutierrez Date: Fri, 29 Mar 2019 21:48:39 +0000 (-0400) Subject: gpu-compute: Create CU's ports in the standard way X-Git-Tag: v20.1.0.0~218 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=94000aefe610d7084eb142cd41a8c66cd4670bbd;p=gem5.git gpu-compute: Create CU's ports in the standard way The CU would initialize its ports in getMasterPort(), which is not desirable as getMasterPort() may be called several times for the same port. This can lead to a fatal if the CU expects to only create a single port of a given type, and may lead to other issues where stat names are duplicated. This change instantiates and initializes the CU's ports in the CU constructor using the CU params. The index field is also removed from the CU's ports because the base class already has an ID field, which will be set to the default value in the base class's constructor for scalar ports. It doesn't make sense for scalar ports to take an index because they are scalar, so we let the base class initialize the ID to the invalid port ID. 
Change-Id: Id18386f5f53800a6447d968380676d8fd9bac9df Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32836 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 9a41233b6..2d64fa3c7 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -96,6 +96,11 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(this, "ComputeUnit")), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), + ldsPort(csprintf("%s-port", name()), this), + scalarDataPort(csprintf("%s-port", name()), this), + scalarDTLBPort(csprintf("%s-port", name()), this), + sqcPort(csprintf("%s-port", name()), this), + sqcTLBPort(csprintf("%s-port", name()), this), _cacheLineSize(p->system->cacheLineSize()), _numBarrierSlots(p->num_barrier_slots), globalSeqNum(0), wavefrontSize(p->wf_size), @@ -169,16 +174,18 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fatal("Invalid WF execution policy (CU)\n"); } - memPort.resize(wfSize()); + for (int i = 0; i < p->port_memory_port_connection_count; ++i) { + memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i); + } + + for (int i = 0; i < p->port_translation_port_connection_count; ++i) { + tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i); + } // Setup tokens for slave ports. The number of tokens in memSlaveTokens // is the total token count for the entire vector port (i.e., this CU). memPortTokens = new TokenManager(p->max_cu_tokens); - // resize the tlbPort vectorArray - int tlbPort_width = perLaneTLB ? 
wfSize() : 1; - tlbPort.resize(tlbPort_width); - registerExitCallback([this]() { exitCallback(); }); lastExecCycle.resize(numVectorALUs, 0); @@ -214,7 +221,6 @@ ComputeUnit::~ComputeUnit() lastVaddrSimd[j].clear(); } lastVaddrCU.clear(); - delete ldsPort; } int @@ -781,7 +787,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) // appropriate cycle to process the timing memory response // This delay represents the pipeline delay SenderState *sender_state = safe_cast(pkt->senderState); - int index = sender_state->port_index; + PortID index = sender_state->port_index; GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; GPUDispatcher &dispatcher = computeUnit->shader->dispatcher(); @@ -886,7 +892,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) } EventFunctionWrapper *mem_resp_event = - computeUnit->memPort[index]->createMemRespEvent(pkt); + computeUnit->memPort[index].createMemRespEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n", @@ -1007,7 +1013,7 @@ ComputeUnit::SQCPort::recvReqRetry() } void -ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) { // There must be a way around this check to do the globalMemStart... Addr tmp_vaddr = pkt->req->getVaddr(); @@ -1039,7 +1045,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) tlbCycles -= curTick(); ++tlbRequests; - int tlbPort_index = perLaneTLB ? index : 0; + PortID tlbPort_index = perLaneTLB ? 
index : 0; if (shader->timingSim) { if (debugSegFault) { @@ -1074,7 +1080,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) pkt->senderState = translation_state; if (functionalTLB) { - tlbPort[tlbPort_index]->sendFunctional(pkt); + tlbPort[tlbPort_index].sendFunctional(pkt); // update the hitLevel distribution int hit_level = translation_state->hitLevel; @@ -1117,33 +1123,33 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) // translation is done. Schedule the mem_req_event at the // appropriate cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = - memPort[index]->createMemReqEvent(pkt); + memPort[index].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " "scheduled\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); - } else if (tlbPort[tlbPort_index]->isStalled()) { - assert(tlbPort[tlbPort_index]->retries.size() > 0); + } else if (tlbPort[tlbPort_index].isStalled()) { + assert(tlbPort[tlbPort_index].retries.size() > 0); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); - tlbPort[tlbPort_index]->retries.push_back(pkt); - } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { + tlbPort[tlbPort_index].retries.push_back(pkt); + } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) { // Stall the data port; // No more packet will be issued till // ruby indicates resources are freed by // a recvReqRetry() call back on this port. 
- tlbPort[tlbPort_index]->stallPort(); + tlbPort[tlbPort_index].stallPort(); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); - tlbPort[tlbPort_index]->retries.push_back(pkt); + tlbPort[tlbPort_index].retries.push_back(pkt); } else { DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n", @@ -1163,7 +1169,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc); - tlbPort[tlbPort_index]->sendFunctional(pkt); + tlbPort[tlbPort_index].sendFunctional(pkt); // the addr of the packet is not modified, so we need to create a new // packet, or otherwise the memory access will have the old virtual @@ -1173,7 +1179,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) new_pkt->dataStatic(pkt->getPtr()); // Translation is done. It is safe to send the packet to memory. - memPort[0]->sendFunctional(new_pkt); + memPort[0].sendFunctional(new_pkt); DPRINTF(GPUMem, "Functional sendRequest\n"); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, @@ -1205,12 +1211,12 @@ ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt) new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false, pkt->senderState); - if (scalarDTLBPort->isStalled()) { - assert(scalarDTLBPort->retries.size()); - scalarDTLBPort->retries.push_back(pkt); - } else if (!scalarDTLBPort->sendTimingReq(pkt)) { - scalarDTLBPort->stallPort(); - scalarDTLBPort->retries.push_back(pkt); + if (scalarDTLBPort.isStalled()) { + assert(scalarDTLBPort.retries.size()); + scalarDTLBPort.retries.push_back(pkt); + } else if (!scalarDTLBPort.sendTimingReq(pkt)) { + scalarDTLBPort.stallPort(); + scalarDTLBPort.retries.push_back(pkt); } else { DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n", tlb_mode == BaseTLB::Read ? 
"read" : "write", @@ -1246,7 +1252,7 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = - memPort[0]->createMemReqEvent(pkt); + memPort[0].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " "an acquire\n", cu_id, gpuDynInst->simdId, @@ -1266,7 +1272,7 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = - memPort[0]->createMemReqEvent(pkt); + memPort[0].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " "a release\n", cu_id, gpuDynInst->simdId, @@ -1284,7 +1290,7 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = - memPort[0]->createMemReqEvent(pkt); + memPort[0].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", @@ -1308,7 +1314,7 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - pkt->req->getPaddr(), index); + pkt->req->getPaddr(), id); Addr paddr = pkt->req->getPaddr(); @@ -1321,7 +1327,7 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) int index = gpuDynInst->memStatusVector[paddr].back(); DPRINTF(GPUMem, "Response for addr %#x, index %d\n", - pkt->req->getPaddr(), index); + pkt->req->getPaddr(), id); gpuDynInst->memStatusVector[paddr].pop_back(); gpuDynInst->pAddr = pkt->req->getPaddr(); @@ -1425,7 +1431,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; - int mp_index = sender_state->portIndex; + PortID mp_index = sender_state->portIndex; Addr vaddr = 
pkt->req->getVaddr(); gpuDynInst->memStatusVector[line].push_back(mp_index); gpuDynInst->tlbHitLevel[mp_index] = hit_level; @@ -1535,7 +1541,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) // translation is done. Schedule the mem_req_event at the appropriate // cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = - computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt); + computeUnit->memPort[mp_index].createMemReqEvent(new_pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n", computeUnit->cu_id, gpuDynInst->simdId, @@ -1575,14 +1581,13 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", - compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, index, - pkt->req->getPaddr()); + compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + id, pkt->req->getPaddr()); } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, pkt->req->getPaddr()); } } @@ -1598,22 +1603,21 @@ ComputeUnit::ScalarDataPort::MemReqEvent::process() { SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; - ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort->computeUnit; + ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort.computeUnit; - if (!(scalarDataPort->sendTimingReq(pkt))) { - scalarDataPort->retries.push_back(pkt); + if (!(scalarDataPort.sendTimingReq(pkt))) { + scalarDataPort.retries.push_back(pkt); DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", + "CU%d: WF[%d][%d]: addr %#x data req failed!\n", compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, scalarDataPort->index, - pkt->req->getPaddr()); + gpuDynInst->wfSlotId, 
pkt->req->getPaddr()); } else { DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " + "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data " "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->seqNum(), - scalarDataPort->index, pkt->req->getPaddr()); + pkt->req->getPaddr()); } } @@ -1702,8 +1706,8 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt) req_pkt->senderState = new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst); - if (!computeUnit->scalarDataPort->sendTimingReq(req_pkt)) { - computeUnit->scalarDataPort->retries.push_back(req_pkt); + if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) { + computeUnit->scalarDataPort.retries.push_back(req_pkt); DPRINTF(GPUMem, "send scalar req failed for: %s\n", gpuDynInst->disassemble()); } else { @@ -2544,7 +2548,7 @@ ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst) // This is the SenderState needed upon return newPacket->senderState = new LDSPort::SenderState(gpuDynInst); - return ldsPort->sendTimingReq(newPacket); + return ldsPort.sendTimingReq(newPacket); } /** diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 211dd5350..f7484af87 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -448,7 +448,7 @@ class ComputeUnit : public ClockedObject void doSmReturn(GPUDynInstPtr gpuDynInst); virtual void init() override; - void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt); void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt); void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, @@ -652,16 +652,15 @@ class ComputeUnit : public ClockedObject class DataPort : public RequestPort { public: - DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) - : RequestPort(_name, _cu), computeUnit(_cu), - index(_index) { } + DataPort(const std::string &_name, 
ComputeUnit *_cu, PortID id) + : RequestPort(_name, _cu, id), computeUnit(_cu) { } bool snoopRangeSent; struct SenderState : public Packet::SenderState { GPUDynInstPtr _gpuDynInst; - int port_index; + PortID port_index; Packet::SenderState *saved; SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, @@ -681,7 +680,6 @@ class ComputeUnit : public ClockedObject protected: ComputeUnit *computeUnit; - int index; virtual bool recvTimingResp(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt) { return 0; } @@ -702,11 +700,9 @@ class ComputeUnit : public ClockedObject class ScalarDataPort : public RequestPort { public: - ScalarDataPort(const std::string &_name, ComputeUnit *_cu, - PortID _index) - : RequestPort(_name, _cu, _index), computeUnit(_cu), index(_index) + ScalarDataPort(const std::string &_name, ComputeUnit *_cu) + : RequestPort(_name, _cu), computeUnit(_cu) { - (void)index; } bool recvTimingResp(PacketPtr pkt) override; @@ -727,11 +723,11 @@ class ComputeUnit : public ClockedObject class MemReqEvent : public Event { private: - ScalarDataPort *scalarDataPort; + ScalarDataPort &scalarDataPort; PacketPtr pkt; public: - MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt) + MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt) : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt) { setFlags(Event::AutoDelete); @@ -745,16 +741,14 @@ class ComputeUnit : public ClockedObject private: ComputeUnit *computeUnit; - PortID index; }; // Instruction cache access port class SQCPort : public RequestPort { public: - SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) - : RequestPort(_name, _cu), computeUnit(_cu), - index(_index) { } + SQCPort(const std::string &_name, ComputeUnit *_cu) + : RequestPort(_name, _cu), computeUnit(_cu) { } bool snoopRangeSent; @@ -775,7 +769,6 @@ class ComputeUnit : public ClockedObject protected: ComputeUnit *computeUnit; - int index; virtual bool recvTimingResp(PacketPtr pkt); virtual Tick 
recvAtomic(PacketPtr pkt) { return 0; } @@ -795,9 +788,9 @@ class ComputeUnit : public ClockedObject class DTLBPort : public RequestPort { public: - DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) - : RequestPort(_name, _cu), computeUnit(_cu), - index(_index), stalled(false) + DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id) + : RequestPort(_name, _cu, id), computeUnit(_cu), + stalled(false) { } bool isStalled() { return stalled; } @@ -820,7 +813,7 @@ class ComputeUnit : public ClockedObject // the lane in the memInst this is associated with, so we send // the memory request down the right port - int portIndex; + PortID portIndex; // constructor used for packets involved in timing accesses SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) @@ -830,7 +823,6 @@ class ComputeUnit : public ClockedObject protected: ComputeUnit *computeUnit; - int index; bool stalled; virtual bool recvTimingResp(PacketPtr pkt); @@ -913,8 +905,8 @@ class ComputeUnit : public ClockedObject class LDSPort : public RequestPort { public: - LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) - : RequestPort(_name, _cu, _id), computeUnit(_cu) + LDSPort(const std::string &_name, ComputeUnit *_cu) + : RequestPort(_name, _cu), computeUnit(_cu) { } @@ -983,13 +975,7 @@ class ComputeUnit : public ClockedObject /** The port to access the Local Data Store * Can be connected to a LDS object */ - LDSPort *ldsPort = nullptr; - - LDSPort * - getLdsPort() const - { - return ldsPort; - } + LDSPort ldsPort; TokenManager * getTokenManager() @@ -1000,54 +986,37 @@ class ComputeUnit : public ClockedObject /** The memory port for SIMD data accesses. 
* Can be connected to PhysMem for Ruby for timing simulations */ - std::vector memPort; + std::vector memPort; // port to the TLB hierarchy (i.e., the L1 TLB) - std::vector tlbPort; + std::vector tlbPort; // port to the scalar data cache - ScalarDataPort *scalarDataPort; + ScalarDataPort scalarDataPort; // port to the scalar data TLB - ScalarDTLBPort *scalarDTLBPort; + ScalarDTLBPort scalarDTLBPort; // port to the SQC (i.e. the I-cache) - SQCPort *sqcPort; + SQCPort sqcPort; // port to the SQC TLB (there's a separate TLB for each I-cache) - ITLBPort *sqcTLBPort; + ITLBPort sqcTLBPort; Port & getPort(const std::string &if_name, PortID idx) override { - if (if_name == "memory_port") { - memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), - this, idx); - return *memPort[idx]; - } else if (if_name == "translation_port") { - tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), - this, idx); - return *tlbPort[idx]; + if (if_name == "memory_port" && idx < memPort.size()) { + return memPort[idx]; + } else if (if_name == "translation_port" && idx < tlbPort.size()) { + return tlbPort[idx]; } else if (if_name == "scalar_port") { - scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(), - idx), this, idx); - return *scalarDataPort; + return scalarDataPort; } else if (if_name == "scalar_tlb_port") { - scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()), - this); - return *scalarDTLBPort; + return scalarDTLBPort; } else if (if_name == "sqc_port") { - sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), - this, idx); - return *sqcPort; + return sqcPort; } else if (if_name == "sqc_tlb_port") { - sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); - return *sqcTLBPort; + return sqcTLBPort; } else if (if_name == "ldsPort") { - if (ldsPort) { - fatal("an LDS port was already allocated"); - } - ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); - return *ldsPort; - } else if (if_name == "gmTokenPort") { - return 
gmTokenPort; + return ldsPort; } else { - panic("incorrect port name"); + return ClockedObject::getPort(if_name, idx); } } diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index ac9a5a656..3a139f530 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -174,24 +174,24 @@ FetchUnit::initiateFetch(Wavefront *wavefront) computeUnit.shader->gpuTc, false, pkt->senderState); - if (computeUnit.sqcTLBPort->isStalled()) { - assert(computeUnit.sqcTLBPort->retries.size() > 0); + if (computeUnit.sqcTLBPort.isStalled()) { + assert(computeUnit.sqcTLBPort.retries.size() > 0); DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", vaddr); - computeUnit.sqcTLBPort->retries.push_back(pkt); - } else if (!computeUnit.sqcTLBPort->sendTimingReq(pkt)) { + computeUnit.sqcTLBPort.retries.push_back(pkt); + } else if (!computeUnit.sqcTLBPort.sendTimingReq(pkt)) { // Stall the data port; // No more packet is issued till // ruby indicates resources are freed by // a recvReqRetry() call back on this port. - computeUnit.sqcTLBPort->stallPort(); + computeUnit.sqcTLBPort.stallPort(); DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", vaddr); - computeUnit.sqcTLBPort->retries.push_back(pkt); + computeUnit.sqcTLBPort.retries.push_back(pkt); } else { DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); } @@ -200,7 +200,7 @@ FetchUnit::initiateFetch(Wavefront *wavefront) new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, computeUnit.shader->gpuTc); - computeUnit.sqcTLBPort->sendFunctional(pkt); + computeUnit.sqcTLBPort.sendFunctional(pkt); TheISA::GpuTLB::TranslationState *sender_state = safe_cast(pkt->senderState); @@ -257,8 +257,8 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) if (timingSim) { // translation is done. Send the appropriate timing memory request. 
- if (!computeUnit.sqcPort->sendTimingReq(pkt)) { - computeUnit.sqcPort->retries.push_back(std::make_pair(pkt, + if (!computeUnit.sqcPort.sendTimingReq(pkt)) { + computeUnit.sqcPort.retries.push_back(std::make_pair(pkt, wavefront)); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", @@ -270,7 +270,7 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) pkt->req->getPaddr()); } } else { - computeUnit.sqcPort->sendFunctional(pkt); + computeUnit.sqcPort.sendFunctional(pkt); processFetchReturn(pkt); } } diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 1d88e855a..7b4f20f16 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -400,8 +400,8 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, // fixme: this should be cuList[cu_id] if cu_id != n_cu // The latter requires a memPort in the dispatcher - cuList[0]->memPort[0]->sendFunctional(new_pkt1); - cuList[0]->memPort[0]->sendFunctional(new_pkt2); + cuList[0]->memPort[0].sendFunctional(new_pkt1); + cuList[0]->memPort[0].sendFunctional(new_pkt2); delete new_pkt1; delete new_pkt2; @@ -419,7 +419,7 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, // fixme: this should be cuList[cu_id] if cu_id != n_cu // The latter requires a memPort in the dispatcher - cuList[0]->memPort[0]->sendFunctional(new_pkt); + cuList[0]->memPort[0].sendFunctional(new_pkt); delete new_pkt; delete pkt; @@ -507,7 +507,7 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) // it's ok tp send all accesses through lane 0 // since the lane # is not known here, // This isn't important since these are functional accesses. - cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); + cuList[cu_id]->tlbPort[0].sendFunctional(pkt); /* safe_cast the senderState */ TheISA::GpuTLB::TranslationState *sender_state =