From 741261f10bb308cdc200c5dfd8eb68567349cf19 Mon Sep 17 00:00:00 2001
From: Sean Wilson
Date: Tue, 27 Jun 2017 14:18:10 -0500
Subject: [PATCH] gpu-compute: Refactor some Event subclasses to lambdas

Change-Id: Ic1332b8e8ba0afacbe591c80f4d06afbf5f04bd9
Signed-off-by: Sean Wilson
Reviewed-on: https://gem5-review.googlesource.com/3922
Reviewed-by: Jason Lowe-Power
Reviewed-by: Anthony Gutierrez
Maintainer: Anthony Gutierrez
---
 src/gpu-compute/compute_unit.cc  | 63 ++++++++++++++-------------
 src/gpu-compute/compute_unit.hh  | 36 ++--------------
 src/gpu-compute/dispatcher.cc    | 21 ++-------
 src/gpu-compute/dispatcher.hh    | 14 +-----
 src/gpu-compute/shader.cc        | 38 +++++++----------
 src/gpu-compute/shader.hh        | 14 +-----
 src/gpu-compute/tlb_coalescer.cc | 73 +++++++++++++-------------------
 src/gpu-compute/tlb_coalescer.hh | 37 ++++------------
 8 files changed, 96 insertions(+), 200 deletions(-)

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index ffa5243d2..87f29eb68 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -669,9 +669,8 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
         return true;
     }
 
-    ComputeUnit::DataPort::MemRespEvent *mem_resp_event =
-        new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index],
-                                                pkt);
+    EventFunctionWrapper *mem_resp_event =
+        computeUnit->memPort[index]->createMemRespEvent(pkt);
 
     DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
             computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
@@ -845,8 +844,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
         // translation is done. Schedule the mem_req_event at the
        // appropriate cycle to send the timing memory request to ruby
 
-        ComputeUnit::DataPort::MemReqEvent *mem_req_event =
-            new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+        EventFunctionWrapper *mem_req_event =
+            memPort[index]->createMemReqEvent(pkt);
 
         DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                 "scheduled\n", cu_id, gpuDynInst->simdId,
@@ -923,8 +922,8 @@
 void
 ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
 {
-    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
-        new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+    EventFunctionWrapper *mem_req_event =
+        memPort[index]->createMemReqEvent(pkt);
 
     // New SenderState for the memory access
 
@@ -972,26 +971,20 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
     sendSyncRequest(gpuDynInst, 0, pkt);
 }
 
-const char*
-ComputeUnit::DataPort::MemRespEvent::description() const
-{
-    return "ComputeUnit memory response event";
-}
-
 void
-ComputeUnit::DataPort::MemRespEvent::process()
+ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
 {
     DataPort::SenderState *sender_state =
        safe_cast<DataPort::SenderState*>(pkt->senderState);
 
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
-    ComputeUnit *compute_unit = dataPort->computeUnit;
+    ComputeUnit *compute_unit = computeUnit;
 
     assert(gpuDynInst);
 
     DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
             compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
-            pkt->req->getPaddr(), dataPort->index);
+            pkt->req->getPaddr(), index);
 
     Addr paddr = pkt->req->getPaddr();
 
@@ -1045,8 +1038,9 @@ ComputeUnit::DataPort::MemRespEvent::process()
         // this memory request
         if (gpuDynInst->useContinuation) {
             assert(!gpuDynInst->isNoScope());
-            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
-                                         gpuDynInst);
+            gpuDynInst->execContinuation(
+                gpuDynInst->staticInstruction(),
+                gpuDynInst);
         }
     }
 }
@@ -1230,9 +1224,8 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
     // translation is done. Schedule the mem_req_event at the appropriate
     // cycle to send the timing memory request to ruby
 
-    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
-        new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
-                                               new_pkt);
+    EventFunctionWrapper *mem_req_event =
+        computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);
 
     DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
             computeUnit->cu_id, gpuDynInst->simdId,
@@ -1244,32 +1237,42 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
     return true;
 }
 
-const char*
-ComputeUnit::DataPort::MemReqEvent::description() const
+EventFunctionWrapper*
+ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
+{
+    return new EventFunctionWrapper(
+        [this, pkt]{ processMemReqEvent(pkt); },
+        "ComputeUnit memory request event", true);
+}
+
+EventFunctionWrapper*
+ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
 {
-    return "ComputeUnit memory request event";
+    return new EventFunctionWrapper(
+        [this, pkt]{ processMemRespEvent(pkt); },
+        "ComputeUnit memory response event", true);
 }
 
 void
-ComputeUnit::DataPort::MemReqEvent::process()
+ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
 {
     SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
-    ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
+    ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
 
-    if (!(dataPort->sendTimingReq(pkt))) {
-        dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
+    if (!(sendTimingReq(pkt))) {
+        retries.push_back(std::make_pair(pkt, gpuDynInst));
 
         DPRINTF(GPUPort,
                 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                 compute_unit->cu_id, gpuDynInst->simdId,
-                gpuDynInst->wfSlotId, dataPort->index,
+                gpuDynInst->wfSlotId, index,
                 pkt->req->getPaddr());
     } else {
         DPRINTF(GPUPort,
                 "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
                 compute_unit->cu_id, gpuDynInst->simdId,
-                gpuDynInst->wfSlotId, dataPort->index,
+                gpuDynInst->wfSlotId, index,
                 pkt->req->getPaddr());
     }
 }
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 4a1c09c27..150228694 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -440,39 +440,11 @@ class ComputeUnit : public MemObject
               saved(sender_state) { }
         };
 
-        class MemReqEvent : public Event
-        {
-          private:
-            DataPort *dataPort;
-            PacketPtr pkt;
-
-          public:
-            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
-                : Event(), dataPort(_data_port), pkt(_pkt)
-            {
-                setFlags(Event::AutoDelete);
-            }
-
-            void process();
-            const char *description() const;
-        };
+        void processMemReqEvent(PacketPtr pkt);
+        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
 
-        class MemRespEvent : public Event
-        {
-          private:
-            DataPort *dataPort;
-            PacketPtr pkt;
-
-          public:
-            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
-                : Event(), dataPort(_data_port), pkt(_pkt)
-            {
-                setFlags(Event::AutoDelete);
-            }
-
-            void process();
-            const char *description() const;
-        };
+        void processMemRespEvent(PacketPtr pkt);
+        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
 
         std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
 
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 2ce96ec34..7fd1101b1 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -50,7 +50,9 @@ GpuDispatcher::GpuDispatcher(const Params *p)
     : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
       pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
       dispatchCount(0), dispatchActive(false), cpu(p->cpu),
-      shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
+      shader(p->shader_pointer), driver(p->cl_driver),
+      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
+                false, Event::CPU_Tick_Pri)
 {
     shader->handshake(this);
     driver->handshake(this);
@@ -363,23 +365,6 @@ GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
     }
 }
 
-GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
-    : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
-{
-}
-
-void
-GpuDispatcher::TickEvent::process()
-{
-    dispatcher->exec();
-}
-
-const char*
-GpuDispatcher::TickEvent::description() const
-{
-    return "GPU Dispatcher tick";
-}
-
 // helper functions for driver to retrieve GPU attributes
 int
 GpuDispatcher::getNumCUs()
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
index f5e89e8aa..50a1d800e 100644
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -55,17 +55,6 @@ class GpuDispatcher : public DmaDevice
   public:
     typedef GpuDispatcherParams Params;
 
-    class TickEvent : public Event
-    {
-      private:
-        GpuDispatcher *dispatcher;
-
-      public:
-        TickEvent(GpuDispatcher *);
-        void process();
-        const char *description() const;
-    };
-
     MasterID masterId() { return _masterId; }
 
   protected:
@@ -93,7 +82,8 @@ class GpuDispatcher : public DmaDevice
     BaseCPU *cpu;
     Shader *shader;
     ClDriver *driver;
-    TickEvent tickEvent;
+    EventFunctionWrapper tickEvent;
+
     static GpuDispatcher *instance;
 
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index 6d6154503..41671f85b 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -50,14 +50,17 @@
 #include "mem/ruby/system/RubySystem.hh"
 #include "sim/sim_exit.hh"
 
-Shader::Shader(const Params *p) : ClockedObject(p),
-    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
-    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
-    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
-    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
-    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
-    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
-    box_tick_cnt(0), start_tick_cnt(0)
+Shader::Shader(const Params *p)
+    : ClockedObject(p), clock(p->clk_domain->clockPeriod()),
+      cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer),
+      tickEvent([this]{ processTick(); }, "Shader tick",
+                false, Event::CPU_Tick_Pri),
+      timingSim(p->timing), hsail_mode(SIMT),
+      impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+      separate_acquire_release(p->separate_acquire_release), coissue_return(1),
+      trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
+      globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
+      box_tick_cnt(0), start_tick_cnt(0)
 {
     cuList.resize(n_cu);
 
@@ -317,27 +320,16 @@ Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
     ++sa_n;
 }
 
-Shader::TickEvent::TickEvent(Shader *_shader)
-    : Event(CPU_Tick_Pri), shader(_shader)
-{
-}
-
 void
-Shader::TickEvent::process()
+Shader::processTick()
 {
-    if (shader->busy()) {
-        shader->exec();
-        shader->schedule(this, curTick() + shader->ticks(1));
+    if (busy()) {
+        exec();
+        schedule(tickEvent, curTick() + ticks(1));
     }
 }
 
-const char*
-Shader::TickEvent::description() const
-{
-    return "Shader tick";
-}
-
 void
 Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                   MemCmd cmd, bool suppress_func_errors)
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 55c3feef9..f9c1ad4b2 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -99,18 +99,8 @@ class Shader : public ClockedObject
     ThreadContext *gpuTc;
     BaseCPU *cpuPointer;
 
-    class TickEvent : public Event
-    {
-      private:
-        Shader *shader;
-
-      public:
-        TickEvent(Shader*);
-        void process();
-        const char* description() const;
-    };
-
-    TickEvent tickEvent;
+    void processTick();
+    EventFunctionWrapper tickEvent;
 
     // is this simulation going to be timing mode in the memory?
     bool timingSim;
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc
index c9b888d5f..9b6c9e941 100644
--- a/src/gpu-compute/tlb_coalescer.cc
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -39,11 +39,18 @@
 
 #include "debug/GPUTLB.hh"
 
-TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p),
-    clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle),
-    coalescingWindow(p->coalescingWindow),
-    disableCoalescing(p->disableCoalescing), probeTLBEvent(this),
-    cleanupEvent(this)
+TLBCoalescer::TLBCoalescer(const Params *p)
+    : MemObject(p),
+      clock(p->clk_domain->clockPeriod()),
+      TLBProbesPerCycle(p->probesPerCycle),
+      coalescingWindow(p->coalescingWindow),
+      disableCoalescing(p->disableCoalescing),
+      probeTLBEvent([this]{ processProbeTLBEvent(); },
+                    "Probe the TLB below",
+                    false, Event::CPU_Tick_Pri),
+      cleanupEvent([this]{ processCleanupEvent(); },
+                   "Cleanup issuedTranslationsTable hashmap",
+                   false, Event::Maximum_Pri)
 {
     // create the slave ports based on the number of connected ports
     for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
@@ -390,17 +397,6 @@ TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
     fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
 }
 
-TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer)
-    : Event(CPU_Tick_Pri), coalescer(_coalescer)
-{
-}
-
-const char*
-TLBCoalescer::IssueProbeEvent::description() const
-{
-    return "Probe the TLB below";
-}
-
 /*
  * Here we scan the coalescer FIFO and issue the max
  * number of permitted probes to the TLB below. We
  * permit bypassing of coalesced requests for the same
  * tick_index.
  *
  * We do not access the next tick_index unless we've
  * drained the previous one. The coalesced requests
  * that are successfully sent are moved to the
  * issuedTranslationsTable table (the table which keeps
@@ -414,7 +410,7 @@
  * track of the outstanding reqs)
  */
 void
-TLBCoalescer::IssueProbeEvent::process()
+TLBCoalescer::processProbeTLBEvent()
 {
     // number of TLB probes sent so far
     int sent_probes = 0;
@@ -425,10 +421,10 @@ TLBCoalescer::IssueProbeEvent::process()
     // returns false or when there is another outstanding request for the
     // same virt. page.
 
-    DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n");
+    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);
 
-    for (auto iter = coalescer->coalescerFIFO.begin();
-         iter != coalescer->coalescerFIFO.end() && !rejected; ) {
+    for (auto iter = coalescerFIFO.begin();
+         iter != coalescerFIFO.end() && !rejected; ) {
         int coalescedReq_cnt = iter->second.size();
         int i = 0;
         int vector_index = 0;
@@ -446,7 +442,7 @@ TLBCoalescer::IssueProbeEvent::process()
 
            // is there another outstanding request for the same page addr?
            int pending_reqs =
-                coalescer->issuedTranslationsTable.count(virt_page_addr);
+                issuedTranslationsTable.count(virt_page_addr);
 
            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
@@ -459,7 +455,7 @@ TLBCoalescer::IssueProbeEvent::process()
            }
 
            // send the coalesced request for virt_page_addr
-            if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) {
+            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
                        virt_page_addr);
@@ -479,22 +475,22 @@ TLBCoalescer::IssueProbeEvent::process()
                // by the one we just sent counting all the way from
                // the top of TLB hiearchy (i.e., from the CU)
                int req_cnt = tmp_sender_state->reqCnt.back();
-                coalescer->queuingCycles += (curTick() * req_cnt);
+                queuingCycles += (curTick() * req_cnt);
 
                DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
-                        coalescer->name(), req_cnt);
+                        name(), req_cnt);
 
                // pkt_cnt is number of packets we coalesced into the one
                // we just sent but only at this coalescer level
                int pkt_cnt = iter->second[vector_index].size();
-                coalescer->localqueuingCycles += (curTick() * pkt_cnt);
+                localqueuingCycles += (curTick() * pkt_cnt);
            }
 
            DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
                    virt_page_addr);
 
            //copy coalescedReq to issuedTranslationsTable
-            coalescer->issuedTranslationsTable[virt_page_addr]
+            issuedTranslationsTable[virt_page_addr]
                = iter->second[vector_index];
 
            //erase the entry of this coalesced req
@@ -504,7 +500,7 @@ TLBCoalescer::IssueProbeEvent::process()
            assert(i == coalescedReq_cnt);
            sent_probes++;
 
-            if (sent_probes == coalescer->TLBProbesPerCycle)
+            if (sent_probes == TLBProbesPerCycle)
                return;
        }
    }
@@ -512,31 +508,20 @@ TLBCoalescer::IssueProbeEvent::process()
        //if there are no more coalesced reqs for this tick_index
        //erase the hash_map with the first iterator
        if (iter->second.empty()) {
-            coalescer->coalescerFIFO.erase(iter++);
+            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
 }
 
-TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer)
-    : Event(Maximum_Pri), coalescer(_coalescer)
-{
-}
-
-const char*
-TLBCoalescer::CleanupEvent::description() const
-{
-    return "Cleanup issuedTranslationsTable hashmap";
-}
-
 void
-TLBCoalescer::CleanupEvent::process()
+TLBCoalescer::processCleanupEvent()
 {
-    while (!coalescer->cleanupQueue.empty()) {
-        Addr cleanup_addr = coalescer->cleanupQueue.front();
-        coalescer->cleanupQueue.pop();
-        coalescer->issuedTranslationsTable.erase(cleanup_addr);
+    while (!cleanupQueue.empty()) {
+        Addr cleanup_addr = cleanupQueue.front();
+        cleanupQueue.pop();
+        issuedTranslationsTable.erase(cleanup_addr);
 
        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh
index 09210148b..b03e77150 100644
--- a/src/gpu-compute/tlb_coalescer.hh
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -214,35 +214,14 @@ class TLBCoalescer : public MemObject
     BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx);
     BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx);
 
-    class IssueProbeEvent : public Event
-    {
-      private:
-        TLBCoalescer *coalescer;
-
-      public:
-        IssueProbeEvent(TLBCoalescer *_coalescer);
-        void process();
-        const char *description() const;
-    };
-
-    // this event issues the TLB probes
-    IssueProbeEvent probeTLBEvent;
-
-    // the cleanupEvent is scheduled after a TLBEvent triggers
-    // in order to free memory and do the required clean-up
-    class CleanupEvent : public Event
-    {
-      private:
-        TLBCoalescer *coalescer;
-
-      public:
-        CleanupEvent(TLBCoalescer *_coalescer);
-        void process();
-        const char* description() const;
-    };
-
-    // schedule cleanup
-    CleanupEvent cleanupEvent;
+    void processProbeTLBEvent();
+    /// This event issues the TLB probes
+    EventFunctionWrapper probeTLBEvent;
+
+    void processCleanupEvent();
+    /// The cleanupEvent is scheduled after a TLBEvent triggers
+    /// in order to free memory and do the required clean-up
+    EventFunctionWrapper cleanupEvent;
 
     // this FIFO queue keeps track of the virt. page
     // addresses that are pending cleanup
-- 
2.30.2
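
The refactoring pattern applied in every file above is the same. For
reference, here is a minimal before/after sketch of that pattern against a
hypothetical SimObject called Widget (Widget is not part of this patch; the
Event and EventFunctionWrapper interfaces are assumed to match the gem5
revision this patch targets):

    // Before: a one-off Event subclass carrying a back-pointer to its owner.
    class WidgetBefore : public SimObject
    {
        class TickEvent : public Event
        {
          private:
            WidgetBefore *widget;

          public:
            TickEvent(WidgetBefore *w) : Event(CPU_Tick_Pri), widget(w) { }
            void process() { widget->tick(); }
            const char *description() const { return "Widget tick"; }
        };

        TickEvent tickEvent;

      public:
        WidgetBefore(const Params *p) : SimObject(p), tickEvent(this) { }
        void tick();
    };

    // After: the lambda captures `this`, so the subclass and back-pointer
    // disappear. The constructor arguments mirror the ones used throughout
    // this patch: callback, debug name, auto-delete flag, and priority.
    class WidgetAfter : public SimObject
    {
        EventFunctionWrapper tickEvent;

      public:
        WidgetAfter(const Params *p)
            : SimObject(p),
              tickEvent([this]{ tick(); }, "Widget tick",
                        false, Event::CPU_Tick_Pri)
        { }
        void tick();
    };

For per-packet events such as the DataPort memory request/response events,
the wrapper is heap-allocated instead and the auto-delete flag is passed as
true, which replaces the old setFlags(Event::AutoDelete) idiom.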