From 5323cccfdd753d8a277df923fab822fb0da504f5 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 14 Jan 2021 10:29:37 -0600 Subject: [PATCH] arch-gcn3,gpu-compute: Update stats style for GPU Convert all gpu-compute stats to Stats::Group style. Change-Id: I29116f1de53ae379210c6cfb5bed3fc74f50cca5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/39135 Reviewed-by: Matthew Poremba Maintainer: Matthew Poremba Tested-by: kokoro --- src/arch/gcn3/insts/instructions.cc | 8 +- src/gpu-compute/compute_unit.cc | 862 ++++++------------ src/gpu-compute/compute_unit.hh | 301 +++--- src/gpu-compute/dispatcher.cc | 29 +- src/gpu-compute/dispatcher.hh | 14 +- src/gpu-compute/exec_stage.cc | 86 +- src/gpu-compute/exec_stage.hh | 43 +- src/gpu-compute/fetch_stage.cc | 17 +- src/gpu-compute/fetch_stage.hh | 11 +- src/gpu-compute/global_memory_pipeline.cc | 14 +- src/gpu-compute/global_memory_pipeline.hh | 20 +- src/gpu-compute/gpu_dyn_inst.cc | 8 +- src/gpu-compute/gpu_dyn_inst.hh | 4 +- src/gpu-compute/gpu_tlb.cc | 160 ++-- src/gpu-compute/gpu_tlb.hh | 64 +- src/gpu-compute/lds_state.cc | 4 +- src/gpu-compute/local_memory_pipeline.cc | 15 +- src/gpu-compute/local_memory_pipeline.hh | 16 +- src/gpu-compute/register_file.cc | 35 +- src/gpu-compute/register_file.hh | 30 +- src/gpu-compute/register_manager.cc | 6 - src/gpu-compute/register_manager.hh | 3 - src/gpu-compute/register_manager_policy.hh | 3 - src/gpu-compute/scalar_memory_pipeline.cc | 5 - src/gpu-compute/scalar_memory_pipeline.hh | 1 - src/gpu-compute/scalar_register_file.cc | 10 +- src/gpu-compute/schedule_stage.cc | 194 ++-- src/gpu-compute/schedule_stage.hh | 92 +- src/gpu-compute/scoreboard_check_stage.cc | 17 +- src/gpu-compute/scoreboard_check_stage.hh | 15 +- src/gpu-compute/shader.cc | 175 ++-- src/gpu-compute/shader.hh | 76 +- .../static_register_manager_policy.cc | 5 - .../static_register_manager_policy.hh | 2 - src/gpu-compute/tlb_coalescer.cc | 54 +- src/gpu-compute/tlb_coalescer.hh | 41 +- src/gpu-compute/vector_register_file.cc | 16 +- src/gpu-compute/wavefront.cc | 204 ++--- src/gpu-compute/wavefront.hh | 88 +- 39 files changed, 1159 insertions(+), 1589 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index e8951a9a5..565e85404 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -3800,7 +3800,7 @@ namespace Gcn3ISA wf->computeUnit->cu_id, wf->wgId, refCount); wf->computeUnit->registerManager->freeRegisters(wf); - wf->computeUnit->completedWfs++; + wf->computeUnit->stats.completedWfs++; wf->computeUnit->activeWaves--; panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less " @@ -3811,7 +3811,7 @@ namespace Gcn3ISA for (int i = 0; i < wf->vecReads.size(); i++) { if (wf->rawDist.find(i) != wf->rawDist.end()) { - wf->readsPerWrite.sample(wf->vecReads.at(i)); + wf->stats.readsPerWrite.sample(wf->vecReads.at(i)); } } wf->vecReads.clear(); @@ -3853,7 +3853,7 @@ namespace Gcn3ISA if (!kernelEnd || !relNeeded) { wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); wf->setStatus(Wavefront::S_STOPPED); - wf->computeUnit->completedWGs++; + wf->computeUnit->stats.completedWGs++; return; } @@ -3877,7 +3877,7 @@ namespace Gcn3ISA // call shader to prepare the flush operations wf->computeUnit->shader->prepareFlush(gpuDynInst); - wf->computeUnit->completedWGs++; + wf->computeUnit->stats.completedWGs++; } else { wf->computeUnit->shader->dispatcher().scheduleDispatch(); } diff --git a/src/gpu-compute/compute_unit.cc 
b/src/gpu-compute/compute_unit.cc index d460861e2..636fd554e 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -106,7 +106,8 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p), _numBarrierSlots(p.num_barrier_slots), globalSeqNum(0), wavefrontSize(p.wf_size), scoreboardCheckToSchedule(p), - scheduleToExecute(p) + scheduleToExecute(p), + stats(this, p.n_wf) { /** * This check is necessary because std::bitset only provides conversion @@ -367,7 +368,7 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, w->initRegState(task, w->actualWgSzTotal); w->start(_n_wave++, task->codeAddr()); - waveLevelParallelism.sample(activeWaves); + stats.waveLevelParallelism.sample(activeWaves); activeWaves++; } @@ -612,22 +613,22 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg) freeWfSlots, numMappedWfs, vregAvail, sregAvail); if (!vregAvail) { - ++numTimesWgBlockedDueVgprAlloc; + ++stats.numTimesWgBlockedDueVgprAlloc; } if (!sregAvail) { - ++numTimesWgBlockedDueSgprAlloc; + ++stats.numTimesWgBlockedDueSgprAlloc; } // Return true if enough WF slots to submit workgroup and if there are // enough VGPRs to schedule all WFs to their SIMD units bool ldsAvail = lds.canReserve(task->ldsSize()); if (!ldsAvail) { - wgBlockedDueLdsAllocation++; + stats.wgBlockedDueLdsAllocation++; } if (!barrier_avail) { - wgBlockedDueBarrierAllocation++; + stats.wgBlockedDueBarrierAllocation++; } // Return true if the following are all true: @@ -734,7 +735,7 @@ ComputeUnit::exec() scoreboardCheckStage.exec(); fetchStage.exec(); - totalCycles++; + stats.totalCycles++; // Put this CU to sleep if there is no more work to be done. if (!isDone()) { @@ -1032,8 +1033,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) fatal("pkt is not a read nor a write\n"); } - tlbCycles -= curTick(); - ++tlbRequests; + stats.tlbCycles -= curTick(); + ++stats.tlbRequests; PortID tlbPort_index = perLaneTLB ? index : 0; @@ -1075,7 +1076,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) // update the hitLevel distribution int hit_level = translation_state->hitLevel; assert(hit_level != -1); - hitsPerTLBLevel[hit_level]++; + stats.hitsPerTLBLevel[hit_level]++; // New SenderState for the memory access X86ISA::GpuTLB::TranslationState *sender_state = @@ -1346,7 +1347,7 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) // for the first cache block. 
if (compute_unit->headTailMap.count(gpuDynInst)) { Tick headTick = compute_unit->headTailMap.at(gpuDynInst); - compute_unit->headTailLatency.sample(curTick() - headTick); + compute_unit->stats.headTailLatency.sample(curTick() - headTick); compute_unit->headTailMap.erase(gpuDynInst); } @@ -1381,7 +1382,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) pkt->req->getVaddr(), line); assert(pkt->senderState); - computeUnit->tlbCycles += curTick(); + computeUnit->stats.tlbCycles += curTick(); // pop off the TLB translation state X86ISA::GpuTLB::TranslationState *translation_state = @@ -1402,7 +1403,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) // update the hitLevel distribution int hit_level = translation_state->hitLevel; - computeUnit->hitsPerTLBLevel[hit_level]++; + computeUnit->stats.hitsPerTLBLevel[hit_level]++; delete translation_state->tlbEntry; assert(!translation_state->ports.size()); @@ -1788,561 +1789,17 @@ ComputeUnit::ITLBPort::recvReqRetry() } } -void -ComputeUnit::regStats() -{ - ClockedObject::regStats(); - - vALUInsts - .name(name() + ".valu_insts") - .desc("Number of vector ALU insts issued.") - ; - vALUInstsPerWF - .name(name() + ".valu_insts_per_wf") - .desc("The avg. number of vector ALU insts issued per-wavefront.") - ; - sALUInsts - .name(name() + ".salu_insts") - .desc("Number of scalar ALU insts issued.") - ; - sALUInstsPerWF - .name(name() + ".salu_insts_per_wf") - .desc("The avg. number of scalar ALU insts issued per-wavefront.") - ; - instCyclesVALU - .name(name() + ".inst_cycles_valu") - .desc("Number of cycles needed to execute VALU insts.") - ; - instCyclesSALU - .name(name() + ".inst_cycles_salu") - .desc("Number of cycles needed to execute SALU insts.") - ; - threadCyclesVALU - .name(name() + ".thread_cycles_valu") - .desc("Number of thread cycles used to execute vector ALU ops. " - "Similar to instCyclesVALU but multiplied by the number of " - "active threads.") - ; - vALUUtilization - .name(name() + ".valu_utilization") - .desc("Percentage of active vector ALU threads in a wave.") - ; - ldsNoFlatInsts - .name(name() + ".lds_no_flat_insts") - .desc("Number of LDS insts issued, not including FLAT " - "accesses that resolve to LDS.") - ; - ldsNoFlatInstsPerWF - .name(name() + ".lds_no_flat_insts_per_wf") - .desc("The avg. number of LDS insts (not including FLAT " - "accesses that resolve to LDS) per-wavefront.") - ; - flatVMemInsts - .name(name() + ".flat_vmem_insts") - .desc("The number of FLAT insts that resolve to vmem issued.") - ; - flatVMemInstsPerWF - .name(name() + ".flat_vmem_insts_per_wf") - .desc("The average number of FLAT insts that resolve to vmem " - "issued per-wavefront.") - ; - flatLDSInsts - .name(name() + ".flat_lds_insts") - .desc("The number of FLAT insts that resolve to LDS issued.") - ; - flatLDSInstsPerWF - .name(name() + ".flat_lds_insts_per_wf") - .desc("The average number of FLAT insts that resolve to LDS " - "issued per-wavefront.") - ; - vectorMemWrites - .name(name() + ".vector_mem_writes") - .desc("Number of vector mem write insts (excluding FLAT insts).") - ; - vectorMemWritesPerWF - .name(name() + ".vector_mem_writes_per_wf") - .desc("The average number of vector mem write insts " - "(excluding FLAT insts) per-wavefront.") - ; - vectorMemReads - .name(name() + ".vector_mem_reads") - .desc("Number of vector mem read insts (excluding FLAT insts).") - ; - vectorMemReadsPerWF - .name(name() + ".vector_mem_reads_per_wf") - .desc("The avg. 
number of vector mem read insts (excluding " - "FLAT insts) per-wavefront.") - ; - scalarMemWrites - .name(name() + ".scalar_mem_writes") - .desc("Number of scalar mem write insts.") - ; - scalarMemWritesPerWF - .name(name() + ".scalar_mem_writes_per_wf") - .desc("The average number of scalar mem write insts per-wavefront.") - ; - scalarMemReads - .name(name() + ".scalar_mem_reads") - .desc("Number of scalar mem read insts.") - ; - scalarMemReadsPerWF - .name(name() + ".scalar_mem_reads_per_wf") - .desc("The average number of scalar mem read insts per-wavefront.") - ; - - vALUInstsPerWF = vALUInsts / completedWfs; - sALUInstsPerWF = sALUInsts / completedWfs; - vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100; - ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs; - flatVMemInstsPerWF = flatVMemInsts / completedWfs; - flatLDSInstsPerWF = flatLDSInsts / completedWfs; - vectorMemWritesPerWF = vectorMemWrites / completedWfs; - vectorMemReadsPerWF = vectorMemReads / completedWfs; - scalarMemWritesPerWF = scalarMemWrites / completedWfs; - scalarMemReadsPerWF = scalarMemReads / completedWfs; - - vectorMemReadsPerKiloInst - .name(name() + ".vector_mem_reads_per_kilo_inst") - .desc("Number of vector mem reads per kilo-instruction") - ; - vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000; - vectorMemWritesPerKiloInst - .name(name() + ".vector_mem_writes_per_kilo_inst") - .desc("Number of vector mem writes per kilo-instruction") - ; - vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000; - vectorMemInstsPerKiloInst - .name(name() + ".vector_mem_insts_per_kilo_inst") - .desc("Number of vector mem insts per kilo-instruction") - ; - vectorMemInstsPerKiloInst = - ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000; - scalarMemReadsPerKiloInst - .name(name() + ".scalar_mem_reads_per_kilo_inst") - .desc("Number of scalar mem reads per kilo-instruction") - ; - scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000; - scalarMemWritesPerKiloInst - .name(name() + ".scalar_mem_writes_per_kilo_inst") - .desc("Number of scalar mem writes per kilo-instruction") - ; - scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000; - scalarMemInstsPerKiloInst - .name(name() + ".scalar_mem_insts_per_kilo_inst") - .desc("Number of scalar mem insts per kilo-instruction") - ; - scalarMemInstsPerKiloInst = - ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000; - - instCyclesVMemPerSimd - .init(numVectorALUs) - .name(name() + ".inst_cycles_vector_memory") - .desc("Number of cycles to send address, command, data from VRF to " - "vector memory unit, per SIMD") - ; - - instCyclesScMemPerSimd - .init(numVectorALUs) - .name(name() + ".inst_cycles_scalar_memory") - .desc("Number of cycles to send address, command, data from SRF to " - "scalar memory unit, per SIMD") - ; - - instCyclesLdsPerSimd - .init(numVectorALUs) - .name(name() + ".inst_cycles_lds") - .desc("Number of cycles to send address, command, data from VRF to " - "LDS unit, per SIMD") - ; - - globalReads - .name(name() + ".global_mem_reads") - .desc("Number of reads to the global segment") - ; - globalWrites - .name(name() + ".global_mem_writes") - .desc("Number of writes to the global segment") - ; - globalMemInsts - .name(name() + ".global_mem_insts") - .desc("Number of memory instructions sent to the global segment") - ; - globalMemInsts = globalReads + globalWrites; - argReads - .name(name() + ".arg_reads") - .desc("Number of reads to the arg segment") - ; 
- argWrites - .name(name() + ".arg_writes") - .desc("NUmber of writes to the arg segment") - ; - argMemInsts - .name(name() + ".arg_mem_insts") - .desc("Number of memory instructions sent to the arg segment") - ; - argMemInsts = argReads + argWrites; - spillReads - .name(name() + ".spill_reads") - .desc("Number of reads to the spill segment") - ; - spillWrites - .name(name() + ".spill_writes") - .desc("Number of writes to the spill segment") - ; - spillMemInsts - .name(name() + ".spill_mem_insts") - .desc("Number of memory instructions sent to the spill segment") - ; - spillMemInsts = spillReads + spillWrites; - groupReads - .name(name() + ".group_reads") - .desc("Number of reads to the group segment") - ; - groupWrites - .name(name() + ".group_writes") - .desc("Number of writes to the group segment") - ; - groupMemInsts - .name(name() + ".group_mem_insts") - .desc("Number of memory instructions sent to the group segment") - ; - groupMemInsts = groupReads + groupWrites; - privReads - .name(name() + ".private_reads") - .desc("Number of reads to the private segment") - ; - privWrites - .name(name() + ".private_writes") - .desc("Number of writes to the private segment") - ; - privMemInsts - .name(name() + ".private_mem_insts") - .desc("Number of memory instructions sent to the private segment") - ; - privMemInsts = privReads + privWrites; - readonlyReads - .name(name() + ".readonly_reads") - .desc("Number of reads to the readonly segment") - ; - readonlyWrites - .name(name() + ".readonly_writes") - .desc("Number of memory instructions sent to the readonly segment") - ; - readonlyMemInsts - .name(name() + ".readonly_mem_insts") - .desc("Number of memory instructions sent to the readonly segment") - ; - readonlyMemInsts = readonlyReads + readonlyWrites; - kernargReads - .name(name() + ".kernarg_reads") - .desc("Number of reads sent to the kernarg segment") - ; - kernargWrites - .name(name() + ".kernarg_writes") - .desc("Number of memory instructions sent to the kernarg segment") - ; - kernargMemInsts - .name(name() + ".kernarg_mem_insts") - .desc("Number of memory instructions sent to the kernarg segment") - ; - kernargMemInsts = kernargReads + kernargWrites; - - tlbCycles - .name(name() + ".tlb_cycles") - .desc("total number of cycles for all uncoalesced requests") - ; - - tlbRequests - .name(name() + ".tlb_requests") - .desc("number of uncoalesced requests") - ; - - tlbLatency - .name(name() + ".avg_translation_latency") - .desc("Avg. translation latency for data translations") - ; - - tlbLatency = tlbCycles / tlbRequests; - - hitsPerTLBLevel - .init(4) - .name(name() + ".TLB_hits_distribution") - .desc("TLB hits distribution (0 for page table, x for Lx-TLB") - ; - - // fixed number of TLB levels - for (int i = 0; i < 4; ++i) { - if (!i) - hitsPerTLBLevel.subname(i,"page_table"); - else - hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i)); - } - - execRateDist - .init(0, 10, 2) - .name(name() + ".inst_exec_rate") - .desc("Instruction Execution Rate: Number of executed vector " - "instructions per cycle") - ; - - ldsBankConflictDist - .init(0, wfSize(), 2) - .name(name() + ".lds_bank_conflicts") - .desc("Number of bank conflicts per LDS memory packet") - ; - - ldsBankAccesses - .name(name() + ".lds_bank_access_cnt") - .desc("Total number of LDS bank accesses") - ; - - pageDivergenceDist - // A wavefront can touch up to N pages per memory instruction where - // N is equal to the wavefront size - // The number of pages per bin can be configured (here it's 4). 
- .init(1, wfSize(), 4) - .name(name() + ".page_divergence_dist") - .desc("pages touched per wf (over all mem. instr.)") - ; - - controlFlowDivergenceDist - .init(1, wfSize(), 4) - .name(name() + ".warp_execution_dist") - .desc("number of lanes active per instruction (oval all instructions)") - ; - - activeLanesPerGMemInstrDist - .init(1, wfSize(), 4) - .name(name() + ".gmem_lanes_execution_dist") - .desc("number of active lanes per global memory instruction") - ; - - activeLanesPerLMemInstrDist - .init(1, wfSize(), 4) - .name(name() + ".lmem_lanes_execution_dist") - .desc("number of active lanes per local memory instruction") - ; - - numInstrExecuted - .name(name() + ".num_instr_executed") - .desc("number of instructions executed") - ; - - numVecOpsExecuted - .name(name() + ".num_vec_ops_executed") - .desc("number of vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedF16 - .name(name() + ".num_vec_ops_f16_executed") - .desc("number of f16 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedF32 - .name(name() + ".num_vec_ops_f32_executed") - .desc("number of f32 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedF64 - .name(name() + ".num_vec_ops_f64_executed") - .desc("number of f64 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedFMA16 - .name(name() + ".num_vec_ops_fma16_executed") - .desc("number of fma16 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedFMA32 - .name(name() + ".num_vec_ops_fma32_executed") - .desc("number of fma32 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedFMA64 - .name(name() + ".num_vec_ops_fma64_executed") - .desc("number of fma64 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedMAD16 - .name(name() + ".num_vec_ops_mad16_executed") - .desc("number of mad16 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedMAD32 - .name(name() + ".num_vec_ops_mad32_executed") - .desc("number of mad32 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedMAD64 - .name(name() + ".num_vec_ops_mad64_executed") - .desc("number of mad64 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedMAC16 - .name(name() + ".num_vec_ops_mac16_executed") - .desc("number of mac16 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedMAC32 - .name(name() + ".num_vec_ops_mac32_executed") - .desc("number of mac32 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedMAC64 - .name(name() + ".num_vec_ops_mac64_executed") - .desc("number of mac64 vec ops executed (e.g. WF size/inst)") - ; - - numVecOpsExecutedTwoOpFP - .name(name() + ".num_vec_ops_two_op_fp_executed") - .desc("number of two op FP vec ops executed (e.g. 
WF size/inst)") - ; - - totalCycles - .name(name() + ".num_total_cycles") - .desc("number of cycles the CU ran for") - ; - - ipc - .name(name() + ".ipc") - .desc("Instructions per cycle (this CU only)") - ; - - vpc - .name(name() + ".vpc") - .desc("Vector Operations per cycle (this CU only)") - ; - - vpc_f16 - .name(name() + ".vpc_f16") - .desc("F16 Vector Operations per cycle (this CU only)") - ; - - vpc_f32 - .name(name() + ".vpc_f32") - .desc("F32 Vector Operations per cycle (this CU only)") - ; - - vpc_f64 - .name(name() + ".vpc_f64") - .desc("F64 Vector Operations per cycle (this CU only)") - ; - - numALUInstsExecuted - .name(name() + ".num_alu_insts_executed") - .desc("Number of dynamic non-GM memory insts executed") - ; - - wgBlockedDueBarrierAllocation - .name(name() + ".wg_blocked_due_barrier_alloc") - .desc("WG dispatch was blocked due to lack of barrier resources") - ; - - wgBlockedDueLdsAllocation - .name(name() + ".wg_blocked_due_lds_alloc") - .desc("Workgroup blocked due to LDS capacity") - ; - - ipc = numInstrExecuted / totalCycles; - vpc = numVecOpsExecuted / totalCycles; - vpc_f16 = numVecOpsExecutedF16 / totalCycles; - vpc_f32 = numVecOpsExecutedF32 / totalCycles; - vpc_f64 = numVecOpsExecutedF64 / totalCycles; - - numTimesWgBlockedDueVgprAlloc - .name(name() + ".times_wg_blocked_due_vgpr_alloc") - .desc("Number of times WGs are blocked due to VGPR allocation per " - "SIMD") - ; - - numTimesWgBlockedDueSgprAlloc - .name(name() + ".times_wg_blocked_due_sgpr_alloc") - .desc("Number of times WGs are blocked due to SGPR allocation per " - "SIMD") - ; - - dynamicGMemInstrCnt - .name(name() + ".global_mem_instr_cnt") - .desc("dynamic non-flat global memory instruction count") - ; - - dynamicFlatMemInstrCnt - .name(name() + ".flat_global_mem_instr_cnt") - .desc("dynamic flat global memory instruction count") - ; - - dynamicLMemInstrCnt - .name(name() + ".local_mem_instr_cnt") - .desc("dynamic local memory intruction count") - ; - - numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - - dynamicLMemInstrCnt; - - completedWfs - .name(name() + ".num_completed_wfs") - .desc("number of completed wavefronts") - ; - - completedWGs - .name(name() + ".num_completed_wgs") - .desc("number of completed workgroups") - ; - - numCASOps - .name(name() + ".num_CAS_ops") - .desc("number of compare and swap operations") - ; - - numFailedCASOps - .name(name() + ".num_failed_CAS_ops") - .desc("number of compare and swap operations that failed") - ; - - headTailLatency - .init(0, 1000000, 10000) - .name(name() + ".head_tail_latency") - .desc("ticks between first and last cache block arrival at coalescer") - .flags(Stats::pdf | Stats::oneline) - ; - - waveLevelParallelism - .init(0, shader->n_wf * numVectorALUs, 1) - .name(name() + ".wlp") - .desc("wave level parallelism: count of active waves at wave launch") - ; - - instInterleave - .init(numVectorALUs, 0, 20, 1) - .name(name() + ".interleaving") - .desc("Measure of instruction interleaving per SIMD") - ; - - // register stats of pipeline stages - fetchStage.regStats(); - scoreboardCheckStage.regStats(); - scheduleStage.regStats(); - execStage.regStats(); - - // register stats of memory pipelines - globalMemoryPipe.regStats(); - localMemoryPipe.regStats(); - scalarMemoryPipe.regStats(); - - registerManager->regStats(); -} - void ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) { if (gpuDynInst->isScalar()) { if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) { - sALUInsts++; - instCyclesSALU++; + stats.sALUInsts++; + 
stats.instCyclesSALU++; } else if (gpuDynInst->isLoad()) { - scalarMemReads++; + stats.scalarMemReads++; } else if (gpuDynInst->isStore()) { - scalarMemWrites++; + stats.scalarMemWrites++; } } else { if (gpuDynInst->isALU()) { @@ -2350,45 +1807,46 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) if (shader->total_valu_insts == shader->max_valu_insts) { exitSimLoop("max vALU insts"); } - vALUInsts++; - instCyclesVALU++; - threadCyclesVALU += gpuDynInst->wavefront()->execMask().count(); + stats.vALUInsts++; + stats.instCyclesVALU++; + stats.threadCyclesVALU + += gpuDynInst->wavefront()->execMask().count(); } else if (gpuDynInst->isFlat()) { if (gpuDynInst->isLocalMem()) { - flatLDSInsts++; + stats.flatLDSInsts++; } else { - flatVMemInsts++; + stats.flatVMemInsts++; } } else if (gpuDynInst->isLocalMem()) { - ldsNoFlatInsts++; + stats.ldsNoFlatInsts++; } else if (gpuDynInst->isLoad()) { - vectorMemReads++; + stats.vectorMemReads++; } else if (gpuDynInst->isStore()) { - vectorMemWrites++; + stats.vectorMemWrites++; } if (gpuDynInst->isLoad()) { switch (gpuDynInst->executedAs()) { case Enums::SC_SPILL: - spillReads++; + stats.spillReads++; break; case Enums::SC_GLOBAL: - globalReads++; + stats.globalReads++; break; case Enums::SC_GROUP: - groupReads++; + stats.groupReads++; break; case Enums::SC_PRIVATE: - privReads++; + stats.privReads++; break; case Enums::SC_READONLY: - readonlyReads++; + stats.readonlyReads++; break; case Enums::SC_KERNARG: - kernargReads++; + stats.kernargReads++; break; case Enums::SC_ARG: - argReads++; + stats.argReads++; break; case Enums::SC_NONE: /** @@ -2403,25 +1861,25 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } else if (gpuDynInst->isStore()) { switch (gpuDynInst->executedAs()) { case Enums::SC_SPILL: - spillWrites++; + stats.spillWrites++; break; case Enums::SC_GLOBAL: - globalWrites++; + stats.globalWrites++; break; case Enums::SC_GROUP: - groupWrites++; + stats.groupWrites++; break; case Enums::SC_PRIVATE: - privWrites++; + stats.privWrites++; break; case Enums::SC_READONLY: - readonlyWrites++; + stats.readonlyWrites++; break; case Enums::SC_KERNARG: - kernargWrites++; + stats.kernargWrites++; break; case Enums::SC_ARG: - argWrites++; + stats.argWrites++; break; case Enums::SC_NONE: /** @@ -2636,3 +2094,241 @@ ComputeUnit::LDSPort::recvReqRetry() } } } + +ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf) + : Stats::Group(parent), + ADD_STAT(vALUInsts, "Number of vector ALU insts issued."), + ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued " + "per-wavefront."), + ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."), + ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued " + "per-wavefront."), + ADD_STAT(instCyclesVALU, + "Number of cycles needed to execute VALU insts."), + ADD_STAT(instCyclesSALU, + "Number of cycles needed to execute SALU insts."), + ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute " + "vector ALU ops. Similar to instCyclesVALU but multiplied by " + "the number of active threads."), + ADD_STAT(vALUUtilization, + "Percentage of active vector ALU threads in a wave."), + ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT" + " accesses that resolve to LDS."), + ADD_STAT(ldsNoFlatInstsPerWF, "The avg. 
number of LDS insts (not " + "including FLAT accesses that resolve to LDS) per-wavefront."), + ADD_STAT(flatVMemInsts, + "The number of FLAT insts that resolve to vmem issued."), + ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that " + "resolve to vmem issued per-wavefront."), + ADD_STAT(flatLDSInsts, + "The number of FLAT insts that resolve to LDS issued."), + ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that " + "resolve to LDS issued per-wavefront."), + ADD_STAT(vectorMemWrites, + "Number of vector mem write insts (excluding FLAT insts)."), + ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write " + "insts (excluding FLAT insts) per-wavefront."), + ADD_STAT(vectorMemReads, + "Number of vector mem read insts (excluding FLAT insts)."), + ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts " + "(excluding FLAT insts) per-wavefront."), + ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."), + ADD_STAT(scalarMemWritesPerWF, + "The average number of scalar mem write insts per-wavefront."), + ADD_STAT(scalarMemReads, "Number of scalar mem read insts."), + ADD_STAT(scalarMemReadsPerWF, + "The average number of scalar mem read insts per-wavefront."), + ADD_STAT(vectorMemReadsPerKiloInst, + "Number of vector mem reads per kilo-instruction"), + ADD_STAT(vectorMemWritesPerKiloInst, + "Number of vector mem writes per kilo-instruction"), + ADD_STAT(vectorMemInstsPerKiloInst, + "Number of vector mem insts per kilo-instruction"), + ADD_STAT(scalarMemReadsPerKiloInst, + "Number of scalar mem reads per kilo-instruction"), + ADD_STAT(scalarMemWritesPerKiloInst, + "Number of scalar mem writes per kilo-instruction"), + ADD_STAT(scalarMemInstsPerKiloInst, + "Number of scalar mem insts per kilo-instruction"), + ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, " + "command, data from VRF to vector memory unit, per SIMD"), + ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, " + "command, data from SRF to scalar memory unit, per SIMD"), + ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, " + "command, data from VRF to LDS unit, per SIMD"), + ADD_STAT(globalReads, "Number of reads to the global segment"), + ADD_STAT(globalWrites, "Number of writes to the global segment"), + ADD_STAT(globalMemInsts, + "Number of memory instructions sent to the global segment"), + ADD_STAT(argReads, "Number of reads to the arg segment"), + ADD_STAT(argWrites, "Number of writes to the arg segment"), + ADD_STAT(argMemInsts, + "Number of memory instructions sent to the arg segment"), + ADD_STAT(spillReads, "Number of reads to the spill segment"), + ADD_STAT(spillWrites, "Number of writes to the spill segment"), + ADD_STAT(spillMemInsts, + "Number of memory instructions sent to the spill segment"), + ADD_STAT(groupReads, "Number of reads to the group segment"), + ADD_STAT(groupWrites, "Number of writes to the group segment"), + ADD_STAT(groupMemInsts, + "Number of memory instructions sent to the group segment"), + ADD_STAT(privReads, "Number of reads to the private segment"), + ADD_STAT(privWrites, "Number of writes to the private segment"), + ADD_STAT(privMemInsts, + "Number of memory instructions sent to the private segment"), + ADD_STAT(readonlyReads, "Number of reads to the readonly segment"), + ADD_STAT(readonlyWrites, + "Number of writes to the readonly segment"), + ADD_STAT(readonlyMemInsts, + "Number of memory instructions sent to the readonly segment"), + ADD_STAT(kernargReads, 
"Number of reads sent to the kernarg segment"), + ADD_STAT(kernargWrites, + "Number of memory instructions sent to the kernarg segment"), + ADD_STAT(kernargMemInsts, + "Number of memory instructions sent to the kernarg segment"), + ADD_STAT(waveLevelParallelism, + "wave level parallelism: count of active waves at wave launch"), + ADD_STAT(tlbRequests, "number of uncoalesced requests"), + ADD_STAT(tlbCycles, + "total number of cycles for all uncoalesced requests"), + ADD_STAT(tlbLatency, "Avg. translation latency for data translations"), + ADD_STAT(hitsPerTLBLevel, + "TLB hits distribution (0 for page table, x for Lx-TLB)"), + ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"), + ADD_STAT(ldsBankConflictDist, + "Number of bank conflicts per LDS memory packet"), + ADD_STAT(pageDivergenceDist, + "pages touched per wf (over all mem. instr.)"), + ADD_STAT(dynamicGMemInstrCnt, + "dynamic non-flat global memory instruction count"), + ADD_STAT(dynamicFlatMemInstrCnt, + "dynamic flat global memory instruction count"), + ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory intruction count"), + ADD_STAT(wgBlockedDueBarrierAllocation, + "WG dispatch was blocked due to lack of barrier resources"), + ADD_STAT(wgBlockedDueLdsAllocation, + "Workgroup blocked due to LDS capacity"), + ADD_STAT(numInstrExecuted, "number of instructions executed"), + ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed " + "vector instructions per cycle"), + ADD_STAT(numVecOpsExecuted, + "number of vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedF16, + "number of f16 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedF32, + "number of f32 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedF64, + "number of f64 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedFMA16, + "number of fma16 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedFMA32, + "number of fma32 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedFMA64, + "number of fma64 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMAC16, + "number of mac16 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMAC32, + "number of mac32 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMAC64, + "number of mac64 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMAD16, + "number of mad16 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMAD32, + "number of mad32 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMAD64, + "number of mad64 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedTwoOpFP, + "number of two op FP vec ops executed (e.g. 
WF size/inst)"), + ADD_STAT(totalCycles, "number of cycles the CU ran for"), + ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"), + ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"), + ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"), + ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"), + ADD_STAT(ipc, "Instructions per cycle (this CU only)"), + ADD_STAT(controlFlowDivergenceDist, "number of lanes active per " + "instruction (over all instructions)"), + ADD_STAT(activeLanesPerGMemInstrDist, + "number of active lanes per global memory instruction"), + ADD_STAT(activeLanesPerLMemInstrDist, + "number of active lanes per local memory instruction"), + ADD_STAT(numALUInstsExecuted, + "Number of dynamic non-GM memory insts executed"), + ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are " + "blocked due to VGPR allocation per SIMD"), + ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are " + "blocked due to SGPR allocation per SIMD"), + ADD_STAT(numCASOps, "number of compare and swap operations"), + ADD_STAT(numFailedCASOps, + "number of compare and swap operations that failed"), + ADD_STAT(completedWfs, "number of completed wavefronts"), + ADD_STAT(completedWGs, "number of completed workgroups"), + ADD_STAT(headTailLatency, "ticks between first and last cache block " + "arrival at coalescer"), + ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD") +{ + ComputeUnit *cu = static_cast(parent); + + instCyclesVMemPerSimd.init(cu->numVectorALUs); + instCyclesScMemPerSimd.init(cu->numVectorALUs); + instCyclesLdsPerSimd.init(cu->numVectorALUs); + + hitsPerTLBLevel.init(4); + execRateDist.init(0, 10, 2); + ldsBankConflictDist.init(0, cu->wfSize(), 2); + + pageDivergenceDist.init(1, cu->wfSize(), 4); + controlFlowDivergenceDist.init(1, cu->wfSize(), 4); + activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4); + activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4); + + headTailLatency.init(0, 1000000, 10000).flags(Stats::pdf | Stats::oneline); + waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1); + instInterleave.init(cu->numVectorALUs, 0, 20, 1); + + vALUInstsPerWF = vALUInsts / completedWfs; + sALUInstsPerWF = sALUInsts / completedWfs; + vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100; + ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs; + flatVMemInstsPerWF = flatVMemInsts / completedWfs; + flatLDSInstsPerWF = flatLDSInsts / completedWfs; + vectorMemWritesPerWF = vectorMemWrites / completedWfs; + vectorMemReadsPerWF = vectorMemReads / completedWfs; + scalarMemWritesPerWF = scalarMemWrites / completedWfs; + scalarMemReadsPerWF = scalarMemReads / completedWfs; + + vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000; + vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000; + vectorMemInstsPerKiloInst = + ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000; + scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000; + scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000; + scalarMemInstsPerKiloInst = + ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000; + + globalMemInsts = globalReads + globalWrites; + argMemInsts = argReads + argWrites; + spillMemInsts = spillReads + spillWrites; + groupMemInsts = groupReads + groupWrites; + privMemInsts = privReads + privWrites; + readonlyMemInsts = readonlyReads + readonlyWrites; + kernargMemInsts = kernargReads + kernargWrites; + + 
tlbLatency = tlbCycles / tlbRequests; + + // fixed number of TLB levels + for (int i = 0; i < 4; ++i) { + if (!i) + hitsPerTLBLevel.subname(i,"page_table"); + else + hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i)); + } + + ipc = numInstrExecuted / totalCycles; + vpc = numVecOpsExecuted / totalCycles; + vpc_f16 = numVecOpsExecutedF16 / totalCycles; + vpc_f32 = numVecOpsExecutedF32 / totalCycles; + vpc_f64 = numVecOpsExecutedF64 / totalCycles; + + numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - + dynamicLMemInstrCnt; +} diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index ecb6d06d8..186a45664 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -42,6 +42,7 @@ #include "base/callback.hh" #include "base/compiler.hh" #include "base/statistics.hh" +#include "base/stats/group.hh" #include "base/types.hh" #include "config/the_gpu_isa.hh" #include "enums/PrefetchType.hh" @@ -320,12 +321,6 @@ class ComputeUnit : public ClockedObject // tracks the last cycle a vector instruction was executed on a SIMD std::vector<uint64_t> lastExecCycle; - // Track the amount of interleaving between wavefronts on each SIMD. - // This stat is sampled using instExecPerSimd to compute the number of - // instructions that have been executed on a SIMD between a WF executing - // two successive instructions. - Stats::VectorDistribution instInterleave; - // tracks the number of dyn inst executed per SIMD std::vector<uint64_t> instExecPerSimd; @@ -472,148 +467,6 @@ class ComputeUnit : public ClockedObject LdsState &lds; public: 
- Stats::Vector instCyclesVMemPerSimd; - Stats::Vector instCyclesScMemPerSimd; - Stats::Vector instCyclesLdsPerSimd; - - Stats::Scalar globalReads; - Stats::Scalar globalWrites; - Stats::Formula globalMemInsts; - Stats::Scalar argReads; - Stats::Scalar argWrites; - Stats::Formula argMemInsts; - Stats::Scalar spillReads; - Stats::Scalar spillWrites; - Stats::Formula spillMemInsts; - Stats::Scalar groupReads; - Stats::Scalar groupWrites; - Stats::Formula groupMemInsts; - Stats::Scalar privReads; - Stats::Scalar privWrites; - Stats::Formula privMemInsts; - Stats::Scalar readonlyReads; - Stats::Scalar readonlyWrites; - Stats::Formula readonlyMemInsts; - Stats::Scalar kernargReads; - Stats::Scalar kernargWrites; - Stats::Formula kernargMemInsts; - - int activeWaves; - Stats::Distribution waveLevelParallelism; - - void updateInstStats(GPUDynInstPtr gpuDynInst); - - // the following stats compute the avg. TLB accesslatency per - // uncoalesced request (only for data) - Stats::Scalar tlbRequests; - Stats::Scalar tlbCycles; - Stats::Formula tlbLatency; - // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. - Stats::Vector hitsPerTLBLevel; - - Stats::Scalar ldsBankAccesses; - Stats::Distribution ldsBankConflictDist; - - // over all memory instructions executed over all wavefronts - // how many touched 0-4 pages, 4-8, ..., 60-64 pages - Stats::Distribution pageDivergenceDist; - // count of non-flat global memory vector instructions executed - Stats::Scalar dynamicGMemInstrCnt; - // count of flat global memory vector instructions executed - Stats::Scalar dynamicFlatMemInstrCnt; - Stats::Scalar dynamicLMemInstrCnt; - - Stats::Scalar wgBlockedDueBarrierAllocation; - Stats::Scalar wgBlockedDueLdsAllocation; - // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are - // active when the instruction is committed, this number is still - // incremented by 1 - Stats::Scalar numInstrExecuted; - // Number of cycles among successive instruction executions across all - // wavefronts of the same CU - Stats::Distribution execRateDist; - // number of individual vector operations executed - Stats::Scalar numVecOpsExecuted; - // number of individual f16 vector operations executed - Stats::Scalar numVecOpsExecutedF16; - // number of individual f32 vector operations executed - Stats::Scalar numVecOpsExecutedF32; - // number of individual f64 vector operations executed - Stats::Scalar numVecOpsExecutedF64; - // number of individual FMA 16,32,64 vector operations executed - Stats::Scalar numVecOpsExecutedFMA16; - Stats::Scalar numVecOpsExecutedFMA32; - Stats::Scalar numVecOpsExecutedFMA64; - // number of individual MAC 16,32,64 vector operations executed - Stats::Scalar numVecOpsExecutedMAC16; - Stats::Scalar numVecOpsExecutedMAC32; - Stats::Scalar numVecOpsExecutedMAC64; - // number of individual MAD 16,32,64 vector operations executed - Stats::Scalar numVecOpsExecutedMAD16; - Stats::Scalar numVecOpsExecutedMAD32; - Stats::Scalar numVecOpsExecutedMAD64; - // total number of two op FP vector operations executed - Stats::Scalar numVecOpsExecutedTwoOpFP; - // Total cycles that something is running on the GPU - Stats::Scalar totalCycles; - Stats::Formula vpc; // vector ops per cycle - Stats::Formula vpc_f16; // vector ops per cycle - Stats::Formula vpc_f32; // vector ops per cycle - Stats::Formula vpc_f64; // vector ops per cycle - Stats::Formula ipc; // vector instructions per cycle - Stats::Distribution controlFlowDivergenceDist; - Stats::Distribution activeLanesPerGMemInstrDist; - Stats::Distribution activeLanesPerLMemInstrDist; - // number of vector ALU instructions received - Stats::Formula numALUInstsExecuted; - // number of times a WG can not start due to lack of free VGPRs in SIMDs - Stats::Scalar numTimesWgBlockedDueVgprAlloc; - // number of times a WG can not start due to lack of free SGPRs in SIMDs - Stats::Scalar numTimesWgBlockedDueSgprAlloc; - Stats::Scalar numCASOps; - Stats::Scalar numFailedCASOps; - Stats::Scalar completedWfs; - Stats::Scalar completedWGs; - - // distrubtion in latency difference between first and last cache block - // arrival ticks - Stats::Distribution headTailLatency; - - void - regStats() override; - LdsState & getLds() const { @@ -1081,6 +934,158 @@ class ComputeUnit : public ClockedObject // a particular GPUDynInst. This is used to calculate the difference // between the first and last cache block arrival times. 
std::unordered_map<GPUDynInstPtr, Tick> headTailMap; + + public: + void updateInstStats(GPUDynInstPtr gpuDynInst); + int activeWaves; + + struct ComputeUnitStats : public Stats::Group + { + ComputeUnitStats(Stats::Group *parent, int n_wf); + + Stats::Scalar vALUInsts; + Stats::Formula vALUInstsPerWF; + Stats::Scalar sALUInsts; + Stats::Formula sALUInstsPerWF; + Stats::Scalar instCyclesVALU; + Stats::Scalar instCyclesSALU; + Stats::Scalar threadCyclesVALU; + Stats::Formula vALUUtilization; + Stats::Scalar ldsNoFlatInsts; + Stats::Formula ldsNoFlatInstsPerWF; + Stats::Scalar flatVMemInsts; + Stats::Formula flatVMemInstsPerWF; + Stats::Scalar flatLDSInsts; + Stats::Formula flatLDSInstsPerWF; + Stats::Scalar vectorMemWrites; + Stats::Formula vectorMemWritesPerWF; + Stats::Scalar vectorMemReads; + Stats::Formula vectorMemReadsPerWF; + Stats::Scalar scalarMemWrites; + Stats::Formula scalarMemWritesPerWF; + Stats::Scalar scalarMemReads; + Stats::Formula scalarMemReadsPerWF; + + Stats::Formula vectorMemReadsPerKiloInst; + Stats::Formula vectorMemWritesPerKiloInst; + Stats::Formula vectorMemInstsPerKiloInst; + Stats::Formula scalarMemReadsPerKiloInst; + Stats::Formula scalarMemWritesPerKiloInst; + Stats::Formula scalarMemInstsPerKiloInst; + + // Cycles required to send register source (addr and data) from + // register files to memory pipeline, per SIMD. + Stats::Vector instCyclesVMemPerSimd; + Stats::Vector instCyclesScMemPerSimd; + Stats::Vector instCyclesLdsPerSimd; + + Stats::Scalar globalReads; + Stats::Scalar globalWrites; + Stats::Formula globalMemInsts; + Stats::Scalar argReads; + Stats::Scalar argWrites; + Stats::Formula argMemInsts; + Stats::Scalar spillReads; + Stats::Scalar spillWrites; + Stats::Formula spillMemInsts; + Stats::Scalar groupReads; + Stats::Scalar groupWrites; + Stats::Formula groupMemInsts; + Stats::Scalar privReads; + Stats::Scalar privWrites; + Stats::Formula privMemInsts; + Stats::Scalar readonlyReads; + Stats::Scalar readonlyWrites; + Stats::Formula readonlyMemInsts; + Stats::Scalar kernargReads; + Stats::Scalar kernargWrites; + Stats::Formula kernargMemInsts; + + Stats::Distribution waveLevelParallelism; + + // the following stats compute the avg. TLB access latency per + // uncoalesced request (only for data) + Stats::Scalar tlbRequests; + Stats::Scalar tlbCycles; + Stats::Formula tlbLatency; + // hitsPerTLBLevel[x] are the hits in Level x TLB. + // x = 0 is the page table. + Stats::Vector hitsPerTLBLevel; + + Stats::Scalar ldsBankAccesses; + Stats::Distribution ldsBankConflictDist; + + // over all memory instructions executed over all wavefronts + // how many touched 0-4 pages, 4-8, ..., 60-64 pages + Stats::Distribution pageDivergenceDist; + // count of non-flat global memory vector instructions executed + Stats::Scalar dynamicGMemInstrCnt; + // count of flat global memory vector instructions executed + Stats::Scalar dynamicFlatMemInstrCnt; + Stats::Scalar dynamicLMemInstrCnt; + + Stats::Scalar wgBlockedDueBarrierAllocation; + Stats::Scalar wgBlockedDueLdsAllocation; + // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are + // active when the instruction is committed, this number is still + // incremented by 1 + Stats::Scalar numInstrExecuted; + // Number of cycles among successive instruction executions across all + // wavefronts of the same CU + Stats::Distribution execRateDist; + // number of individual vector operations executed + Stats::Scalar numVecOpsExecuted; + // number of individual f16 vector operations executed + Stats::Scalar numVecOpsExecutedF16; + // number of individual f32 vector operations executed + Stats::Scalar numVecOpsExecutedF32; + // number of individual f64 vector operations executed + Stats::Scalar numVecOpsExecutedF64; + // number of individual FMA 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedFMA16; + Stats::Scalar numVecOpsExecutedFMA32; + Stats::Scalar numVecOpsExecutedFMA64; + // number of individual MAC 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedMAC16; + Stats::Scalar numVecOpsExecutedMAC32; + Stats::Scalar numVecOpsExecutedMAC64; + // number of individual MAD 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedMAD16; + Stats::Scalar numVecOpsExecutedMAD32; + Stats::Scalar numVecOpsExecutedMAD64; + // total number of two op FP vector operations executed + Stats::Scalar numVecOpsExecutedTwoOpFP; + // Total cycles that something is running on the GPU + Stats::Scalar totalCycles; + Stats::Formula vpc; // vector ops per cycle + Stats::Formula vpc_f16; // vector ops per cycle + Stats::Formula vpc_f32; // vector ops per cycle + Stats::Formula vpc_f64; // vector ops per cycle + Stats::Formula ipc; // vector instructions per cycle + Stats::Distribution controlFlowDivergenceDist; + Stats::Distribution activeLanesPerGMemInstrDist; + Stats::Distribution activeLanesPerLMemInstrDist; + // number of vector ALU instructions received + Stats::Formula numALUInstsExecuted; + // number of times a WG cannot start due to lack of free VGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueVgprAlloc; + // number of times a WG cannot start due to lack of free SGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueSgprAlloc; + Stats::Scalar numCASOps; + Stats::Scalar numFailedCASOps; + Stats::Scalar completedWfs; + Stats::Scalar completedWGs; + + // distribution in latency difference between first and last cache block + // arrival ticks + Stats::Distribution headTailLatency; + + // Track the amount of interleaving between wavefronts on each SIMD. + // This stat is sampled using instExecPerSimd to compute the number + // of instructions that have been executed on a SIMD between a WF + // executing two successive instructions. 
+ Stats::VectorDistribution instInterleave; + } stats; }; #endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index a4fe92385..dae7b8c12 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -49,7 +49,7 @@ GPUDispatcher::GPUDispatcher(const Params &p) : SimObject(p), shader(nullptr), gpuCmdProc(nullptr), tickEvent([this]{ exec(); }, "GPU Dispatcher tick", false, Event::CPU_Tick_Pri), - dispatchActive(false) + dispatchActive(false), stats(this) { schedule(&tickEvent, 0); } @@ -58,21 +58,6 @@ GPUDispatcher::~GPUDispatcher() { } -void -GPUDispatcher::regStats() -{ - numKernelLaunched - .name(name() + ".num_kernel_launched") - .desc("number of kernel launched") - ; - - cyclesWaitingForDispatch - .name(name() + ".cycles_wait_dispatch") - .desc("number of cycles with outstanding wavefronts " - "that are waiting to be dispatched") - ; -} - HSAQueueEntry* GPUDispatcher::hsaTask(int disp_id) { @@ -127,7 +112,7 @@ GPUDispatcher::unserialize(CheckpointIn &cp) void GPUDispatcher::dispatch(HSAQueueEntry *task) { - ++numKernelLaunched; + ++stats.numKernelLaunched; DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n", task->kernelName(), task->dispatchId()); @@ -158,7 +143,7 @@ GPUDispatcher::exec() DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size()); if (execIds.size() > 0) { - ++cyclesWaitingForDispatch; + ++stats.cyclesWaitingForDispatch; } /** @@ -368,3 +353,11 @@ GPUDispatcher::scheduleDispatch() schedule(&tickEvent, curTick() + shader->clockPeriod()); } } + +GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(Stats::Group *parent) + : Stats::Group(parent), + ADD_STAT(numKernelLaunched, "number of kernels launched"), + ADD_STAT(cyclesWaitingForDispatch, "number of cycles with outstanding " + "wavefronts that are waiting to be dispatched") +{ +} diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh index b8cd3f1ef..3cd65f664 100644 --- a/src/gpu-compute/dispatcher.hh +++ b/src/gpu-compute/dispatcher.hh @@ -48,6 +48,7 @@ #include #include "base/statistics.hh" +#include "base/stats/group.hh" #include "dev/hsa/hsa_packet.hh" #include "params/GPUDispatcher.hh" #include "sim/sim_object.hh" @@ -67,7 +68,6 @@ class GPUDispatcher : public SimObject void serialize(CheckpointOut &cp) const override; void unserialize(CheckpointIn &cp) override; - void regStats() override; void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc); void setShader(Shader *new_shader); void exec(); @@ -91,9 +91,15 @@ class GPUDispatcher : public SimObject std::queue<int> doneIds; // is there a kernel in execution? 
bool dispatchActive; - /*statistics*/ - Stats::Scalar numKernelLaunched; - Stats::Scalar cyclesWaitingForDispatch; + + protected: + struct GPUDispatcherStats : public Stats::Group + { + GPUDispatcherStats(Stats::Group *parent); + + Stats::Scalar numKernelLaunched; + Stats::Scalar cyclesWaitingForDispatch; + } stats; }; #endif // __GPU_COMPUTE_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index 81806270c..5c57bb3b3 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -46,10 +46,11 @@ ExecStage::ExecStage(const ComputeUnitParams &p, ComputeUnit &cu, : computeUnit(cu), fromSchedule(from_schedule), lastTimeInstExecuted(false), thisTimeInstExecuted(false), instrExecuted (false), - executionResourcesUsed(0), _name(cu.name() + ".ExecStage") + executionResourcesUsed(0), _name(cu.name() + ".ExecStage"), + stats(&cu) { - numTransActiveIdle = 0; + stats.numTransActiveIdle = 0; idle_dur = 0; } @@ -64,22 +65,22 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { if (stage == IdleExec) { // count cycles when no instruction to a specific execution resource // is executed - numCyclesWithNoInstrTypeIssued[unitId]++; + stats.numCyclesWithNoInstrTypeIssued[unitId]++; } else if (stage == BusyExec) { // count the number of cycles an instruction to a specific execution // resource type was issued - numCyclesWithInstrTypeIssued[unitId]++; + stats.numCyclesWithInstrTypeIssued[unitId]++; thisTimeInstExecuted = true; instrExecuted = true; ++executionResourcesUsed; } else if (stage == PostExec) { // count the number of transitions from active to idle if (lastTimeInstExecuted && !thisTimeInstExecuted) { - ++numTransActiveIdle; + ++stats.numTransActiveIdle; } if (!lastTimeInstExecuted && thisTimeInstExecuted) { - idleDur.sample(idle_dur); + stats.idleDur.sample(idle_dur); idle_dur = 0; } else if (!thisTimeInstExecuted) { idle_dur++; @@ -89,11 +90,11 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { // track the number of cycles we either issued at least one // instruction or issued no instructions at all if (instrExecuted) { - numCyclesWithInstrIssued++; + stats.numCyclesWithInstrIssued++; } else { - numCyclesWithNoIssue++; + stats.numCyclesWithNoIssue++; } - spc.sample(executionResourcesUsed); + stats.spc.sample(executionResourcesUsed); } } @@ -196,57 +197,35 @@ ExecStage::exec() collectStatistics(PostExec, 0); } -void -ExecStage::regStats() +ExecStage::ExecStageStats::ExecStageStats(Stats::Group *parent) + : Stats::Group(parent, "ExecStage"), + ADD_STAT(numTransActiveIdle, + "number of CU transitions from active to idle"), + ADD_STAT(numCyclesWithNoIssue, "number of cycles the CU issues nothing"), + ADD_STAT(numCyclesWithInstrIssued, + "number of cycles the CU issued at least one instruction"), + ADD_STAT(spc, + "Execution units active per cycle (Exec unit=SIMD,MemPipe)"), + ADD_STAT(idleDur, "duration of idle periods in cycles"), + ADD_STAT(numCyclesWithInstrTypeIssued, "Number of cycles at least one " + "instruction issued to execution resource type"), + ADD_STAT(numCyclesWithNoInstrTypeIssued, "Number of cycles no instructions" + " issued to execution resource type") { - numTransActiveIdle - .name(name() + ".num_transitions_active_to_idle") - .desc("number of CU transitions from active to idle") - ; - - numCyclesWithNoIssue - .name(name() + ".num_cycles_with_no_issue") - .desc("number of cycles the CU issues nothing") - ; - - numCyclesWithInstrIssued - .name(name() + ".num_cycles_with_instr_issued") - 
.desc("number of cycles the CU issued at least one instruction") - ; - - spc - .init(0, computeUnit.numExeUnits(), 1) - .name(name() + ".spc") - .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") - ; + ComputeUnit *compute_unit = static_cast(parent); - idleDur - .init(0,75,5) - .name(name() + ".idle_duration_in_cycles") - .desc("duration of idle periods in cycles") - ; - - numCyclesWithInstrTypeIssued - .init(computeUnit.numExeUnits()) - .name(name() + ".num_cycles_issue_exec_rsrc") - .desc("Number of cycles at least one instruction issued to " - "execution resource type") - ; - - numCyclesWithNoInstrTypeIssued - .init(computeUnit.numExeUnits()) - .name(name() + ".num_cycles_no_issue_exec_rsrc") - .desc("Number of clks no instructions issued to execution " - "resource type") - ; + spc.init(0, compute_unit->numExeUnits(), 1); + idleDur.init(0, 75, 5); + numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits()); + numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits()); int c = 0; - for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) { + for (int i = 0; i < compute_unit->numVectorALUs; i++,c++) { std::string s = "VectorALU" + std::to_string(i); numCyclesWithNoInstrTypeIssued.subname(c, s); numCyclesWithInstrTypeIssued.subname(c, s); } - for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) { + for (int i = 0; i < compute_unit->numScalarALUs; i++,c++) { std::string s = "ScalarALU" + std::to_string(i); numCyclesWithNoInstrTypeIssued.subname(c, s); numCyclesWithInstrTypeIssued.subname(c, s); @@ -256,7 +235,4 @@ ExecStage::regStats() numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe"); numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe"); - - numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe"); - numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe"); } diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh index 4051b31d9..c560b2440 100644 --- a/src/gpu-compute/exec_stage.hh +++ b/src/gpu-compute/exec_stage.hh @@ -39,7 +39,8 @@ #include #include -#include "sim/stats.hh" +#include "base/statistics.hh" +#include "base/stats/group.hh" class ComputeUnit; class ScheduleToExecute; @@ -81,20 +82,6 @@ class ExecStage void dumpDispList(); const std::string& name() const { return _name; } - void regStats(); - // number of idle cycles - Stats::Scalar numCyclesWithNoIssue; - // number of busy cycles - Stats::Scalar numCyclesWithInstrIssued; - // number of cycles during which at least one - // instruction was issued to an execution resource type - Stats::Vector numCyclesWithInstrTypeIssued; - // number of idle cycles during which the scheduler - // issued no instructions targeting a specific - // execution resource type - Stats::Vector numCyclesWithNoInstrTypeIssued; - // SIMDs active per cycle - Stats::Distribution spc; private: void collectStatistics(enum STAT_STATUS stage, int unitId); @@ -105,11 +92,33 @@ class ExecStage bool lastTimeInstExecuted; bool thisTimeInstExecuted; bool instrExecuted; - Stats::Scalar numTransActiveIdle; - Stats::Distribution idleDur; int executionResourcesUsed; uint64_t idle_dur; const std::string _name; + + protected: + struct ExecStageStats : public Stats::Group + { + ExecStageStats(Stats::Group *parent); + + // number of transitions from active to idle + Stats::Scalar numTransActiveIdle; + // number of idle cycles + Stats::Scalar numCyclesWithNoIssue; + // number of busy cycles + Stats::Scalar numCyclesWithInstrIssued; + // SIMDs active per cycle + Stats::Distribution spc; + // duration of idle 
periods in cycles + Stats::Distribution idleDur; + // number of cycles during which at least one + // instruction was issued to an execution resource type + Stats::Vector numCyclesWithInstrTypeIssued; + // number of idle cycles during which the scheduler + // issued no instructions targeting a specific + // execution resource type + Stats::Vector numCyclesWithNoInstrTypeIssued; + } stats; }; #endif // __EXEC_STAGE_HH__ diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc index 8a37756db..21374bb53 100644 --- a/src/gpu-compute/fetch_stage.cc +++ b/src/gpu-compute/fetch_stage.cc @@ -38,7 +38,7 @@ FetchStage::FetchStage(const ComputeUnitParams &p, ComputeUnit &cu) : numVectorALUs(p.num_SIMDs), computeUnit(cu), - _name(cu.name() + ".FetchStage") + _name(cu.name() + ".FetchStage"), stats(&cu) { for (int j = 0; j < numVectorALUs; ++j) { FetchUnit newFetchUnit(p, cu); @@ -79,7 +79,7 @@ FetchStage::processFetchReturn(PacketPtr pkt) const unsigned num_instructions = pkt->req->getSize() / sizeof(TheGpuISA::RawMachInst); - instFetchInstReturned.sample(num_instructions); + stats.instFetchInstReturned.sample(num_instructions); uint32_t simdId = wavefront->simdId; _fetchUnit[simdId].processFetchReturn(pkt); } @@ -90,13 +90,10 @@ FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) _fetchUnit[wavefront->simdId].fetch(pkt, wavefront); } -void -FetchStage::regStats() +FetchStage::FetchStageStats::FetchStageStats(Stats::Group *parent) + : Stats::Group(parent, "FetchStage"), + ADD_STAT(instFetchInstReturned, "For each instruction fetch request " + "received record how many instructions you got from it") { - instFetchInstReturned - .init(1, 32, 1) - .name(name() + ".inst_fetch_instr_returned") - .desc("For each instruction fetch request recieved record how many " - "instructions you got from it") - ; + instFetchInstReturned.init(1, 32, 1); } diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh index 8e6996b17..3967d6de4 100644 --- a/src/gpu-compute/fetch_stage.hh +++ b/src/gpu-compute/fetch_stage.hh @@ -38,6 +38,7 @@ #include #include "base/statistics.hh" +#include "base/stats/group.hh" #include "gpu-compute/fetch_unit.hh" // Instruction fetch stage. 
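// A minimal sketch of the conversion pattern applied in every file of this
// change; FooStage and numTicks are hypothetical names, and the snippet
// assumes gem5's base/stats/group.hh API rather than standing alone.
#include "base/statistics.hh"
#include "base/stats/group.hh"

struct FooStageStats : public Stats::Group
{
    FooStageStats(Stats::Group *parent)
        : Stats::Group(parent, "FooStage"),
          ADD_STAT(numTicks, "number of ticks FooStage was active")
    {
    }

    Stats::Scalar numTicks;
};
// Instantiated as a `stats` member of the owning stage with the ComputeUnit
// as parent, call sites become stats.numTicks++ and the dumped name composes
// as <cu>.FooStage.numTicks, replacing the name()/desc() chains that
// regStats() used to run.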
@@ -61,8 +62,6 @@ class FetchStage // Stats related variables and methods const std::string& name() const { return _name; } - void regStats(); - Stats::Distribution instFetchInstReturned; FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); } private: @@ -73,6 +72,14 @@ class FetchStage // instantiated per VALU/SIMD std::vector _fetchUnit; const std::string _name; + + protected: + struct FetchStageStats : public Stats::Group + { + FetchStageStats(Stats::Group *parent); + + Stats::Distribution instFetchInstReturned; + } stats; }; #endif // __FETCH_STAGE_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index f6d60cf08..48f767b56 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -48,7 +48,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p, : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"), gmQueueSize(p.global_mem_queue_size), maxWaveRequests(p.max_wave_requests), inflightStores(0), - inflightLoads(0) + inflightLoads(0), stats(&cu) { } @@ -293,12 +293,10 @@ GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst) mem_req->second.second = true; } -void -GlobalMemPipeline::regStats() +GlobalMemPipeline:: +GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent) + : Stats::Group(parent, "GlobalMemPipeline"), + ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data " + "are delayed before updating the VRF") { - loadVrfBankConflictCycles - .name(name() + ".load_vrf_bank_conflict_cycles") - .desc("total number of cycles GM data are delayed before updating " - "the VRF") - ; } diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh index a1b652a7d..e8a1fb033 100644 --- a/src/gpu-compute/global_memory_pipeline.hh +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -37,6 +37,8 @@ #include #include +#include "base/statistics.hh" +#include "base/stats/group.hh" #include "gpu-compute/misc.hh" #include "params/ComputeUnit.hh" #include "sim/stats.hh" @@ -95,11 +97,10 @@ class GlobalMemPipeline } const std::string &name() const { return _name; } - void regStats(); void incLoadVRFBankConflictCycles(int num_cycles) { - loadVrfBankConflictCycles += num_cycles; + stats.loadVrfBankConflictCycles += num_cycles; } bool coalescerReady(GPUDynInstPtr mp) const; @@ -113,10 +114,6 @@ class GlobalMemPipeline int gmQueueSize; int maxWaveRequests; - // number of cycles of delaying the update of a VGPR that is the - // target of a load instruction (or the load component of an atomic) - // The delay is due to VRF bank conflicts - Stats::Scalar loadVrfBankConflictCycles; // Counters to track the inflight loads and stores // so that we can provide the proper backpressure // on the number of inflight memory operations. 
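// Why the pipeline constructors in this change take stats(&cu): the pipeline
// classes are plain members rather than SimObjects, so the enclosing
// ComputeUnit acts as the Stats::Group parent while the group name keeps the
// old dump hierarchy. A hedged sketch with a hypothetical Pipe class
// (assumes ComputeUnit is a Stats::Group by way of SimObject):
class Pipe
{
  public:
    Pipe(ComputeUnit &cu) : stats(&cu) {}

  protected:
    struct PipeStats : public Stats::Group
    {
        PipeStats(Stats::Group *parent)
            : Stats::Group(parent, "Pipe"),
              ADD_STAT(busyCycles, "cycles this pipe had work in flight")
        {
        }

        Stats::Scalar busyCycles;
    } stats;
};
// One visible side effect of the migration: ADD_STAT names a stat after its
// member identifier, so dumps show e.g. loadVrfBankConflictCycles where the
// old regStats() chain produced the load_vrf_bank_conflict_cycles leaf name.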
@@ -144,6 +141,17 @@ class GlobalMemPipeline // Global Memory Request FIFO: all global memory requests // are issued to this FIFO from the memory pipelines std::queue gmIssuedRequests; + + protected: + struct GlobalMemPipelineStats : public Stats::Group + { + GlobalMemPipelineStats(Stats::Group *parent); + + // number of cycles of delaying the update of a VGPR that is the + // target of a load instruction (or the load component of an atomic) + // The delay is due to VRF bank conflicts + Stats::Scalar loadVrfBankConflictCycles; + } stats; }; #endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 38e4ecf49..a17a93fcd 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -930,16 +930,16 @@ GPUDynInst::updateStats() { if (_staticInst->isLocalMem()) { // access to LDS (shared) memory - cu->dynamicLMemInstrCnt++; + cu->stats.dynamicLMemInstrCnt++; } else if (_staticInst->isFlat()) { - cu->dynamicFlatMemInstrCnt++; + cu->stats.dynamicFlatMemInstrCnt++; } else { // access to global memory // update PageDivergence histogram int number_pages_touched = cu->pagesTouched.size(); assert(number_pages_touched); - cu->pageDivergenceDist.sample(number_pages_touched); + cu->stats.pageDivergenceDist.sample(number_pages_touched); std::pair ret; @@ -962,7 +962,7 @@ GPUDynInst::updateStats() // total number of memory instructions (dynamic) // Atomics are counted as a single memory instruction. // this is # memory instructions per wavefronts, not per workitem - cu->dynamicGMemInstrCnt++; + cu->stats.dynamicGMemInstrCnt++; } } diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index cdb130e2f..8c7cf8787 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -63,12 +63,12 @@ class AtomicOpCAS : public TypedAtomicOpFunctor void execute(T *b) { - computeUnit->numCASOps++; + computeUnit->stats.numCASOps++; if (*b == c) { *b = s; } else { - computeUnit->numFailedCASOps++; + computeUnit->stats.numFailedCASOps++; } } AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); } diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc index 700a894ca..f61f3827e 100644 --- a/src/gpu-compute/gpu_tlb.cc +++ b/src/gpu-compute/gpu_tlb.cc @@ -67,7 +67,7 @@ namespace X86ISA : ClockedObject(p), configAddress(0), size(p.size), cleanupEvent([this]{ cleanup(); }, name(), false, Event::Maximum_Pri), - exitEvent([this]{ exitCallback(); }, name()) + exitEvent([this]{ exitCallback(); }, name()), stats(this) { assoc = p.assoc; assert(assoc <= size); @@ -402,12 +402,12 @@ namespace X86ISA return tlb_hit; } - localNumTLBAccesses++; + stats.localNumTLBAccesses++; if (!entry) { - localNumTLBMisses++; + stats.localNumTLBMisses++; } else { - localNumTLBHits++; + stats.localNumTLBHits++; } } } @@ -499,10 +499,10 @@ namespace X86ISA DPRINTF(GPUTLB, "Paging enabled.\n"); // The vaddr already has the segment base applied. 
TlbEntry *entry = lookup(vaddr); - localNumTLBAccesses++; + stats.localNumTLBAccesses++; if (!entry) { - localNumTLBMisses++; + stats.localNumTLBMisses++; if (timing) { latency = missLatency1; } @@ -544,7 +544,7 @@ namespace X86ISA DPRINTF(GPUTLB, "Miss was serviced.\n"); } } else { - localNumTLBHits++; + stats.localNumTLBHits++; if (timing) { latency = hitLatency; @@ -659,89 +659,6 @@ namespace X86ISA { } - void - GpuTLB::regStats() - { - ClockedObject::regStats(); - - localNumTLBAccesses - .name(name() + ".local_TLB_accesses") - .desc("Number of TLB accesses") - ; - - localNumTLBHits - .name(name() + ".local_TLB_hits") - .desc("Number of TLB hits") - ; - - localNumTLBMisses - .name(name() + ".local_TLB_misses") - .desc("Number of TLB misses") - ; - - localTLBMissRate - .name(name() + ".local_TLB_miss_rate") - .desc("TLB miss rate") - ; - - accessCycles - .name(name() + ".access_cycles") - .desc("Cycles spent accessing this TLB level") - ; - - pageTableCycles - .name(name() + ".page_table_cycles") - .desc("Cycles spent accessing the page table") - ; - - localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; - - numUniquePages - .name(name() + ".unique_pages") - .desc("Number of unique pages touched") - ; - - localCycles - .name(name() + ".local_cycles") - .desc("Number of cycles spent in queue for all incoming reqs") - ; - - localLatency - .name(name() + ".local_latency") - .desc("Avg. latency over incoming coalesced reqs") - ; - - localLatency = localCycles / localNumTLBAccesses; - - globalNumTLBAccesses - .name(name() + ".global_TLB_accesses") - .desc("Number of TLB accesses") - ; - - globalNumTLBHits - .name(name() + ".global_TLB_hits") - .desc("Number of TLB hits") - ; - - globalNumTLBMisses - .name(name() + ".global_TLB_misses") - .desc("Number of TLB misses") - ; - - globalTLBMissRate - .name(name() + ".global_TLB_miss_rate") - .desc("TLB miss rate") - ; - - globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; - - avgReuseDistance - .name(name() + ".avg_reuse_distance") - .desc("avg. reuse distance over all pages (in ticks)") - ; - - } - /** * Do the TLB lookup for this coalesced request and schedule * another event cycles later. 
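// The tick-accounting idiom the surrounding hunks preserve, isolated here as
// a sketch (startTranslation/finishTranslation are hypothetical wrappers;
// curTick() is gem5's global tick accessor): subtracting "now" at issue and
// adding "now" at completion leaves exactly (t_complete - t_issue) in the
// scalar, with req_cnt scaling the charge when one coalesced packet stands
// for several requests.
void
startTranslation(int req_cnt)
{
    stats.accessCycles -= req_cnt * curTick();
    stats.localCycles -= curTick();
}

void
finishTranslation(int req_cnt)
{
    stats.accessCycles += req_cnt * curTick();
    stats.localCycles += curTick();
}
// No per-request state is needed; the stat is only meaningful once both
// halves have run, which is why both sides are guarded by update_stats in
// the real code.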
@@ -768,10 +685,10 @@ namespace X86ISA int req_cnt = sender_state->reqCnt.back(); if (update_stats) { - accessCycles -= (curTick() * req_cnt); - localCycles -= curTick(); + stats.accessCycles -= (curTick() * req_cnt); + stats.localCycles -= curTick(); updatePageFootprint(virt_page_addr); - globalNumTLBAccesses += req_cnt; + stats.globalNumTLBAccesses += req_cnt; } tlbOutcome lookup_outcome = TLB_MISS; @@ -795,11 +712,11 @@ namespace X86ISA // the reqCnt has an entry per level, so its size tells us // which level we are in sender_state->hitLevel = sender_state->reqCnt.size(); - globalNumTLBHits += req_cnt; + stats.globalNumTLBHits += req_cnt; } } else { if (update_stats) - globalNumTLBMisses += req_cnt; + stats.globalNumTLBMisses += req_cnt; } /* @@ -981,16 +898,16 @@ namespace X86ISA handleTranslationReturn(virtPageAddr, TLB_HIT, pkt); if (update_stats) { - accessCycles += (req_cnt * curTick()); - localCycles += curTick(); + stats.accessCycles += (req_cnt * curTick()); + stats.localCycles += curTick(); } } else if (outcome == TLB_MISS) { DPRINTF(GPUTLB, "This is a TLB miss\n"); if (update_stats) { - accessCycles += (req_cnt*curTick()); - localCycles += curTick(); + stats.accessCycles += (req_cnt*curTick()); + stats.localCycles += curTick(); } if (hasMemSidePort) { @@ -998,8 +915,8 @@ namespace X86ISA // the reply back till when we propagate it to the coalescer // above. if (update_stats) { - accessCycles += (req_cnt * 1); - localCycles += 1; + stats.accessCycles += (req_cnt * 1); + stats.localCycles += 1; } /** @@ -1022,7 +939,7 @@ namespace X86ISA "addr %#x\n", virtPageAddr); if (update_stats) - pageTableCycles -= (req_cnt*curTick()); + stats.pageTableCycles -= (req_cnt*curTick()); TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; assert(tlb_event); @@ -1032,7 +949,7 @@ namespace X86ISA } } else if (outcome == PAGE_WALK) { if (update_stats) - pageTableCycles += (req_cnt*curTick()); + stats.pageTableCycles += (req_cnt*curTick()); // Need to access the page table and update the TLB DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", @@ -1222,17 +1139,17 @@ namespace X86ISA // functional mode means no coalescing // global metrics are the same as the local metrics if (update_stats) { - tlb->globalNumTLBAccesses++; + tlb->stats.globalNumTLBAccesses++; if (success) { sender_state->hitLevel = sender_state->reqCnt.size(); - tlb->globalNumTLBHits++; + tlb->stats.globalNumTLBHits++; } } if (!success) { if (update_stats) - tlb->globalNumTLBMisses++; + tlb->stats.globalNumTLBMisses++; if (tlb->hasMemSidePort) { // there is a TLB below -> propagate down the TLB hierarchy tlb->memSidePort[0]->sendFunctional(pkt); @@ -1405,7 +1322,7 @@ namespace X86ISA bool first_page_access = ret.second; if (first_page_access) { - numUniquePages++; + stats.numUniquePages++; } else { int accessed_before; accessed_before = curTick() - ret.first->second.lastTimeAccessed; @@ -1417,7 +1334,7 @@ namespace X86ISA if (accessDistance) { ret.first->second.localTLBAccesses - .push_back(localNumTLBAccesses.value()); + .push_back(stats.localNumTLBAccesses.value()); } } @@ -1506,11 +1423,36 @@ namespace X86ISA } if (!TLBFootprint.empty()) { - avgReuseDistance = + stats.avgReuseDistance = sum_avg_reuse_distance_per_page / TLBFootprint.size(); } //clear the TLBFootprint map TLBFootprint.clear(); } + + GpuTLB::GpuTLBStats::GpuTLBStats(Stats::Group *parent) + : Stats::Group(parent), + ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"), + ADD_STAT(localNumTLBHits, "Number of TLB hits"), + ADD_STAT(localNumTLBMisses, 
"Number of TLB misses"), + ADD_STAT(localTLBMissRate, "TLB miss rate"), + ADD_STAT(globalNumTLBAccesses, "Number of TLB accesses"), + ADD_STAT(globalNumTLBHits, "Number of TLB hits"), + ADD_STAT(globalNumTLBMisses, "Number of TLB misses"), + ADD_STAT(globalTLBMissRate, "TLB miss rate"), + ADD_STAT(accessCycles, "Cycles spent accessing this TLB level"), + ADD_STAT(pageTableCycles, "Cycles spent accessing the page table"), + ADD_STAT(numUniquePages, "Number of unique pages touched"), + ADD_STAT(localCycles, "Number of cycles spent in queue for all " + "incoming reqs"), + ADD_STAT(localLatency, "Avg. latency over incoming coalesced reqs"), + ADD_STAT(avgReuseDistance, "avg. reuse distance over all pages (in " + "ticks)") + { + localLatency = localCycles / localNumTLBAccesses; + + localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; + globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; + } } // namespace X86ISA diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh index edf5914a7..1df907bdb 100644 --- a/src/gpu-compute/gpu_tlb.hh +++ b/src/gpu-compute/gpu_tlb.hh @@ -47,6 +47,7 @@ #include "base/callback.hh" #include "base/logging.hh" #include "base/statistics.hh" +#include "base/stats/group.hh" #include "gpu-compute/compute_unit.hh" #include "mem/port.hh" #include "mem/request.hh" @@ -169,35 +170,6 @@ namespace X86ISA int missLatency1; int missLatency2; - // local_stats are as seen from the TLB - // without taking into account coalescing - Stats::Scalar localNumTLBAccesses; - Stats::Scalar localNumTLBHits; - Stats::Scalar localNumTLBMisses; - Stats::Formula localTLBMissRate; - - // global_stats are as seen from the - // CU's perspective taking into account - // all coalesced requests. - Stats::Scalar globalNumTLBAccesses; - Stats::Scalar globalNumTLBHits; - Stats::Scalar globalNumTLBMisses; - Stats::Formula globalTLBMissRate; - - // from the CU perspective (global) - Stats::Scalar accessCycles; - // from the CU perspective (global) - Stats::Scalar pageTableCycles; - Stats::Scalar numUniquePages; - // from the perspective of this TLB - Stats::Scalar localCycles; - // from the perspective of this TLB - Stats::Formula localLatency; - // I take the avg. per page and then - // the avg. over all pages. - Stats::Scalar avgReuseDistance; - - void regStats() override; void updatePageFootprint(Addr virt_page_addr); void printAccessPattern(); @@ -426,6 +398,40 @@ namespace X86ISA void exitCallback(); EventFunctionWrapper exitEvent; + + protected: + struct GpuTLBStats : public Stats::Group + { + GpuTLBStats(Stats::Group *parent); + + // local_stats are as seen from the TLB + // without taking into account coalescing + Stats::Scalar localNumTLBAccesses; + Stats::Scalar localNumTLBHits; + Stats::Scalar localNumTLBMisses; + Stats::Formula localTLBMissRate; + + // global_stats are as seen from the + // CU's perspective taking into account + // all coalesced requests. + Stats::Scalar globalNumTLBAccesses; + Stats::Scalar globalNumTLBHits; + Stats::Scalar globalNumTLBMisses; + Stats::Formula globalTLBMissRate; + + // from the CU perspective (global) + Stats::Scalar accessCycles; + // from the CU perspective (global) + Stats::Scalar pageTableCycles; + Stats::Scalar numUniquePages; + // from the perspective of this TLB + Stats::Scalar localCycles; + // from the perspective of this TLB + Stats::Formula localLatency; + // I take the avg. per page and then + // the avg. over all pages. 
+ Stats::Scalar avgReuseDistance; + } stats; }; } diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc index 1f653cb74..c3bafb274 100644 --- a/src/gpu-compute/lds_state.cc +++ b/src/gpu-compute/lds_state.cc @@ -189,10 +189,10 @@ LdsState::processPacket(PacketPtr packet) // the number of conflicts this packet will have when accessing the LDS unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); // count the total number of physical LDS bank accessed - parent->ldsBankAccesses += bankAccesses; + parent->stats.ldsBankAccesses += bankAccesses; // count the LDS bank conflicts. A number set to 1 indicates one // access per bank maximum so there are no bank conflicts - parent->ldsBankConflictDist.sample(bankConflicts-1); + parent->stats.ldsBankConflictDist.sample(bankConflicts-1); GPUDynInstPtr dynInst = getDynInstr(packet); // account for the LDS bank conflict overhead diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 3b39820f3..d441a29a7 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -43,7 +43,7 @@ LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu) : computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"), - lmQueueSize(p.local_mem_queue_size) + lmQueueSize(p.local_mem_queue_size), stats(&cu) { } @@ -124,12 +124,11 @@ LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) lmIssuedRequests.push(gpuDynInst); } -void -LocalMemPipeline::regStats() + +LocalMemPipeline:: +LocalMemPipelineStats::LocalMemPipelineStats(Stats::Group *parent) + : Stats::Group(parent, "LocalMemPipeline"), + ADD_STAT(loadVrfBankConflictCycles, "total number of cycles LDS data " + "are delayed before updating the VRF") { - loadVrfBankConflictCycles - .name(name() + ".load_vrf_bank_conflict_cycles") - .desc("total number of cycles LDS data are delayed before updating " - "the VRF") - ; } diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh index 98cc75b7a..83895656e 100644 --- a/src/gpu-compute/local_memory_pipeline.hh +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -37,9 +37,10 @@ #include #include +#include "base/statistics.hh" +#include "base/stats/group.hh" #include "gpu-compute/misc.hh" #include "params/ComputeUnit.hh" -#include "sim/stats.hh" /* * @file local_memory_pipeline.hh @@ -75,19 +76,18 @@ class LocalMemPipeline } const std::string& name() const { return _name; } - void regStats(); void incLoadVRFBankConflictCycles(int num_cycles) { - loadVrfBankConflictCycles += num_cycles; + stats.loadVrfBankConflictCycles += num_cycles; } private: ComputeUnit &computeUnit; const std::string _name; int lmQueueSize; - Stats::Scalar loadVrfBankConflictCycles; + // Local Memory Request Fifo: all shared memory requests // are issued to this FIFO from the memory pipelines std::queue lmIssuedRequests; @@ -95,6 +95,14 @@ class LocalMemPipeline // Local Memory Response Fifo: all responses of shared memory // requests are sent to this FIFO from LDS std::queue lmReturnedRequests; + + protected: + struct LocalMemPipelineStats : public Stats::Group + { + LocalMemPipelineStats(Stats::Group *parent); + + Stats::Scalar loadVrfBankConflictCycles; + } stats; }; #endif // __LOCAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/register_file.cc b/src/gpu-compute/register_file.cc index 42a74e8eb..9c97c6224 100644 --- a/src/gpu-compute/register_file.cc +++ b/src/gpu-compute/register_file.cc @@ -49,7 +49,7 @@ #include 
"params/RegisterFile.hh" RegisterFile::RegisterFile(const RegisterFileParams &p) - : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs) + : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs), stats(this) { fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n"); fatal_if(simdId < 0, "Illegal SIMD id for VRF"); @@ -192,26 +192,15 @@ RegisterFile::dispatchInstruction(GPUDynInstPtr ii) { } -void -RegisterFile::regStats() -{ - registerReads - .name(name() + ".register_reads") - .desc("Total number of DWORDs read from register file") - ; - - registerWrites - .name(name() + ".register_writes") - .desc("Total number of DWORDS written to register file") - ; - - sramReads - .name(name() + ".sram_reads") - .desc("Total number of register file bank SRAM activations for reads") - ; - - sramWrites - .name(name() + ".sram_writes") - .desc("Total number of register file bank SRAM activations for writes") - ; +RegisterFile::RegisterFileStats::RegisterFileStats(Stats::Group *parent) + : Stats::Group(parent), + ADD_STAT(registerReads, + "Total number of DWORDs read from register file"), + ADD_STAT(registerWrites, + "Total number of DWORDS written to register file"), + ADD_STAT(sramReads, + "Total number of register file bank SRAM activations for reads"), + ADD_STAT(sramWrites, + "Total number of register file bank SRAM activations for writes") +{ } diff --git a/src/gpu-compute/register_file.hh b/src/gpu-compute/register_file.hh index 8a417a357..75913cedb 100644 --- a/src/gpu-compute/register_file.hh +++ b/src/gpu-compute/register_file.hh @@ -62,7 +62,6 @@ class RegisterFile : public SimObject virtual ~RegisterFile(); virtual void setParent(ComputeUnit *_computeUnit); int numRegs() const { return _numRegs; } - virtual void regStats() override; // State functions @@ -154,18 +153,23 @@ class RegisterFile : public SimObject // numer of registers in this register file int _numRegs; - // Stats - // Total number of register reads, incremented once per DWORD per thread - Stats::Scalar registerReads; - // Total number of register writes, incremented once per DWORD per thread - Stats::Scalar registerWrites; - - // Number of register file SRAM activations for reads. - // The register file may be implemented with multiple SRAMs. This stat - // tracks how many times the SRAMs are accessed for reads. - Stats::Scalar sramReads; - // Number of register file SRAM activations for writes - Stats::Scalar sramWrites; + + struct RegisterFileStats : public Stats::Group + { + RegisterFileStats(Stats::Group *parent); + + // Total number of register reads per DWORD per thread + Stats::Scalar registerReads; + // Total number of register writes per DWORD per thread + Stats::Scalar registerWrites; + + // Number of register file SRAM activations for reads. + // The register file may be implemented with multiple SRAMs. This stat + // tracks how many times the SRAMs are accessed for reads. 
+ Stats::Scalar sramReads; + // Number of register file SRAM activations for writes + Stats::Scalar sramWrites; + } stats; }; #endif // __REGISTER_FILE_HH__ diff --git a/src/gpu-compute/register_manager.cc b/src/gpu-compute/register_manager.cc index f8487554b..781ecc2e7 100644 --- a/src/gpu-compute/register_manager.cc +++ b/src/gpu-compute/register_manager.cc @@ -129,9 +129,3 @@ RegisterManager::freeRegisters(Wavefront* w) { policy->freeRegisters(w); } - -void -RegisterManager::regStats() -{ - policy->regStats(); -} diff --git a/src/gpu-compute/register_manager.hh b/src/gpu-compute/register_manager.hh index e09a748f1..448523f3e 100644 --- a/src/gpu-compute/register_manager.hh +++ b/src/gpu-compute/register_manager.hh @@ -63,9 +63,6 @@ class RegisterManager : public SimObject void setParent(ComputeUnit *cu); void exec(); - // Stats related variables and methods - void regStats(); - // lookup virtual to physical register translation int mapVgpr(Wavefront* w, int vgprIndex); int mapSgpr(Wavefront* w, int sgprIndex); diff --git a/src/gpu-compute/register_manager_policy.hh b/src/gpu-compute/register_manager_policy.hh index 2a5a2eb1e..e4f34760a 100644 --- a/src/gpu-compute/register_manager_policy.hh +++ b/src/gpu-compute/register_manager_policy.hh @@ -76,9 +76,6 @@ class RegisterManagerPolicy // free all remaining registers held by specified WF virtual void freeRegisters(Wavefront *w) = 0; - // stats - virtual void regStats() = 0; - protected: ComputeUnit *cu; }; diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index 35db8a351..f80095c80 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -142,8 +142,3 @@ ScalarMemPipeline::exec() computeUnit.cu_id, mp->simdId, mp->wfSlotId); } } - -void -ScalarMemPipeline::regStats() -{ -} diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh index 7f1acecbb..0c015d1c8 100644 --- a/src/gpu-compute/scalar_memory_pipeline.hh +++ b/src/gpu-compute/scalar_memory_pipeline.hh @@ -85,7 +85,6 @@ class ScalarMemPipeline } const std::string& name() const { return _name; } - void regStats(); private: ComputeUnit &computeUnit; diff --git a/src/gpu-compute/scalar_register_file.cc b/src/gpu-compute/scalar_register_file.cc index d8083ea31..33267ceea 100644 --- a/src/gpu-compute/scalar_register_file.cc +++ b/src/gpu-compute/scalar_register_file.cc @@ -66,11 +66,11 @@ ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const if (regBusy(pSgpr)) { if (ii->isDstOperand(i)) { - w->numTimesBlockedDueWAXDependencies++; + w->stats.numTimesBlockedDueWAXDependencies++; } else if (ii->isSrcOperand(i)) { DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n", w->wfDynId, ii->disassemble(), pSgpr); - w->numTimesBlockedDueRAWDependencies++; + w->stats.numTimesBlockedDueRAWDependencies++; } return false; } @@ -109,7 +109,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) { int DWORDs = ii->getOperandSize(i) <= 4 ? 
1 : ii->getOperandSize(i) / 4; - registerReads += DWORDs; + stats.registerReads += DWORDs; } } @@ -128,7 +128,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) enqRegFreeEvent(physReg, tickDelay); } - registerWrites += nRegs; + stats.registerWrites += nRegs; } } } @@ -152,7 +152,7 @@ ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, enqRegFreeEvent(physReg, computeUnit->clockPeriod()); } - registerWrites += nRegs; + stats.registerWrites += nRegs; } } } diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 54e931306..02580fe7d 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -51,7 +51,7 @@ ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu, _name(cu.name() + ".ScheduleStage"), vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false), scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false), - locMemBusRdy(false), locMemIssueRdy(false) + locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits()) { for (int j = 0; j < cu.numExeUnits(); ++j) { scheduler.emplace_back(p); @@ -121,10 +121,10 @@ ScheduleStage::exec() // If no wave is ready to be scheduled on the execution resource // then skip scheduling for this execution resource if (!readyListSize) { - rdyListEmpty[j]++; + stats.rdyListEmpty[j]++; continue; } - rdyListNotEmpty[j]++; + stats.rdyListNotEmpty[j]++; // Pick a wave and attempt to add it to schList Wavefront *wf = scheduler[j].chooseWave(); @@ -133,8 +133,8 @@ ScheduleStage::exec() if (!addToSchList(j, gpu_dyn_inst)) { // For waves not added to schList, increment count of cycles // this wave spends in SCH stage. - wf->schCycles++; - addToSchListStalls[j]++; + wf->stats.schCycles++; + stats.addToSchListStalls[j]++; } else { if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) { wf->incLGKMInstsIssued(); @@ -160,10 +160,10 @@ ScheduleStage::exec() // If no wave is ready to be scheduled on the execution resource // then skip scheduling for this execution resource if (!readyListSize) { - rdyListEmpty[j]++; + stats.rdyListEmpty[j]++; continue; } - rdyListNotEmpty[j]++; + stats.rdyListNotEmpty[j]++; // Pick a wave and attempt to add it to schList Wavefront *wf = scheduler[j].chooseWave(); @@ -172,8 +172,8 @@ ScheduleStage::exec() if (!addToSchList(j, gpu_dyn_inst)) { // For waves not added to schList, increment count of cycles // this wave spends in SCH stage. 
- wf->schCycles++; - addToSchListStalls[j]++; + wf->stats.schCycles++; + stats.addToSchListStalls[j]++; } } @@ -241,17 +241,17 @@ ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst) computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst); return true; } else { - rfAccessStalls[SCH_RF_ACCESS_NRDY]++; + stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++; if (!accessSrfWr) { - rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++; + stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++; } if (!accessVrfWr) { - rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++; + stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++; } // Increment stall counts for WF - wf->schStalls++; - wf->schRfAccessStalls++; + wf->stats.schStalls++; + wf->stats.schRfAccessStalls++; } return false; } @@ -329,19 +329,19 @@ ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst) return true; } else { // Number of stall cycles due to RF access denied - rfAccessStalls[SCH_RF_ACCESS_NRDY]++; + stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++; // Count number of denials due to each reason // Multiple items may contribute to the denied request if (!accessVrf) { - rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++; + stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++; } if (!accessSrf) { - rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++; + stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++; } // Increment stall counts for WF - wf->schStalls++; - wf->schRfAccessStalls++; + wf->stats.schStalls++; + wf->stats.schRfAccessStalls++; DPRINTF(GPUSched, "schList[%d]: Could not add: " "SIMD[%d] WV[%d]: %d: %s\n", exeType, wf->simdId, wf->wfDynId, @@ -424,26 +424,26 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) // TODO: Scalar NOP does not require SALU in hardware, // and is executed out of IB directly. 
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { - dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; return false; } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) { - dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; + stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; return false; } } else if (gpu_dyn_inst->isEndOfKernel()) { // EndPgm instruction if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { - dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; return false; } } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch() || gpu_dyn_inst->isALU()) { // Barrier, Branch, or ALU instruction if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { - dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; return false; } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) { - dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; + stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; return false; } } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) { @@ -451,19 +451,19 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) bool rdy = true; if (!glbMemIssueRdy) { rdy = false; - dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++; + stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++; } if (!glbMemBusRdy) { rdy = false; - dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++; + stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++; } if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) { rdy = false; - dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++; + stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++; } if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) { rdy = false; - dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++; + stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++; } if (!rdy) { return false; @@ -473,18 +473,18 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) bool rdy = true; if (!scalarMemIssueRdy) { rdy = false; - dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++; + stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++; } if (!scalarMemBusRdy) { rdy = false; - dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++; + stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++; } if (!computeUnit.scalarMemoryPipe .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe + wf->scalarWrGmReqsInPipe)) { rdy = false; - dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++; + stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++; } if (!rdy) { return false; @@ -494,16 +494,16 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) bool rdy = true; if (!locMemIssueRdy) { rdy = false; - dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++; + stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++; } if (!locMemBusRdy) { rdy = false; - dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++; + stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++; } if (!computeUnit.localMemoryPipe. 
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) { rdy = false; - dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++; + stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++; } if (!rdy) { return false; @@ -513,24 +513,24 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) bool rdy = true; if (!glbMemIssueRdy || !locMemIssueRdy) { rdy = false; - dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++; + stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++; } if (!glbMemBusRdy || !locMemBusRdy) { rdy = false; - dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++; + stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++; } if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) { rdy = false; - dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++; + stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++; } if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) { rdy = false; - dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++; + stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++; } if (!computeUnit.localMemoryPipe. isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) { rdy = false; - dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++; + stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++; } if (!rdy) { return false; @@ -540,7 +540,7 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) gpu_dyn_inst->disassemble()); return false; } - dispNrdyStalls[SCH_RDY]++; + stats.dispNrdyStalls[SCH_RDY]++; return true; } @@ -584,10 +584,10 @@ ScheduleStage::fillDispatchList() } else { // Either another wave has been dispatched, or this wave // was not ready, so it is stalled this cycle - schIter->first->wavefront()->schStalls++; + schIter->first->wavefront()->stats.schStalls++; if (!dispRdy) { // not ready for dispatch, increment stall stat - schIter->first->wavefront()->schResourceStalls++; + schIter->first->wavefront()->stats.schResourceStalls++; } // Examine next wave for this resource schIter++; @@ -601,9 +601,9 @@ ScheduleStage::fillDispatchList() // Increment stall count if no wave sent to dispatchList for // current execution resource if (!dispatched) { - schListToDispListStalls[j]++; + stats.schListToDispListStalls[j]++; } else { - schListToDispList[j]++; + stats.schListToDispList[j]++; } } } @@ -635,9 +635,9 @@ ScheduleStage::arbitrateVrfToLdsBus() reinsertToSchList(wf->localMem, toExecute .readyInst(wf->localMem)); // Increment stall stats for LDS-VRF arbitration - ldsBusArbStalls++; + stats.ldsBusArbStalls++; toExecute.readyInst(wf->localMem) - ->wavefront()->schLdsArbStalls++; + ->wavefront()->stats.schLdsArbStalls++; } // With arbitration of LM pipe complete, transition the // LM pipe to SKIP state in the dispatchList to inform EX stage @@ -663,7 +663,7 @@ ScheduleStage::checkRfOperandReadComplete() // Increment the number of cycles the wave spends in the // SCH stage, since this loop visits every wave in SCH. 
- wf->schCycles++; + wf->stats.schCycles++; bool vrfRdy = true; if (!gpu_dyn_inst->isScalar()) { @@ -690,15 +690,15 @@ ScheduleStage::checkRfOperandReadComplete() p.second = RFBUSY; // Increment stall stats - wf->schStalls++; - wf->schOpdNrdyStalls++; + wf->stats.schStalls++; + wf->stats.schOpdNrdyStalls++; - opdNrdyStalls[SCH_RF_OPD_NRDY]++; + stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++; if (!vrfRdy) { - opdNrdyStalls[SCH_VRF_OPD_NRDY]++; + stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++; } if (!srfRdy) { - opdNrdyStalls[SCH_SRF_OPD_NRDY]++; + stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++; } } } @@ -777,60 +777,40 @@ ScheduleStage::deleteFromSch(Wavefront *w) wavesInSch.erase(w->wfDynId); } -void -ScheduleStage::regStats() +ScheduleStage::ScheduleStageStats::ScheduleStageStats(Stats::Group *parent, + int num_exec_units) + : Stats::Group(parent, "ScheduleStage"), + ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per " + "execution resource"), + ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready " + "list per execution resource"), + ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to " + "schList per execution resource when ready list is not empty"), + ADD_STAT(schListToDispList, "number of cycles a wave is added to " + "dispatchList per execution resource"), + ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to" + " dispatchList per execution resource"), + ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"), + ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus " + "conflicts"), + ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not " + "ready"), + ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not " + "ready") { - rdyListNotEmpty - .init(computeUnit.numExeUnits()) - .name(name() + ".rdy_list_not_empty") - .desc("number of cycles one or more wave on ready list per " - "execution resource") - ; - - rdyListEmpty - .init(computeUnit.numExeUnits()) - .name(name() + ".rdy_list_empty") - .desc("number of cycles no wave on ready list per " - "execution resource") - ; - - addToSchListStalls - .init(computeUnit.numExeUnits()) - .name(name() + ".sch_list_add_stalls") - .desc("number of cycles a wave is not added to schList per " - "execution resource when ready list is not empty") - ; - - schListToDispList - .init(computeUnit.numExeUnits()) - .name(name() + ".sch_list_to_disp_list") - .desc("number of cycles a wave is added to dispatchList per " - "execution resource") - ; - - schListToDispListStalls - .init(computeUnit.numExeUnits()) - .name(name() + ".sch_list_to_disp_list_stalls") - .desc("number of cycles no wave is added to dispatchList per " - "execution resource") - ; - - // Operand Readiness Stall Cycles - opdNrdyStalls - .init(SCH_RF_OPD_NRDY_CONDITIONS) - .name(name() + ".opd_nrdy_stalls") - .desc("number of stalls in SCH due to operands not ready") - ; + rdyListNotEmpty.init(num_exec_units); + rdyListEmpty.init(num_exec_units); + addToSchListStalls.init(num_exec_units); + schListToDispList.init(num_exec_units); + schListToDispListStalls.init(num_exec_units); + opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS); + dispNrdyStalls.init(SCH_NRDY_CONDITIONS); + rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS); + opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF")); opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF")); opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF")); - // dispatchReady Stall Cycles - dispNrdyStalls - .init(SCH_NRDY_CONDITIONS) - .name(name() + 
".disp_nrdy_stalls") - .desc("number of stalls in SCH due to resource not ready") - ; dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu")); dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu")); dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY, @@ -862,21 +842,9 @@ ScheduleStage::regStats() csprintf("FlatMemFIFO")); dispNrdyStalls.subname(SCH_RDY, csprintf("Ready")); - // RF Access Stall Cycles - rfAccessStalls - .init(SCH_RF_ACCESS_NRDY_CONDITIONS) - .name(name() + ".rf_access_stalls") - .desc("number of stalls due to RF access denied") - ; rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd")); rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr")); rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd")); rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr")); rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any")); - - // Stall cycles due to wave losing LDS bus arbitration - ldsBusArbStalls - .name(name() + ".lds_bus_arb_stalls") - .desc("number of stalls due to VRF->LDS bus conflicts") - ; } diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh index 1a9aca17e..ede2a45db 100644 --- a/src/gpu-compute/schedule_stage.hh +++ b/src/gpu-compute/schedule_stage.hh @@ -40,6 +40,8 @@ #include #include +#include "base/statistics.hh" +#include "base/stats/group.hh" #include "gpu-compute/exec_stage.hh" #include "gpu-compute/misc.hh" #include "gpu-compute/scheduler.hh" @@ -105,8 +107,6 @@ class ScheduleStage SCH_RF_ACCESS_NRDY_CONDITIONS }; - void regStats(); - // Called by ExecStage to inform SCH of instruction execution void deleteFromSch(Wavefront *w); @@ -126,48 +126,6 @@ class ScheduleStage // scheduler and a dispatch list std::vector scheduler; - // Stats - - // Number of cycles with empty (or not empty) readyList, per execution - // resource, when the CU is active (not sleeping) - Stats::Vector rdyListEmpty; - Stats::Vector rdyListNotEmpty; - - // Number of cycles, per execution resource, when at least one wave - // was on the readyList and picked by scheduler, but was unable to be - // added to the schList, when the CU is active (not sleeping) - Stats::Vector addToSchListStalls; - - // Number of cycles, per execution resource, when a wave is selected - // as candidate for dispatchList from schList - // Note: may be arbitrated off dispatchList (e.g., LDS arbitration) - Stats::Vector schListToDispList; - - // Per execution resource stat, incremented once per cycle if no wave - // was selected as candidate for dispatch and moved to dispatchList - Stats::Vector schListToDispListStalls; - - // Number of times a wave is selected by the scheduler but cannot - // be added to the schList due to register files not being able to - // support reads or writes of operands. RF_ACCESS_NRDY condition is always - // incremented if at least one read/write not supported, other - // conditions are incremented independently from each other. 
- Stats::Vector rfAccessStalls; - - // Number of times a wave is executing FLAT instruction and - // forces another wave occupying its required local memory resource - // to be deselected for execution, and placed back on schList - Stats::Scalar ldsBusArbStalls; - - // Count of times VRF and/or SRF blocks waves on schList from - // performing RFBUSY->RFREADY transition - Stats::Vector opdNrdyStalls; - - // Count of times resource required for dispatch is not ready and - // blocks wave in RFREADY state on schList from potentially moving - // to dispatchList - Stats::Vector dispNrdyStalls; - const std::string _name; // called by exec() to add a wave to schList if the RFs can support it @@ -221,6 +179,52 @@ class ScheduleStage // the VRF/SRF availability or limits imposed by parameters (to be added) // of the SCH stage or CU. std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList; + + protected: + struct ScheduleStageStats : public Stats::Group + { + ScheduleStageStats(Stats::Group *parent, int num_exec_units); + + // Number of cycles with empty (or not empty) readyList, per execution + // resource, when the CU is active (not sleeping) + Stats::Vector rdyListEmpty; + Stats::Vector rdyListNotEmpty; + + // Number of cycles, per execution resource, when at least one wave + // was on the readyList and picked by scheduler, but was unable to be + // added to the schList, when the CU is active (not sleeping) + Stats::Vector addToSchListStalls; + + // Number of cycles, per execution resource, when a wave is selected + // as candidate for dispatchList from schList + // Note: may be arbitrated off dispatchList (e.g., LDS arbitration) + Stats::Vector schListToDispList; + + // Per execution resource stat, incremented once per cycle if no wave + // was selected as candidate for dispatch and moved to dispatchList + Stats::Vector schListToDispListStalls; + + // Number of times a wave is selected by the scheduler but cannot + // be added to the schList due to register files not being able to + // support reads or writes of operands. RF_ACCESS_NRDY condition is + // always incremented if at least one read/write not supported, other + // conditions are incremented independently from each other.
+ Stats::Vector rfAccessStalls; + + // Number of times a wave is executing FLAT instruction and + // forces another wave occupying its required local memory resource + // to be deselected for execution, and placed back on schList + Stats::Scalar ldsBusArbStalls; + + // Count of times VRF and/or SRF blocks waves on schList from + // performing RFBUSY->RFREADY transition + Stats::Vector opdNrdyStalls; + + // Count of times resource required for dispatch is not ready and + // blocks wave in RFREADY state on schList from potentially moving + // to dispatchList + Stats::Vector dispNrdyStalls; + } stats; }; #endif // __SCHEDULE_STAGE_HH__ diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc index dfda0ad79..c246279d6 100644 --- a/src/gpu-compute/scoreboard_check_stage.cc +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -49,7 +49,7 @@ ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams &p, ScoreboardCheckToSchedule &to_schedule) : computeUnit(cu), toSchedule(to_schedule), - _name(cu.name() + ".ScoreboardCheckStage") + _name(cu.name() + ".ScoreboardCheckStage"), stats(&cu) { } @@ -62,7 +62,7 @@ ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus) { panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS, "Instruction ready status %d is illegal!!!", rdyStatus); - stallCycles[rdyStatus]++; + stats.stallCycles[rdyStatus]++; } // Return true if this wavefront is ready @@ -266,14 +266,13 @@ ScoreboardCheckStage::exec() } } -void -ScoreboardCheckStage::regStats() +ScoreboardCheckStage:: +ScoreboardCheckStageStats::ScoreboardCheckStageStats(Stats::Group *parent) + : Stats::Group(parent, "ScoreboardCheckStage"), + ADD_STAT(stallCycles, "number of cycles wave stalled in SCB") { - stallCycles - .init(NRDY_CONDITIONS) - .name(name() + ".stall_cycles") - .desc("number of cycles wave stalled in SCB") - ; + stallCycles.init(NRDY_CONDITIONS); + stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop")); stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty")); stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt")); diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh index c45ea7571..419dffb9f 100644 --- a/src/gpu-compute/scoreboard_check_stage.hh +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -40,7 +40,8 @@ #include #include -#include "sim/stats.hh" +#include "base/statistics.hh" +#include "base/stats/group.hh" class ComputeUnit; class ScoreboardCheckToSchedule; @@ -78,7 +79,6 @@ class ScoreboardCheckStage // Stats related variables and methods const std::string& name() const { return _name; } - void regStats(); private: void collectStatistics(nonrdytype_e rdyStatus); @@ -94,10 +94,15 @@ class ScoreboardCheckStage */ ScoreboardCheckToSchedule &toSchedule; - // Stats - Stats::Vector stallCycles; - const std::string _name; + + protected: + struct ScoreboardCheckStageStats : public Stats::Group + { + ScoreboardCheckStageStats(Stats::Group *parent); + + Stats::Vector stallCycles; + } stats; }; #endif // __SCOREBOARD_CHECK_STAGE_HH__ diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 9ae3fd7ce..dcb0d8b9c 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -65,7 +65,8 @@ Shader::Shader(const Params &p) : ClockedObject(p), globalMemSize(p.globalmem), nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc), _dispatcher(*p.dispatcher), - max_valu_insts(p.max_valu_insts), total_valu_insts(0) + max_valu_insts(p.max_valu_insts), total_valu_insts(0), + 
stats(this, p.CUs[0]->wfSize()) { gpuCmdProc.setShader(this); _dispatcher.setShader(this); @@ -278,86 +279,6 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task) return scheduledSomething; } -void -Shader::regStats() -{ - ClockedObject::regStats(); - - shaderActiveTicks - .name(name() + ".shader_active_ticks") - .desc("Total ticks that any CU attached to this shader is active") - ; - allLatencyDist - .init(0, 1600000, 10000) - .name(name() + ".allLatencyDist") - .desc("delay distribution for all") - .flags(Stats::pdf | Stats::oneline); - - loadLatencyDist - .init(0, 1600000, 10000) - .name(name() + ".loadLatencyDist") - .desc("delay distribution for loads") - .flags(Stats::pdf | Stats::oneline); - - storeLatencyDist - .init(0, 1600000, 10000) - .name(name() + ".storeLatencyDist") - .desc("delay distribution for stores") - .flags(Stats::pdf | Stats::oneline); - - vectorInstSrcOperand - .init(4) - .name(name() + ".vec_inst_src_operand") - .desc("vector instruction source operand distribution"); - - vectorInstDstOperand - .init(4) - .name(name() + ".vec_inst_dst_operand") - .desc("vector instruction destination operand distribution"); - - initToCoalesceLatency - .init(0, 1600000, 10000) - .name(name() + ".initToCoalesceLatency") - .desc("Ticks from vmem inst initiateAcc to coalescer issue") - .flags(Stats::pdf | Stats::oneline); - - rubyNetworkLatency - .init(0, 1600000, 10000) - .name(name() + ".rubyNetworkLatency") - .desc("Ticks from coalescer issue to coalescer hit callback") - .flags(Stats::pdf | Stats::oneline); - - gmEnqueueLatency - .init(0, 1600000, 10000) - .name(name() + ".gmEnqueueLatency") - .desc("Ticks from coalescer hit callback to GM pipe enqueue") - .flags(Stats::pdf | Stats::oneline); - - gmToCompleteLatency - .init(0, 1600000, 10000) - .name(name() + ".gmToCompleteLatency") - .desc("Ticks queued in GM pipes ordered response buffer") - .flags(Stats::pdf | Stats::oneline); - - coalsrLineAddresses - .init(0, 20, 1) - .name(name() + ".coalsrLineAddresses") - .desc("Number of cache lines for coalesced request") - .flags(Stats::pdf | Stats::oneline); - - int wfSize = cuList[0]->wfSize(); - cacheBlockRoundTrip = new Stats::Distribution[wfSize]; - for (int idx = 0; idx < wfSize; ++idx) { - std::stringstream namestr; - ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx); - cacheBlockRoundTrip[idx] - .init(0, 1600000, 10000) - .name(namestr.str()) - .desc("Coalsr-to-coalsr time for the Nth cache block in an inst") - .flags(Stats::pdf | Stats::oneline); - } -} - void Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id) @@ -528,8 +449,8 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) void Shader::sampleStore(const Tick accessTime) { - storeLatencyDist.sample(accessTime); - allLatencyDist.sample(accessTime); + stats.storeLatencyDist.sample(accessTime); + stats.allLatencyDist.sample(accessTime); } /* @@ -538,8 +459,8 @@ Shader::sampleStore(const Tick accessTime) void Shader::sampleLoad(const Tick accessTime) { - loadLatencyDist.sample(accessTime); - allLatencyDist.sample(accessTime); + stats.loadLatencyDist.sample(accessTime); + stats.allLatencyDist.sample(accessTime); } void @@ -556,16 +477,16 @@ Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime) Tick t4 = roundTripTime[3]; Tick t5 = roundTripTime[4]; - initToCoalesceLatency.sample(t2-t1); - rubyNetworkLatency.sample(t3-t2); - gmEnqueueLatency.sample(t4-t3); - gmToCompleteLatency.sample(t5-t4); + stats.initToCoalesceLatency.sample(t2-t1); +
stats.rubyNetworkLatency.sample(t3-t2); + stats.gmEnqueueLatency.sample(t4-t3); + stats.gmToCompleteLatency.sample(t5-t4); } void Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap) { - coalsrLineAddresses.sample(lineMap.size()); + stats.coalsrLineAddresses.sample(lineMap.size()); std::vector<Tick> netTimes; // For each cache block address generated by a vmem inst, calculate @@ -586,7 +507,7 @@ Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap) // Nth distribution. int idx = 0; for (auto& time : netTimes) { - cacheBlockRoundTrip[idx].sample(time); + stats.cacheBlockRoundTrip[idx].sample(time); ++idx; } } @@ -598,5 +519,75 @@ Shader::notifyCuSleep() { "Invalid activeCu size\n"); _activeCus--; if (!_activeCus) - shaderActiveTicks += curTick() - _lastInactiveTick; + stats.shaderActiveTicks += curTick() - _lastInactiveTick; +} + +Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size) + : Stats::Group(parent), + ADD_STAT(allLatencyDist, "delay distribution for all"), + ADD_STAT(loadLatencyDist, "delay distribution for loads"), + ADD_STAT(storeLatencyDist, "delay distribution for stores"), + ADD_STAT(initToCoalesceLatency, + "Ticks from vmem inst initiateAcc to coalescer issue"), + ADD_STAT(rubyNetworkLatency, + "Ticks from coalescer issue to coalescer hit callback"), + ADD_STAT(gmEnqueueLatency, + "Ticks from coalescer hit callback to GM pipe enqueue"), + ADD_STAT(gmToCompleteLatency, + "Ticks queued in GM pipes ordered response buffer"), + ADD_STAT(coalsrLineAddresses, + "Number of cache lines for coalesced request"), + ADD_STAT(shaderActiveTicks, + "Total ticks that any CU attached to this shader is active"), + ADD_STAT(vectorInstSrcOperand, + "vector instruction source operand distribution"), + ADD_STAT(vectorInstDstOperand, + "vector instruction destination operand distribution") +{ + allLatencyDist + .init(0, 1600000, 10000) + .flags(Stats::pdf | Stats::oneline); + + loadLatencyDist + .init(0, 1600000, 10000) + .flags(Stats::pdf | Stats::oneline); + + storeLatencyDist + .init(0, 1600000, 10000) + .flags(Stats::pdf | Stats::oneline); + + initToCoalesceLatency + .init(0, 1600000, 10000) + .flags(Stats::pdf | Stats::oneline); + + rubyNetworkLatency + .init(0, 1600000, 10000) + .flags(Stats::pdf | Stats::oneline); + + gmEnqueueLatency + .init(0, 1600000, 10000) + .flags(Stats::pdf | Stats::oneline); + + gmToCompleteLatency + .init(0, 1600000, 10000) + .flags(Stats::pdf | Stats::oneline); + + coalsrLineAddresses + .init(0, 20, 1) + .flags(Stats::pdf | Stats::oneline); + + vectorInstSrcOperand.init(4); + vectorInstDstOperand.init(4); + + cacheBlockRoundTrip = new Stats::Distribution[wf_size]; + for (int idx = 0; idx < wf_size; ++idx) { + std::stringstream namestr; + ccprintf(namestr, "%s.cacheBlockRoundTrip%d", + static_cast<Shader*>(parent)->name(), idx); + cacheBlockRoundTrip[idx] + .init(0, 1600000, 10000) + .name(namestr.str()) + .desc("Coalsr-to-coalsr time for the Nth cache block in an inst") + .flags(Stats::pdf | Stats::oneline); + } } diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 76ee3c9f7..125df1a36 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -40,6 +40,8 @@ #include #include "arch/isa.hh" +#include "base/statistics.hh" +#include "base/stats/group.hh" #include "base/types.hh" #include "cpu/simple/atomic.hh" #include "cpu/simple/timing.hh" @@ -98,26 +100,6 @@ class Shader : public ClockedObject // Last tick that all CUs attached to this shader were inactive Tick _lastInactiveTick; - // some stats for measuring latency -
Stats::Distribution allLatencyDist; - Stats::Distribution loadLatencyDist; - Stats::Distribution storeLatencyDist; - - // average ticks from vmem inst initiateAcc to coalescer issue, - // average ticks from coalescer issue to coalescer hit callback, - // average ticks from coalescer hit callback to GM pipe enqueue, - // and average ticks spent in GM pipe's ordered resp buffer. - Stats::Distribution initToCoalesceLatency; - Stats::Distribution rubyNetworkLatency; - Stats::Distribution gmEnqueueLatency; - Stats::Distribution gmToCompleteLatency; - - // average number of cache blocks requested by vmem inst, and - // average ticks for cache blocks to main memory for the Nth - // cache block generated by a vmem inst. - Stats::Distribution coalsrLineAddresses; - Stats::Distribution *cacheBlockRoundTrip; - public: typedef ShaderParams Params; enum hsail_mode_e {SIMT,VECTOR_SCALAR}; @@ -249,14 +231,6 @@ class Shader : public ClockedObject GPUCommandProcessor &gpuCmdProc; GPUDispatcher &_dispatcher; - /** - * Statistics - */ - Stats::Scalar shaderActiveTicks; - Stats::Vector vectorInstSrcOperand; - Stats::Vector vectorInstDstOperand; - void regStats(); - int64_t max_valu_insts; int64_t total_valu_insts; @@ -301,6 +275,52 @@ class Shader : public ClockedObject void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); void updateContext(int cid); void notifyCuSleep(); + + void + incVectorInstSrcOperand(int num_operands) + { + stats.vectorInstSrcOperand[num_operands]++; + } + + void + incVectorInstDstOperand(int num_operands) + { + stats.vectorInstDstOperand[num_operands]++; + } + + protected: + struct ShaderStats : public Stats::Group + { + ShaderStats(Stats::Group *parent, int wf_size); + + // some stats for measuring latency + Stats::Distribution allLatencyDist; + Stats::Distribution loadLatencyDist; + Stats::Distribution storeLatencyDist; + + // average ticks from vmem inst initiateAcc to coalescer issue, + Stats::Distribution initToCoalesceLatency; + + // average ticks from coalescer issue to coalescer hit callback, + Stats::Distribution rubyNetworkLatency; + + // average ticks from coalescer hit callback to GM pipe enqueue, + Stats::Distribution gmEnqueueLatency; + + // average ticks spent in GM pipe's ordered resp buffer. + Stats::Distribution gmToCompleteLatency; + + // average number of cache blocks requested by vmem inst + Stats::Distribution coalsrLineAddresses; + + // average ticks for cache blocks to main memory for the Nth + // cache block generated by a vmem inst. 
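// Note on the raw pointer above: ADD_STAT can only register a fixed member,
// so a runtime-sized array of Distributions is still allocated and named by
// hand inside the group constructor (assumed shape, mirroring the .cc hunk;
// shader_name stands in for the parent's name):
//
//     cacheBlockRoundTrip = new Stats::Distribution[wf_size];
//     cacheBlockRoundTrip[idx]
//         .init(0, 1600000, 10000)
//         .name(csprintf("%s.cacheBlockRoundTrip%d", shader_name, idx))
//         .desc("coalescer-to-coalescer time for the Nth cache block")
//         .flags(Stats::pdf | Stats::oneline);
//
// As in the regStats() version it replaces, the array is never delete[]d.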
+        Stats::Distribution *cacheBlockRoundTrip;
+
+        Stats::Scalar shaderActiveTicks;
+        Stats::Vector vectorInstSrcOperand;
+        Stats::Vector vectorInstDstOperand;
+    } stats;
 };
 
 #endif // __SHADER_HH__
diff --git a/src/gpu-compute/static_register_manager_policy.cc b/src/gpu-compute/static_register_manager_policy.cc
index f1bc1e6f1..62b29cff3 100644
--- a/src/gpu-compute/static_register_manager_policy.cc
+++ b/src/gpu-compute/static_register_manager_policy.cc
@@ -180,8 +180,3 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
     w->reservedScalarRegs = 0;
     w->startSgprIndex = 0;
 }
-
-void
-StaticRegisterManagerPolicy::regStats()
-{
-}
diff --git a/src/gpu-compute/static_register_manager_policy.hh b/src/gpu-compute/static_register_manager_policy.hh
index 6abeb1d1a..812232f4a 100644
--- a/src/gpu-compute/static_register_manager_policy.hh
+++ b/src/gpu-compute/static_register_manager_policy.hh
@@ -58,8 +58,6 @@ class StaticRegisterManagerPolicy : public RegisterManagerPolicy
                       int scalarDemand) override;
 
     void freeRegisters(Wavefront *w) override;
-
-    void regStats() override;
 };
 
 #endif // __STATIC_REGISTER_MANAGER_POLICY_HH__
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc
index 39db4ee13..aaf470f4e 100644
--- a/src/gpu-compute/tlb_coalescer.cc
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -50,7 +50,8 @@ TLBCoalescer::TLBCoalescer(const Params &p)
                    false, Event::CPU_Tick_Pri),
       cleanupEvent([this]{ processCleanupEvent(); },
                    "Cleanup issuedTranslationsTable hashmap",
-                   false, Event::Maximum_Pri)
+                   false, Event::Maximum_Pri),
+      stats(this)
 {
     // create the response ports based on the number of connected ports
     for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
@@ -256,11 +257,11 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
         sender_state->reqCnt.push_back(req_cnt);
 
         // update statistics
-        coalescer->uncoalescedAccesses++;
+        coalescer->stats.uncoalescedAccesses++;
         req_cnt = sender_state->reqCnt.back();
         DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
-        coalescer->queuingCycles -= (curTick() * req_cnt);
-        coalescer->localqueuingCycles -= curTick();
+        coalescer->stats.queuingCycles -= (curTick() * req_cnt);
+        coalescer->stats.localqueuingCycles -= curTick();
     }
 
     // FIXME if you want to coalesce not based on the issueTime
@@ -302,7 +303,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
     // and make necessary allocations.
     if (!coalescedReq_cnt || !didCoalesce) {
         if (update_stats)
-            coalescer->coalescedAccesses++;
+            coalescer->stats.coalescedAccesses++;
 
         std::vector<PacketPtr> new_array;
         new_array.push_back(pkt);
@@ -339,7 +340,7 @@ TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
     bool update_stats = !sender_state->prefetch;
 
     if (update_stats)
-        coalescer->uncoalescedAccesses++;
+        coalescer->stats.uncoalescedAccesses++;
 
     // If there is a pending timing request for this virtual address
     // print a warning message. This is a temporary caveat of
@@ -467,7 +468,7 @@ TLBCoalescer::processProbeTLBEvent()
                     // by the one we just sent counting all the way from
                     // the top of TLB hiearchy (i.e., from the CU)
                     int req_cnt = tmp_sender_state->reqCnt.back();
-                    queuingCycles += (curTick() * req_cnt);
+                    stats.queuingCycles += (curTick() * req_cnt);
 
                     DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                             name(), req_cnt);
@@ -475,7 +476,7 @@ TLBCoalescer::processProbeTLBEvent()
                     // pkt_cnt is number of packets we coalesced into the one
                     // we just sent but only at this coalescer level
                     int pkt_cnt = iter->second[vector_index].size();
-                    localqueuingCycles += (curTick() * pkt_cnt);
+                    stats.localqueuingCycles += (curTick() * pkt_cnt);
                 }
 
                 DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
@@ -520,35 +521,14 @@ TLBCoalescer::processCleanupEvent()
     }
 }
 
-void
-TLBCoalescer::regStats()
+TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(Stats::Group *parent)
+    : Stats::Group(parent),
+      ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
+      ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
+      ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
+      ADD_STAT(localqueuingCycles,
+               "Number of cycles spent in queue for all incoming reqs"),
+      ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
 {
-    ClockedObject::regStats();
-
-    uncoalescedAccesses
-        .name(name() + ".uncoalesced_accesses")
-        .desc("Number of uncoalesced TLB accesses")
-        ;
-
-    coalescedAccesses
-        .name(name() + ".coalesced_accesses")
-        .desc("Number of coalesced TLB accesses")
-        ;
-
-    queuingCycles
-        .name(name() + ".queuing_cycles")
-        .desc("Number of cycles spent in queue")
-        ;
-
-    localqueuingCycles
-        .name(name() + ".local_queuing_cycles")
-        .desc("Number of cycles spent in queue for all incoming reqs")
-        ;
-
-    localLatency
-        .name(name() + ".local_latency")
-        .desc("Avg. latency over all incoming pkts")
-        ;
-
     localLatency = localqueuingCycles / uncoalescedAccesses;
 }
diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh
index 8b71a982d..ef35ecbaa 100644
--- a/src/gpu-compute/tlb_coalescer.hh
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -115,26 +115,8 @@ class TLBCoalescer : public ClockedObject
     CoalescingTable issuedTranslationsTable;
 
-    // number of packets the coalescer receives
-    Stats::Scalar uncoalescedAccesses;
-    // number packets the coalescer send to the TLB
-    Stats::Scalar coalescedAccesses;
-
-    // Number of cycles the coalesced requests spend waiting in
-    // coalescerFIFO. For each packet the coalescer receives we take into
-    // account the number of all uncoalesced requests this pkt "represents"
-    Stats::Scalar queuingCycles;
-
-    // On average how much time a request from the
-    // uncoalescedAccesses that reaches the TLB
-    // spends waiting?
-    Stats::Scalar localqueuingCycles;
-    // localqueuingCycles/uncoalescedAccesses
-    Stats::Formula localLatency;
-
     bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
     void updatePhysAddresses(PacketPtr pkt);
-    void regStats() override;
 
     class CpuSidePort : public ResponsePort
     {
@@ -211,6 +193,29 @@ class TLBCoalescer : public ClockedObject
     // this FIFO queue keeps track of the virt. page
     // addresses that are pending cleanup
     std::queue<Addr> cleanupQueue;
+
+  protected:
+    struct TLBCoalescerStats : public Stats::Group
+    {
+        TLBCoalescerStats(Stats::Group *parent);
+
+        // number of packets the coalescer receives
+        Stats::Scalar uncoalescedAccesses;
+        // number of packets the coalescer sends to the TLB
+        Stats::Scalar coalescedAccesses;
+
+        // Number of cycles the coalesced requests spend waiting in
+        // coalescerFIFO. For each packet the coalescer receives we take into
+        // account the number of all uncoalesced requests this pkt "represents"
+        Stats::Scalar queuingCycles;
+
+        // On average, how long does a request from uncoalescedAccesses
+        // that reaches the TLB spend waiting in the queue?
+        Stats::Scalar localqueuingCycles;
+        // localqueuingCycles/uncoalescedAccesses
+        Stats::Formula localLatency;
+    } stats;
 };
 
 #endif // __TLB_COALESCER_HH__
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
index c44955cd8..40ce281f6 100644
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -69,11 +69,11 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
                 ->mapVgpr(w, vgprIdx + j);
             if (regBusy(pVgpr)) {
                 if (ii->isDstOperand(i)) {
-                    w->numTimesBlockedDueWAXDependencies++;
+                    w->stats.numTimesBlockedDueWAXDependencies++;
                 } else if (ii->isSrcOperand(i)) {
                     DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                             w->wfDynId, ii->disassemble(), pVgpr);
-                    w->numTimesBlockedDueRAWDependencies++;
+                    w->stats.numTimesBlockedDueRAWDependencies++;
                 }
                 return false;
             }
@@ -125,13 +125,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
 {
     // increment count of number of DWORDs read from VRF
     int DWORDs = ii->numSrcVecDWORDs();
-    registerReads += (DWORDs * w->execMask().count());
+    stats.registerReads += (DWORDs * w->execMask().count());
 
     uint64_t mask = w->execMask().to_ullong();
     int srams = w->execMask().size() / 4;
     for (int i = 0; i < srams; i++) {
         if (mask & 0xF) {
-            sramReads += DWORDs;
+            stats.sramReads += DWORDs;
         }
         mask = mask >> 4;
     }
@@ -163,13 +163,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
 
     // increment count of number of DWORDs written to VRF
     DWORDs = ii->numDstVecDWORDs();
-    registerWrites += (DWORDs * w->execMask().count());
+    stats.registerWrites += (DWORDs * w->execMask().count());
 
     mask = w->execMask().to_ullong();
     srams = w->execMask().size() / 4;
     for (int i = 0; i < srams; i++) {
         if (mask & 0xF) {
-            sramWrites += DWORDs;
+            stats.sramWrites += DWORDs;
         }
         mask = mask >> 4;
     }
@@ -196,13 +196,13 @@ VectorRegisterFile::scheduleWriteOperandsFromLoad(
     }
     // increment count of number of DWORDs written to VRF
     int DWORDs = ii->numDstVecDWORDs();
-    registerWrites += (DWORDs * ii->exec_mask.count());
+    stats.registerWrites += (DWORDs * ii->exec_mask.count());
 
     uint64_t mask = ii->exec_mask.to_ullong();
     int srams = ii->exec_mask.size() / 4;
     for (int i = 0; i < srams; i++) {
         if (mask & 0xF) {
-            sramWrites += DWORDs;
+            stats.sramWrites += DWORDs;
         }
         mask = mask >> 4;
     }
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index b7ff95ab2..343b5c9f2 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -49,7 +49,7 @@ Wavefront::Wavefront(const Params &p)
     maxIbSize(p.max_ib_size), _gpuISA(*this),
     vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
     vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
-    barId(WFBarrier::InvalidID)
+    barId(WFBarrier::InvalidID), stats(this)
 {
     lastTrace = 0;
     execUnitId = -1;
@@ -97,75 +97,6 @@ Wavefront::Wavefront(const Params &p)
     vecReads.clear();
 }
 
-void
-Wavefront::regStats()
-{
-    SimObject::regStats();
-
-    // FIXME: the name of the WF needs to be unique
-    numTimesBlockedDueWAXDependencies
-        .name(name() + ".timesBlockedDueWAXDependencies")
-        .desc("number of times the wf's instructions are blocked due to WAW "
-              "or WAR dependencies")
-        ;
-
-    // FIXME: the name of the WF needs to be unique
-    numTimesBlockedDueRAWDependencies
-        .name(name() + ".timesBlockedDueRAWDependencies")
-        .desc("number of times the wf's instructions are blocked due to RAW "
-              "dependencies")
-        ;
-
-    numInstrExecuted
-        .name(name() + ".num_instr_executed")
-        .desc("number of instructions executed by this WF slot")
-        ;
-
-    schCycles
-        .name(name() + ".sch_cycles")
-        .desc("number of cycles spent in schedule stage")
-        ;
-
-    schStalls
-        .name(name() + ".sch_stalls")
-        .desc("number of cycles WF is stalled in SCH stage")
-        ;
-
-    schRfAccessStalls
-        .name(name() + ".sch_rf_access_stalls")
-        .desc("number of cycles wave selected in SCH but RF denied adding "
-              "instruction")
-        ;
-
-    schResourceStalls
-        .name(name() + ".sch_resource_stalls")
-        .desc("number of cycles stalled in sch by resource not available")
-        ;
-
-    schOpdNrdyStalls
-        .name(name() + ".sch_opd_nrdy_stalls")
-        .desc("number of cycles stalled in sch waiting for RF reads to "
-              "complete")
-        ;
-
-    schLdsArbStalls
-        .name(name() + ".sch_lds_arb_stalls")
-        .desc("number of cycles wave stalled due to LDS-VRF arbitration")
-        ;
-
-    vecRawDistance
-        .init(0,20,1)
-        .name(name() + ".vec_raw_distance")
-        .desc("Count of RAW distance in dynamic instructions for this WF")
-        ;
-
-    readsPerWrite
-        .init(0,4,1)
-        .name(name() + ".vec_reads_per_write")
-        .desc("Count of Vector reads per write for this WF")
-        ;
-}
-
 void
 Wavefront::init()
 {
@@ -959,17 +890,19 @@ Wavefront::exec()
     }
 
     computeUnit->srf[simdId]->waveExecuteInst(this, ii);
-    computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
-    computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
-    computeUnit->numInstrExecuted++;
-    numInstrExecuted++;
+    computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecOperands());
+    computeUnit->shader->incVectorInstDstOperand(ii->numDstVecOperands());
+    computeUnit->stats.numInstrExecuted++;
+    stats.numInstrExecuted++;
     computeUnit->instExecPerSimd[simdId]++;
-    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
-                                     computeUnit->lastExecCycle[simdId]);
-    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
+    computeUnit->stats.execRateDist.sample(
+        computeUnit->stats.totalCycles.value() -
+        computeUnit->lastExecCycle[simdId]);
+    computeUnit->lastExecCycle[simdId] =
+        computeUnit->stats.totalCycles.value();
 
     if (lastInstExec) {
-        computeUnit->instInterleave[simdId].
+        computeUnit->stats.instInterleave[simdId].
             sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
     }
     lastInstExec = computeUnit->instExecPerSimd[simdId];
@@ -987,8 +920,8 @@ Wavefront::exec()
                 if (ii->isSrcOperand(i)) {
                     // This check should never fail, but to be safe we check
                     if (rawDist.find(vgpr+n) != rawDist.end()) {
-                        vecRawDistance.
-                            sample(numInstrExecuted.value() - rawDist[vgpr+n]);
+                        stats.vecRawDistance.sample(
+                            stats.numInstrExecuted.value() - rawDist[vgpr+n]);
                     }
                     // increment number of reads to this register
                     vecReads[vgpr+n]++;
@@ -997,12 +930,12 @@ Wavefront::exec()
                     // for the first write to each physical register
                     if (rawDist.find(vgpr+n) != rawDist.end()) {
                         // sample the number of reads that were performed
-                        readsPerWrite.sample(vecReads[vgpr+n]);
+                        stats.readsPerWrite.sample(vecReads[vgpr+n]);
                     }
                     // on a write, reset count of reads to 0
                     vecReads[vgpr+n] = 0;
-                    rawDist[vgpr+n] = numInstrExecuted.value();
+                    rawDist[vgpr+n] = stats.numInstrExecuted.value();
                 }
             }
         }
@@ -1023,26 +956,29 @@
     if (computeUnit->shader->hsail_mode==Shader::SIMT) {
         const int num_active_lanes = execMask().count();
-        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
-        computeUnit->numVecOpsExecuted += num_active_lanes;
+        computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
+        computeUnit->stats.numVecOpsExecuted += num_active_lanes;
 
         if (ii->isF16() && ii->isALU()) {
             if (ii->isF32() || ii->isF64()) {
                 fatal("Instruction is tagged as both (1) F16, and (2)"
                       "either F32 or F64.");
             }
-            computeUnit->numVecOpsExecutedF16 += num_active_lanes;
+            computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
             if (ii->isFMA()) {
-                computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
             } else if (ii->isMAC()) {
-                computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            } else if (ii->isMAD()) {
-                computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
         }
         if (ii->isF32() && ii->isALU()) {
@@ -1050,18 +986,21 @@
                 fatal("Instruction is tagged as both (1) F32, and (2)"
                       "either F16 or F64.");
             }
-            computeUnit->numVecOpsExecutedF32 += num_active_lanes;
+            computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
             if (ii->isFMA()) {
-                computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            } else if (ii->isMAC()) {
-                computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            } else if (ii->isMAD()) {
-                computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
         }
         if (ii->isF64() && ii->isALU()) {
@@ -1069,24 +1008,29 @@
                 fatal("Instruction is tagged as both (1) F64, and (2)"
                       "either F16 or F32.");
             }
-            computeUnit->numVecOpsExecutedF64 += num_active_lanes;
+            computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
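+            // each FMA/MAC/MAD op below also counts toward the
+            // two-operand FP total (numVecOpsExecutedTwoOpFP)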
             if (ii->isFMA()) {
-                computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
             } else if (ii->isMAC()) {
-                computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
             } else if (ii->isMAD()) {
-                computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
             }
         }
         if (isGmInstruction(ii)) {
-            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
+            computeUnit->stats.activeLanesPerGMemInstrDist.sample(
+                num_active_lanes);
         } else if (isLmInstruction(ii)) {
-            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
+            computeUnit->stats.activeLanesPerLMemInstrDist.sample(
+                num_active_lanes);
         }
     }
 
@@ -1133,14 +1077,14 @@ Wavefront::exec()
             computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
         computeUnit->vectorGlobalMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesVMemPerSimd[simdId] +=
+        computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
             computeUnit->vrf_gm_bus_latency;
     } else {
         computeUnit->srfToScalarMemPipeBus.set(computeUnit->
             cyclesToTicks(computeUnit->srf_scm_bus_latency));
         computeUnit->scalarMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesScMemPerSimd[simdId] +=
+        computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
             computeUnit->srf_scm_bus_latency;
     }
     // GM or Flat as GM Store
@@ -1150,14 +1094,14 @@
             cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
         computeUnit->vectorGlobalMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesVMemPerSimd[simdId] +=
+        computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
             (2 * computeUnit->vrf_gm_bus_latency);
     } else {
         computeUnit->srfToScalarMemPipeBus.set(computeUnit->
             cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
         computeUnit->scalarMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesScMemPerSimd[simdId] +=
+        computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
             (2 * computeUnit->srf_scm_bus_latency);
     }
 } else if ((ii->isAtomic() || ii->isMemSync()) &&
@@ -1167,14 +1111,14 @@
             cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
         computeUnit->vectorGlobalMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesVMemPerSimd[simdId] +=
+        computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
             (2 * computeUnit->vrf_gm_bus_latency);
     } else {
         computeUnit->srfToScalarMemPipeBus.set(computeUnit->
             cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
         computeUnit->scalarMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesScMemPerSimd[simdId] +=
+        computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
             (2 * computeUnit->srf_scm_bus_latency);
     }
     // LM or Flat as LM Load
@@ -1183,7 +1127,7 @@
             cyclesToTicks(computeUnit->vrf_lm_bus_latency));
         computeUnit->vectorSharedMemUnit.
             set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesLdsPerSimd[simdId] +=
+        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
             computeUnit->vrf_lm_bus_latency;
     // LM or Flat as LM Store
     } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
         computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
             cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
         computeUnit->vectorSharedMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesLdsPerSimd[simdId] +=
+        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
             (2 * computeUnit->vrf_lm_bus_latency);
     // LM or Flat as LM, Atomic or MemFence
     } else if ((ii->isAtomic() || ii->isMemSync()) &&
@@ -1200,7 +1144,7 @@
             cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
         computeUnit->vectorSharedMemUnit.
             set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesLdsPerSimd[simdId] +=
+        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
             (2 * computeUnit->vrf_lm_bus_latency);
     } else {
         panic("Bad instruction type!\n");
@@ -1453,3 +1397,31 @@ Wavefront::releaseBarrier()
 {
     barId = WFBarrier::InvalidID;
 }
+
+Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent)
+    : Stats::Group(parent),
+      ADD_STAT(numInstrExecuted,
+               "number of instructions executed by this WF slot"),
+      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
+      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
+      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
+               "RF denied adding instruction"),
+      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
+               " not available"),
+      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
+               "RF reads to complete"),
+      ADD_STAT(schLdsArbStalls,
+               "number of cycles wave stalled due to LDS-VRF arbitration"),
+      // FIXME: the name of the WF needs to be unique
+      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
+               "instructions are blocked due to WAW or WAR dependencies"),
+      // FIXME: the name of the WF needs to be unique
+      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
+               "instructions are blocked due to RAW dependencies"),
+      ADD_STAT(vecRawDistance,
+               "Count of RAW distance in dynamic instructions for this WF"),
+      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
+{
+    vecRawDistance.init(0, 20, 1);
+    readsPerWrite.init(0, 4, 1);
+}
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 80fc3248a..7b617c63f 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -43,6 +43,8 @@
 
 #include "arch/gpu_isa.hh"
 #include "base/logging.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "base/types.hh"
 #include "config/the_gpu_isa.hh"
 #include "gpu-compute/compute_unit.hh"
@@ -217,52 +219,13 @@ class Wavefront : public SimObject
     // unique WF id over all WFs executed across all CUs
     uint64_t wfDynId;
 
-    // Wavefront slot stats
-
-    // Number of instructions executed by this wavefront slot across all
-    // dynamic wavefronts
-    Stats::Scalar numInstrExecuted;
-
-    // Number of cycles this WF spends in SCH stage
-    Stats::Scalar schCycles;
-
-    // Number of stall cycles encounterd by this WF in SCH stage
-    Stats::Scalar schStalls;
-
-    // The following stats sum to the value of schStalls, and record, per
-    // WF slot, what the cause of each stall was at a coarse granularity.
-
-    // Cycles WF is selected by scheduler, but RFs cannot support instruction
-    Stats::Scalar schRfAccessStalls;
-    // Cycles spent waiting for execution resources
-    Stats::Scalar schResourceStalls;
-    // cycles spent waiting for RF reads to complete in SCH stage
-    Stats::Scalar schOpdNrdyStalls;
-    // LDS arbitration stall cycles. WF attempts to execute LM instruction,
-    // but another wave is executing FLAT, which requires LM and GM and forces
-    // this WF to stall.
-    Stats::Scalar schLdsArbStalls;
-
-    // number of times an instruction of a WF is blocked from being issued
-    // due to WAR and WAW dependencies
-    Stats::Scalar numTimesBlockedDueWAXDependencies;
-    // number of times an instruction of a WF is blocked from being issued
-    // due to WAR and WAW dependencies
-    Stats::Scalar numTimesBlockedDueRAWDependencies;
-
     // dyn inst id (per SIMD) of last instruction exec from this wave
     uint64_t lastInstExec;
 
-    // Distribution to track the distance between producer and consumer
-    // for vector register values
-    Stats::Distribution vecRawDistance;
-
     // Map to track the dyn instruction id of each vector register value
     // produced, indexed by physical vector register ID
    std::unordered_map<int, uint64_t> rawDist;
 
-    // Distribution to track the number of times every vector register
-    // value produced is consumed.
-    Stats::Distribution readsPerWrite;
-
     // Counts the number of reads performed to each physical register
     // - counts are reset to 0 for each dynamic wavefront launched
    std::vector<int> vecReads;
@@ -289,7 +252,6 @@ class Wavefront : public SimObject
     // called by SCH stage to reserve
    std::vector<int> reserveResources();
     bool stopFetch();
-    void regStats();
 
     Addr pc() const;
     void pc(Addr new_pc);
@@ -357,6 +319,52 @@ class Wavefront : public SimObject
     Addr _pc;
     VectorMask _execMask;
     int barId;
+
+  public:
+    struct WavefrontStats : public Stats::Group
+    {
+        WavefrontStats(Stats::Group *parent);
+
+        // Number of instructions executed by this wavefront slot across all
+        // dynamic wavefronts
+        Stats::Scalar numInstrExecuted;
+
+        // Number of cycles this WF spends in SCH stage
+        Stats::Scalar schCycles;
+
+        // Number of stall cycles encountered by this WF in SCH stage
+        Stats::Scalar schStalls;
+
+        // The following stats sum to the value of schStalls, and record, per
+        // WF slot, what the cause of each stall was at a coarse granularity.
+
+        // Cycles WF is selected by scheduler, but RFs cannot support
+        // instruction
+        Stats::Scalar schRfAccessStalls;
+        // Cycles spent waiting for execution resources
+        Stats::Scalar schResourceStalls;
+        // cycles spent waiting for RF reads to complete in SCH stage
+        Stats::Scalar schOpdNrdyStalls;
+        // LDS arbitration stall cycles. WF attempts to execute LM instruction,
+        // but another wave is executing FLAT, which requires LM and GM and
+        // forces this WF to stall.
+        Stats::Scalar schLdsArbStalls;
+
+        // number of times an instruction of a WF is blocked from being issued
+        // due to WAR and WAW dependencies
+        Stats::Scalar numTimesBlockedDueWAXDependencies;
+        // number of times an instruction of a WF is blocked from being issued
+        // due to RAW dependencies
+        Stats::Scalar numTimesBlockedDueRAWDependencies;
+
+        // Distribution to track the distance between producer and consumer
+        // for vector register values
+        Stats::Distribution vecRawDistance;
+
+        // Distribution to track the number of times every vector register
+        // value produced is consumed.
+        Stats::Distribution readsPerWrite;
+    } stats;
 };
 
 #endif // __GPU_COMPUTE_WAVEFRONT_HH__
-- 
2.30.2