Convert all gpu-compute stats to Stats::Group style.

Stats that were previously declared as loose members and registered by
hand in regStats() overrides are now collected into per-object
Stats::Group structs (e.g. ComputeUnit::ComputeUnitStats) whose
constructors register each stat via ADD_STAT. The regStats() overrides
and their .name()/.desc() boilerplate are removed; init() calls,
subnames, and formula definitions move into the group constructors.
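For reference, the shape of the conversion (a minimal sketch built
around a hypothetical Widget SimObject; WidgetParams, doWork(), and the
stat names are illustrative, not code from this change):

    #include "base/statistics.hh"
    #include "base/stats/group.hh"
    #include "sim/sim_object.hh"

    class Widget : public SimObject
    {
      public:
        // WidgetParams stands in for the object's generated params type.
        Widget(const WidgetParams &p) : SimObject(p), stats(this) {}

        void
        doWork()
        {
            // Call sites reach the stats through the group member.
            stats.opsCompleted++;
        }

      protected:
        struct WidgetStats : public Stats::Group
        {
            WidgetStats(Stats::Group *parent)
                : Stats::Group(parent),
                  ADD_STAT(opsCompleted, "number of operations completed"),
                  ADD_STAT(activeCycles, "cycles with at least one op"),
                  ADD_STAT(opsPerCycle, "avg. ops per active cycle")
            {
                // Work formerly done in regStats() (init(), subname(),
                // formula wiring) moves into the constructor body.
                opsPerCycle = opsCompleted / activeCycles;
            }

            Stats::Scalar opsCompleted;
            Stats::Scalar activeCycles;
            Stats::Formula opsPerCycle;
        } stats;
    };

ADD_STAT derives each stat's name from the member identifier within the
parent group's scope, which is why the explicit .name() calls go away.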
Change-Id: I29116f1de53ae379210c6cfb5bed3fc74f50cca5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/39135
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matthew Poremba <matthew.poremba@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
wf->computeUnit->cu_id, wf->wgId, refCount);
wf->computeUnit->registerManager->freeRegisters(wf);
- wf->computeUnit->completedWfs++;
+ wf->computeUnit->stats.completedWfs++;
wf->computeUnit->activeWaves--;
panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
for (int i = 0; i < wf->vecReads.size(); i++) {
if (wf->rawDist.find(i) != wf->rawDist.end()) {
- wf->readsPerWrite.sample(wf->vecReads.at(i));
+ wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
}
}
wf->vecReads.clear();
if (!kernelEnd || !relNeeded) {
wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
wf->setStatus(Wavefront::S_STOPPED);
- wf->computeUnit->completedWGs++;
+ wf->computeUnit->stats.completedWGs++;
return;
}
// call shader to prepare the flush operations
wf->computeUnit->shader->prepareFlush(gpuDynInst);
- wf->computeUnit->completedWGs++;
+ wf->computeUnit->stats.completedWGs++;
} else {
wf->computeUnit->shader->dispatcher().scheduleDispatch();
}
_numBarrierSlots(p.num_barrier_slots),
globalSeqNum(0), wavefrontSize(p.wf_size),
scoreboardCheckToSchedule(p),
- scheduleToExecute(p)
+ scheduleToExecute(p),
+ stats(this, p.n_wf)
{
/**
* This check is necessary because std::bitset only provides conversion
w->initRegState(task, w->actualWgSzTotal);
w->start(_n_wave++, task->codeAddr());
- waveLevelParallelism.sample(activeWaves);
+ stats.waveLevelParallelism.sample(activeWaves);
activeWaves++;
}
freeWfSlots, numMappedWfs, vregAvail, sregAvail);
if (!vregAvail) {
- ++numTimesWgBlockedDueVgprAlloc;
+ ++stats.numTimesWgBlockedDueVgprAlloc;
}
if (!sregAvail) {
- ++numTimesWgBlockedDueSgprAlloc;
+ ++stats.numTimesWgBlockedDueSgprAlloc;
}
// Return true if enough WF slots to submit workgroup and if there are
// enough VGPRs to schedule all WFs to their SIMD units
bool ldsAvail = lds.canReserve(task->ldsSize());
if (!ldsAvail) {
- wgBlockedDueLdsAllocation++;
+ stats.wgBlockedDueLdsAllocation++;
}
if (!barrier_avail) {
- wgBlockedDueBarrierAllocation++;
+ stats.wgBlockedDueBarrierAllocation++;
}
// Return true if the following are all true:
scoreboardCheckStage.exec();
fetchStage.exec();
- totalCycles++;
+ stats.totalCycles++;
// Put this CU to sleep if there is no more work to be done.
if (!isDone()) {
fatal("pkt is not a read nor a write\n");
}
- tlbCycles -= curTick();
- ++tlbRequests;
+ stats.tlbCycles -= curTick();
+ ++stats.tlbRequests;
PortID tlbPort_index = perLaneTLB ? index : 0;
// update the hitLevel distribution
int hit_level = translation_state->hitLevel;
assert(hit_level != -1);
- hitsPerTLBLevel[hit_level]++;
+ stats.hitsPerTLBLevel[hit_level]++;
// New SenderState for the memory access
X86ISA::GpuTLB::TranslationState *sender_state =
// for the first cache block.
if (compute_unit->headTailMap.count(gpuDynInst)) {
Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
- compute_unit->headTailLatency.sample(curTick() - headTick);
+ compute_unit->stats.headTailLatency.sample(curTick() - headTick);
compute_unit->headTailMap.erase(gpuDynInst);
}
pkt->req->getVaddr(), line);
assert(pkt->senderState);
- computeUnit->tlbCycles += curTick();
+ computeUnit->stats.tlbCycles += curTick();
// pop off the TLB translation state
X86ISA::GpuTLB::TranslationState *translation_state =
// update the hitLevel distribution
int hit_level = translation_state->hitLevel;
- computeUnit->hitsPerTLBLevel[hit_level]++;
+ computeUnit->stats.hitsPerTLBLevel[hit_level]++;
delete translation_state->tlbEntry;
assert(!translation_state->ports.size());
}
}
-void
-ComputeUnit::regStats()
-{
- ClockedObject::regStats();
-
- vALUInsts
- .name(name() + ".valu_insts")
- .desc("Number of vector ALU insts issued.")
- ;
- vALUInstsPerWF
- .name(name() + ".valu_insts_per_wf")
- .desc("The avg. number of vector ALU insts issued per-wavefront.")
- ;
- sALUInsts
- .name(name() + ".salu_insts")
- .desc("Number of scalar ALU insts issued.")
- ;
- sALUInstsPerWF
- .name(name() + ".salu_insts_per_wf")
- .desc("The avg. number of scalar ALU insts issued per-wavefront.")
- ;
- instCyclesVALU
- .name(name() + ".inst_cycles_valu")
- .desc("Number of cycles needed to execute VALU insts.")
- ;
- instCyclesSALU
- .name(name() + ".inst_cycles_salu")
- .desc("Number of cycles needed to execute SALU insts.")
- ;
- threadCyclesVALU
- .name(name() + ".thread_cycles_valu")
- .desc("Number of thread cycles used to execute vector ALU ops. "
- "Similar to instCyclesVALU but multiplied by the number of "
- "active threads.")
- ;
- vALUUtilization
- .name(name() + ".valu_utilization")
- .desc("Percentage of active vector ALU threads in a wave.")
- ;
- ldsNoFlatInsts
- .name(name() + ".lds_no_flat_insts")
- .desc("Number of LDS insts issued, not including FLAT "
- "accesses that resolve to LDS.")
- ;
- ldsNoFlatInstsPerWF
- .name(name() + ".lds_no_flat_insts_per_wf")
- .desc("The avg. number of LDS insts (not including FLAT "
- "accesses that resolve to LDS) per-wavefront.")
- ;
- flatVMemInsts
- .name(name() + ".flat_vmem_insts")
- .desc("The number of FLAT insts that resolve to vmem issued.")
- ;
- flatVMemInstsPerWF
- .name(name() + ".flat_vmem_insts_per_wf")
- .desc("The average number of FLAT insts that resolve to vmem "
- "issued per-wavefront.")
- ;
- flatLDSInsts
- .name(name() + ".flat_lds_insts")
- .desc("The number of FLAT insts that resolve to LDS issued.")
- ;
- flatLDSInstsPerWF
- .name(name() + ".flat_lds_insts_per_wf")
- .desc("The average number of FLAT insts that resolve to LDS "
- "issued per-wavefront.")
- ;
- vectorMemWrites
- .name(name() + ".vector_mem_writes")
- .desc("Number of vector mem write insts (excluding FLAT insts).")
- ;
- vectorMemWritesPerWF
- .name(name() + ".vector_mem_writes_per_wf")
- .desc("The average number of vector mem write insts "
- "(excluding FLAT insts) per-wavefront.")
- ;
- vectorMemReads
- .name(name() + ".vector_mem_reads")
- .desc("Number of vector mem read insts (excluding FLAT insts).")
- ;
- vectorMemReadsPerWF
- .name(name() + ".vector_mem_reads_per_wf")
- .desc("The avg. number of vector mem read insts (excluding "
- "FLAT insts) per-wavefront.")
- ;
- scalarMemWrites
- .name(name() + ".scalar_mem_writes")
- .desc("Number of scalar mem write insts.")
- ;
- scalarMemWritesPerWF
- .name(name() + ".scalar_mem_writes_per_wf")
- .desc("The average number of scalar mem write insts per-wavefront.")
- ;
- scalarMemReads
- .name(name() + ".scalar_mem_reads")
- .desc("Number of scalar mem read insts.")
- ;
- scalarMemReadsPerWF
- .name(name() + ".scalar_mem_reads_per_wf")
- .desc("The average number of scalar mem read insts per-wavefront.")
- ;
-
- vALUInstsPerWF = vALUInsts / completedWfs;
- sALUInstsPerWF = sALUInsts / completedWfs;
- vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
- ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
- flatVMemInstsPerWF = flatVMemInsts / completedWfs;
- flatLDSInstsPerWF = flatLDSInsts / completedWfs;
- vectorMemWritesPerWF = vectorMemWrites / completedWfs;
- vectorMemReadsPerWF = vectorMemReads / completedWfs;
- scalarMemWritesPerWF = scalarMemWrites / completedWfs;
- scalarMemReadsPerWF = scalarMemReads / completedWfs;
-
- vectorMemReadsPerKiloInst
- .name(name() + ".vector_mem_reads_per_kilo_inst")
- .desc("Number of vector mem reads per kilo-instruction")
- ;
- vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
- vectorMemWritesPerKiloInst
- .name(name() + ".vector_mem_writes_per_kilo_inst")
- .desc("Number of vector mem writes per kilo-instruction")
- ;
- vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
- vectorMemInstsPerKiloInst
- .name(name() + ".vector_mem_insts_per_kilo_inst")
- .desc("Number of vector mem insts per kilo-instruction")
- ;
- vectorMemInstsPerKiloInst =
- ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
- scalarMemReadsPerKiloInst
- .name(name() + ".scalar_mem_reads_per_kilo_inst")
- .desc("Number of scalar mem reads per kilo-instruction")
- ;
- scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
- scalarMemWritesPerKiloInst
- .name(name() + ".scalar_mem_writes_per_kilo_inst")
- .desc("Number of scalar mem writes per kilo-instruction")
- ;
- scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
- scalarMemInstsPerKiloInst
- .name(name() + ".scalar_mem_insts_per_kilo_inst")
- .desc("Number of scalar mem insts per kilo-instruction")
- ;
- scalarMemInstsPerKiloInst =
- ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
-
- instCyclesVMemPerSimd
- .init(numVectorALUs)
- .name(name() + ".inst_cycles_vector_memory")
- .desc("Number of cycles to send address, command, data from VRF to "
- "vector memory unit, per SIMD")
- ;
-
- instCyclesScMemPerSimd
- .init(numVectorALUs)
- .name(name() + ".inst_cycles_scalar_memory")
- .desc("Number of cycles to send address, command, data from SRF to "
- "scalar memory unit, per SIMD")
- ;
-
- instCyclesLdsPerSimd
- .init(numVectorALUs)
- .name(name() + ".inst_cycles_lds")
- .desc("Number of cycles to send address, command, data from VRF to "
- "LDS unit, per SIMD")
- ;
-
- globalReads
- .name(name() + ".global_mem_reads")
- .desc("Number of reads to the global segment")
- ;
- globalWrites
- .name(name() + ".global_mem_writes")
- .desc("Number of writes to the global segment")
- ;
- globalMemInsts
- .name(name() + ".global_mem_insts")
- .desc("Number of memory instructions sent to the global segment")
- ;
- globalMemInsts = globalReads + globalWrites;
- argReads
- .name(name() + ".arg_reads")
- .desc("Number of reads to the arg segment")
- ;
- argWrites
- .name(name() + ".arg_writes")
- .desc("NUmber of writes to the arg segment")
- ;
- argMemInsts
- .name(name() + ".arg_mem_insts")
- .desc("Number of memory instructions sent to the arg segment")
- ;
- argMemInsts = argReads + argWrites;
- spillReads
- .name(name() + ".spill_reads")
- .desc("Number of reads to the spill segment")
- ;
- spillWrites
- .name(name() + ".spill_writes")
- .desc("Number of writes to the spill segment")
- ;
- spillMemInsts
- .name(name() + ".spill_mem_insts")
- .desc("Number of memory instructions sent to the spill segment")
- ;
- spillMemInsts = spillReads + spillWrites;
- groupReads
- .name(name() + ".group_reads")
- .desc("Number of reads to the group segment")
- ;
- groupWrites
- .name(name() + ".group_writes")
- .desc("Number of writes to the group segment")
- ;
- groupMemInsts
- .name(name() + ".group_mem_insts")
- .desc("Number of memory instructions sent to the group segment")
- ;
- groupMemInsts = groupReads + groupWrites;
- privReads
- .name(name() + ".private_reads")
- .desc("Number of reads to the private segment")
- ;
- privWrites
- .name(name() + ".private_writes")
- .desc("Number of writes to the private segment")
- ;
- privMemInsts
- .name(name() + ".private_mem_insts")
- .desc("Number of memory instructions sent to the private segment")
- ;
- privMemInsts = privReads + privWrites;
- readonlyReads
- .name(name() + ".readonly_reads")
- .desc("Number of reads to the readonly segment")
- ;
- readonlyWrites
- .name(name() + ".readonly_writes")
- .desc("Number of memory instructions sent to the readonly segment")
- ;
- readonlyMemInsts
- .name(name() + ".readonly_mem_insts")
- .desc("Number of memory instructions sent to the readonly segment")
- ;
- readonlyMemInsts = readonlyReads + readonlyWrites;
- kernargReads
- .name(name() + ".kernarg_reads")
- .desc("Number of reads sent to the kernarg segment")
- ;
- kernargWrites
- .name(name() + ".kernarg_writes")
- .desc("Number of memory instructions sent to the kernarg segment")
- ;
- kernargMemInsts
- .name(name() + ".kernarg_mem_insts")
- .desc("Number of memory instructions sent to the kernarg segment")
- ;
- kernargMemInsts = kernargReads + kernargWrites;
-
- tlbCycles
- .name(name() + ".tlb_cycles")
- .desc("total number of cycles for all uncoalesced requests")
- ;
-
- tlbRequests
- .name(name() + ".tlb_requests")
- .desc("number of uncoalesced requests")
- ;
-
- tlbLatency
- .name(name() + ".avg_translation_latency")
- .desc("Avg. translation latency for data translations")
- ;
-
- tlbLatency = tlbCycles / tlbRequests;
-
- hitsPerTLBLevel
- .init(4)
- .name(name() + ".TLB_hits_distribution")
- .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
- ;
-
- // fixed number of TLB levels
- for (int i = 0; i < 4; ++i) {
- if (!i)
- hitsPerTLBLevel.subname(i,"page_table");
- else
- hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
- }
-
- execRateDist
- .init(0, 10, 2)
- .name(name() + ".inst_exec_rate")
- .desc("Instruction Execution Rate: Number of executed vector "
- "instructions per cycle")
- ;
-
- ldsBankConflictDist
- .init(0, wfSize(), 2)
- .name(name() + ".lds_bank_conflicts")
- .desc("Number of bank conflicts per LDS memory packet")
- ;
-
- ldsBankAccesses
- .name(name() + ".lds_bank_access_cnt")
- .desc("Total number of LDS bank accesses")
- ;
-
- pageDivergenceDist
- // A wavefront can touch up to N pages per memory instruction where
- // N is equal to the wavefront size
- // The number of pages per bin can be configured (here it's 4).
- .init(1, wfSize(), 4)
- .name(name() + ".page_divergence_dist")
- .desc("pages touched per wf (over all mem. instr.)")
- ;
-
- controlFlowDivergenceDist
- .init(1, wfSize(), 4)
- .name(name() + ".warp_execution_dist")
- .desc("number of lanes active per instruction (oval all instructions)")
- ;
-
- activeLanesPerGMemInstrDist
- .init(1, wfSize(), 4)
- .name(name() + ".gmem_lanes_execution_dist")
- .desc("number of active lanes per global memory instruction")
- ;
-
- activeLanesPerLMemInstrDist
- .init(1, wfSize(), 4)
- .name(name() + ".lmem_lanes_execution_dist")
- .desc("number of active lanes per local memory instruction")
- ;
-
- numInstrExecuted
- .name(name() + ".num_instr_executed")
- .desc("number of instructions executed")
- ;
-
- numVecOpsExecuted
- .name(name() + ".num_vec_ops_executed")
- .desc("number of vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedF16
- .name(name() + ".num_vec_ops_f16_executed")
- .desc("number of f16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedF32
- .name(name() + ".num_vec_ops_f32_executed")
- .desc("number of f32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedF64
- .name(name() + ".num_vec_ops_f64_executed")
- .desc("number of f64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedFMA16
- .name(name() + ".num_vec_ops_fma16_executed")
- .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedFMA32
- .name(name() + ".num_vec_ops_fma32_executed")
- .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedFMA64
- .name(name() + ".num_vec_ops_fma64_executed")
- .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAD16
- .name(name() + ".num_vec_ops_mad16_executed")
- .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAD32
- .name(name() + ".num_vec_ops_mad32_executed")
- .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAD64
- .name(name() + ".num_vec_ops_mad64_executed")
- .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAC16
- .name(name() + ".num_vec_ops_mac16_executed")
- .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAC32
- .name(name() + ".num_vec_ops_mac32_executed")
- .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAC64
- .name(name() + ".num_vec_ops_mac64_executed")
- .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedTwoOpFP
- .name(name() + ".num_vec_ops_two_op_fp_executed")
- .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
- ;
-
- totalCycles
- .name(name() + ".num_total_cycles")
- .desc("number of cycles the CU ran for")
- ;
-
- ipc
- .name(name() + ".ipc")
- .desc("Instructions per cycle (this CU only)")
- ;
-
- vpc
- .name(name() + ".vpc")
- .desc("Vector Operations per cycle (this CU only)")
- ;
-
- vpc_f16
- .name(name() + ".vpc_f16")
- .desc("F16 Vector Operations per cycle (this CU only)")
- ;
-
- vpc_f32
- .name(name() + ".vpc_f32")
- .desc("F32 Vector Operations per cycle (this CU only)")
- ;
-
- vpc_f64
- .name(name() + ".vpc_f64")
- .desc("F64 Vector Operations per cycle (this CU only)")
- ;
-
- numALUInstsExecuted
- .name(name() + ".num_alu_insts_executed")
- .desc("Number of dynamic non-GM memory insts executed")
- ;
-
- wgBlockedDueBarrierAllocation
- .name(name() + ".wg_blocked_due_barrier_alloc")
- .desc("WG dispatch was blocked due to lack of barrier resources")
- ;
-
- wgBlockedDueLdsAllocation
- .name(name() + ".wg_blocked_due_lds_alloc")
- .desc("Workgroup blocked due to LDS capacity")
- ;
-
- ipc = numInstrExecuted / totalCycles;
- vpc = numVecOpsExecuted / totalCycles;
- vpc_f16 = numVecOpsExecutedF16 / totalCycles;
- vpc_f32 = numVecOpsExecutedF32 / totalCycles;
- vpc_f64 = numVecOpsExecutedF64 / totalCycles;
-
- numTimesWgBlockedDueVgprAlloc
- .name(name() + ".times_wg_blocked_due_vgpr_alloc")
- .desc("Number of times WGs are blocked due to VGPR allocation per "
- "SIMD")
- ;
-
- numTimesWgBlockedDueSgprAlloc
- .name(name() + ".times_wg_blocked_due_sgpr_alloc")
- .desc("Number of times WGs are blocked due to SGPR allocation per "
- "SIMD")
- ;
-
- dynamicGMemInstrCnt
- .name(name() + ".global_mem_instr_cnt")
- .desc("dynamic non-flat global memory instruction count")
- ;
-
- dynamicFlatMemInstrCnt
- .name(name() + ".flat_global_mem_instr_cnt")
- .desc("dynamic flat global memory instruction count")
- ;
-
- dynamicLMemInstrCnt
- .name(name() + ".local_mem_instr_cnt")
- .desc("dynamic local memory intruction count")
- ;
-
- numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
- dynamicLMemInstrCnt;
-
- completedWfs
- .name(name() + ".num_completed_wfs")
- .desc("number of completed wavefronts")
- ;
-
- completedWGs
- .name(name() + ".num_completed_wgs")
- .desc("number of completed workgroups")
- ;
-
- numCASOps
- .name(name() + ".num_CAS_ops")
- .desc("number of compare and swap operations")
- ;
-
- numFailedCASOps
- .name(name() + ".num_failed_CAS_ops")
- .desc("number of compare and swap operations that failed")
- ;
-
- headTailLatency
- .init(0, 1000000, 10000)
- .name(name() + ".head_tail_latency")
- .desc("ticks between first and last cache block arrival at coalescer")
- .flags(Stats::pdf | Stats::oneline)
- ;
-
- waveLevelParallelism
- .init(0, shader->n_wf * numVectorALUs, 1)
- .name(name() + ".wlp")
- .desc("wave level parallelism: count of active waves at wave launch")
- ;
-
- instInterleave
- .init(numVectorALUs, 0, 20, 1)
- .name(name() + ".interleaving")
- .desc("Measure of instruction interleaving per SIMD")
- ;
-
- // register stats of pipeline stages
- fetchStage.regStats();
- scoreboardCheckStage.regStats();
- scheduleStage.regStats();
- execStage.regStats();
-
- // register stats of memory pipelines
- globalMemoryPipe.regStats();
- localMemoryPipe.regStats();
- scalarMemoryPipe.regStats();
-
- registerManager->regStats();
-}
-
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->isScalar()) {
if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
- sALUInsts++;
- instCyclesSALU++;
+ stats.sALUInsts++;
+ stats.instCyclesSALU++;
} else if (gpuDynInst->isLoad()) {
- scalarMemReads++;
+ stats.scalarMemReads++;
} else if (gpuDynInst->isStore()) {
- scalarMemWrites++;
+ stats.scalarMemWrites++;
}
} else {
if (gpuDynInst->isALU()) {
if (shader->total_valu_insts == shader->max_valu_insts) {
exitSimLoop("max vALU insts");
}
- vALUInsts++;
- instCyclesVALU++;
- threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
+ stats.vALUInsts++;
+ stats.instCyclesVALU++;
+ stats.threadCyclesVALU
+ += gpuDynInst->wavefront()->execMask().count();
} else if (gpuDynInst->isFlat()) {
if (gpuDynInst->isLocalMem()) {
- flatLDSInsts++;
+ stats.flatLDSInsts++;
} else {
- flatVMemInsts++;
+ stats.flatVMemInsts++;
}
} else if (gpuDynInst->isLocalMem()) {
- ldsNoFlatInsts++;
+ stats.ldsNoFlatInsts++;
} else if (gpuDynInst->isLoad()) {
- vectorMemReads++;
+ stats.vectorMemReads++;
} else if (gpuDynInst->isStore()) {
- vectorMemWrites++;
+ stats.vectorMemWrites++;
}
if (gpuDynInst->isLoad()) {
switch (gpuDynInst->executedAs()) {
case Enums::SC_SPILL:
- spillReads++;
+ stats.spillReads++;
break;
case Enums::SC_GLOBAL:
- globalReads++;
+ stats.globalReads++;
break;
case Enums::SC_GROUP:
- groupReads++;
+ stats.groupReads++;
break;
case Enums::SC_PRIVATE:
- privReads++;
+ stats.privReads++;
break;
case Enums::SC_READONLY:
- readonlyReads++;
+ stats.readonlyReads++;
break;
case Enums::SC_KERNARG:
- kernargReads++;
+ stats.kernargReads++;
break;
case Enums::SC_ARG:
- argReads++;
+ stats.argReads++;
break;
case Enums::SC_NONE:
/**
} else if (gpuDynInst->isStore()) {
switch (gpuDynInst->executedAs()) {
case Enums::SC_SPILL:
- spillWrites++;
+ stats.spillWrites++;
break;
case Enums::SC_GLOBAL:
- globalWrites++;
+ stats.globalWrites++;
break;
case Enums::SC_GROUP:
- groupWrites++;
+ stats.groupWrites++;
break;
case Enums::SC_PRIVATE:
- privWrites++;
+ stats.privWrites++;
break;
case Enums::SC_READONLY:
- readonlyWrites++;
+ stats.readonlyWrites++;
break;
case Enums::SC_KERNARG:
- kernargWrites++;
+ stats.kernargWrites++;
break;
case Enums::SC_ARG:
- argWrites++;
+ stats.argWrites++;
break;
case Enums::SC_NONE:
/**
}
}
}
+
+ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
+ : Stats::Group(parent),
+ ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
+ ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
+ "per-wavefront."),
+ ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
+ ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
+ "per-wavefront."),
+ ADD_STAT(instCyclesVALU,
+ "Number of cycles needed to execute VALU insts."),
+ ADD_STAT(instCyclesSALU,
+ "Number of cycles needed to execute SALU insts."),
+ ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
+ "vector ALU ops. Similar to instCyclesVALU but multiplied by "
+ "the number of active threads."),
+ ADD_STAT(vALUUtilization,
+ "Percentage of active vector ALU threads in a wave."),
+ ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
+ " accesses that resolve to LDS."),
+ ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
+ "including FLAT accesses that resolve to LDS) per-wavefront."),
+ ADD_STAT(flatVMemInsts,
+ "The number of FLAT insts that resolve to vmem issued."),
+ ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
+ "resolve to vmem issued per-wavefront."),
+ ADD_STAT(flatLDSInsts,
+ "The number of FLAT insts that resolve to LDS issued."),
+ ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
+ "resolve to LDS issued per-wavefront."),
+ ADD_STAT(vectorMemWrites,
+ "Number of vector mem write insts (excluding FLAT insts)."),
+ ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
+ "insts (excluding FLAT insts) per-wavefront."),
+ ADD_STAT(vectorMemReads,
+ "Number of vector mem read insts (excluding FLAT insts)."),
+ ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
+ "(excluding FLAT insts) per-wavefront."),
+ ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
+ ADD_STAT(scalarMemWritesPerWF,
+ "The average number of scalar mem write insts per-wavefront."),
+ ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
+ ADD_STAT(scalarMemReadsPerWF,
+ "The average number of scalar mem read insts per-wavefront."),
+ ADD_STAT(vectorMemReadsPerKiloInst,
+ "Number of vector mem reads per kilo-instruction"),
+ ADD_STAT(vectorMemWritesPerKiloInst,
+ "Number of vector mem writes per kilo-instruction"),
+ ADD_STAT(vectorMemInstsPerKiloInst,
+ "Number of vector mem insts per kilo-instruction"),
+ ADD_STAT(scalarMemReadsPerKiloInst,
+ "Number of scalar mem reads per kilo-instruction"),
+ ADD_STAT(scalarMemWritesPerKiloInst,
+ "Number of scalar mem writes per kilo-instruction"),
+ ADD_STAT(scalarMemInstsPerKiloInst,
+ "Number of scalar mem insts per kilo-instruction"),
+ ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
+ "command, data from VRF to vector memory unit, per SIMD"),
+ ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
+ "command, data from SRF to scalar memory unit, per SIMD"),
+ ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
+ "command, data from VRF to LDS unit, per SIMD"),
+ ADD_STAT(globalReads, "Number of reads to the global segment"),
+ ADD_STAT(globalWrites, "Number of writes to the global segment"),
+ ADD_STAT(globalMemInsts,
+ "Number of memory instructions sent to the global segment"),
+ ADD_STAT(argReads, "Number of reads to the arg segment"),
+    ADD_STAT(argWrites, "Number of writes to the arg segment"),
+ ADD_STAT(argMemInsts,
+ "Number of memory instructions sent to the arg segment"),
+ ADD_STAT(spillReads, "Number of reads to the spill segment"),
+ ADD_STAT(spillWrites, "Number of writes to the spill segment"),
+ ADD_STAT(spillMemInsts,
+ "Number of memory instructions sent to the spill segment"),
+ ADD_STAT(groupReads, "Number of reads to the group segment"),
+ ADD_STAT(groupWrites, "Number of writes to the group segment"),
+ ADD_STAT(groupMemInsts,
+ "Number of memory instructions sent to the group segment"),
+ ADD_STAT(privReads, "Number of reads to the private segment"),
+ ADD_STAT(privWrites, "Number of writes to the private segment"),
+ ADD_STAT(privMemInsts,
+ "Number of memory instructions sent to the private segment"),
+ ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
+    ADD_STAT(readonlyWrites, "Number of writes to the readonly segment"),
+ ADD_STAT(readonlyMemInsts,
+ "Number of memory instructions sent to the readonly segment"),
+    ADD_STAT(kernargReads, "Number of reads to the kernarg segment"),
+    ADD_STAT(kernargWrites, "Number of writes to the kernarg segment"),
+ ADD_STAT(kernargMemInsts,
+ "Number of memory instructions sent to the kernarg segment"),
+ ADD_STAT(waveLevelParallelism,
+ "wave level parallelism: count of active waves at wave launch"),
+ ADD_STAT(tlbRequests, "number of uncoalesced requests"),
+ ADD_STAT(tlbCycles,
+ "total number of cycles for all uncoalesced requests"),
+ ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
+ ADD_STAT(hitsPerTLBLevel,
+ "TLB hits distribution (0 for page table, x for Lx-TLB)"),
+ ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
+ ADD_STAT(ldsBankConflictDist,
+ "Number of bank conflicts per LDS memory packet"),
+ ADD_STAT(pageDivergenceDist,
+ "pages touched per wf (over all mem. instr.)"),
+ ADD_STAT(dynamicGMemInstrCnt,
+ "dynamic non-flat global memory instruction count"),
+ ADD_STAT(dynamicFlatMemInstrCnt,
+ "dynamic flat global memory instruction count"),
+    ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
+ ADD_STAT(wgBlockedDueBarrierAllocation,
+ "WG dispatch was blocked due to lack of barrier resources"),
+ ADD_STAT(wgBlockedDueLdsAllocation,
+ "Workgroup blocked due to LDS capacity"),
+ ADD_STAT(numInstrExecuted, "number of instructions executed"),
+ ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
+ "vector instructions per cycle"),
+ ADD_STAT(numVecOpsExecuted,
+ "number of vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedF16,
+ "number of f16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedF32,
+ "number of f32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedF64,
+ "number of f64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedFMA16,
+ "number of fma16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedFMA32,
+ "number of fma32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedFMA64,
+ "number of fma64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAC16,
+ "number of mac16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAC32,
+ "number of mac32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAC64,
+ "number of mac64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAD16,
+ "number of mad16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAD32,
+ "number of mad32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAD64,
+ "number of mad64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedTwoOpFP,
+ "number of two op FP vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(totalCycles, "number of cycles the CU ran for"),
+ ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
+ ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
+ ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
+ ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
+ ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
+ ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
+ "instruction (over all instructions)"),
+ ADD_STAT(activeLanesPerGMemInstrDist,
+ "number of active lanes per global memory instruction"),
+ ADD_STAT(activeLanesPerLMemInstrDist,
+ "number of active lanes per local memory instruction"),
+ ADD_STAT(numALUInstsExecuted,
+ "Number of dynamic non-GM memory insts executed"),
+ ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
+ "blocked due to VGPR allocation per SIMD"),
+ ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
+ "blocked due to SGPR allocation per SIMD"),
+ ADD_STAT(numCASOps, "number of compare and swap operations"),
+ ADD_STAT(numFailedCASOps,
+ "number of compare and swap operations that failed"),
+ ADD_STAT(completedWfs, "number of completed wavefronts"),
+ ADD_STAT(completedWGs, "number of completed workgroups"),
+ ADD_STAT(headTailLatency, "ticks between first and last cache block "
+ "arrival at coalescer"),
+ ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
+{
+ ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
+
+ instCyclesVMemPerSimd.init(cu->numVectorALUs);
+ instCyclesScMemPerSimd.init(cu->numVectorALUs);
+ instCyclesLdsPerSimd.init(cu->numVectorALUs);
+
+ hitsPerTLBLevel.init(4);
+ execRateDist.init(0, 10, 2);
+ ldsBankConflictDist.init(0, cu->wfSize(), 2);
+
+ pageDivergenceDist.init(1, cu->wfSize(), 4);
+ controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
+ activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
+ activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
+
+ headTailLatency.init(0, 1000000, 10000).flags(Stats::pdf | Stats::oneline);
+ waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
+ instInterleave.init(cu->numVectorALUs, 0, 20, 1);
+
+ vALUInstsPerWF = vALUInsts / completedWfs;
+ sALUInstsPerWF = sALUInsts / completedWfs;
+ vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
+ ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
+ flatVMemInstsPerWF = flatVMemInsts / completedWfs;
+ flatLDSInstsPerWF = flatLDSInsts / completedWfs;
+ vectorMemWritesPerWF = vectorMemWrites / completedWfs;
+ vectorMemReadsPerWF = vectorMemReads / completedWfs;
+ scalarMemWritesPerWF = scalarMemWrites / completedWfs;
+ scalarMemReadsPerWF = scalarMemReads / completedWfs;
+
+ vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
+ vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
+ vectorMemInstsPerKiloInst =
+ ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
+ scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
+ scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
+ scalarMemInstsPerKiloInst =
+ ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
+
+ globalMemInsts = globalReads + globalWrites;
+ argMemInsts = argReads + argWrites;
+ spillMemInsts = spillReads + spillWrites;
+ groupMemInsts = groupReads + groupWrites;
+ privMemInsts = privReads + privWrites;
+ readonlyMemInsts = readonlyReads + readonlyWrites;
+ kernargMemInsts = kernargReads + kernargWrites;
+
+ tlbLatency = tlbCycles / tlbRequests;
+
+ // fixed number of TLB levels
+ for (int i = 0; i < 4; ++i) {
+ if (!i)
+ hitsPerTLBLevel.subname(i,"page_table");
+            hitsPerTLBLevel.subname(i, "page_table");
+        else
+            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
+
+ ipc = numInstrExecuted / totalCycles;
+ vpc = numVecOpsExecuted / totalCycles;
+ vpc_f16 = numVecOpsExecutedF16 / totalCycles;
+ vpc_f32 = numVecOpsExecutedF32 / totalCycles;
+ vpc_f64 = numVecOpsExecutedF64 / totalCycles;
+
+ numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
+ dynamicLMemInstrCnt;
+}
#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
- // Track the amount of interleaving between wavefronts on each SIMD.
- // This stat is sampled using instExecPerSimd to compute the number of
- // instructions that have been executed on a SIMD between a WF executing
- // two successive instructions.
- Stats::VectorDistribution instInterleave;
-
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
LdsState &lds;
public:
- Stats::Scalar vALUInsts;
- Stats::Formula vALUInstsPerWF;
- Stats::Scalar sALUInsts;
- Stats::Formula sALUInstsPerWF;
- Stats::Scalar instCyclesVALU;
- Stats::Scalar instCyclesSALU;
- Stats::Scalar threadCyclesVALU;
- Stats::Formula vALUUtilization;
- Stats::Scalar ldsNoFlatInsts;
- Stats::Formula ldsNoFlatInstsPerWF;
- Stats::Scalar flatVMemInsts;
- Stats::Formula flatVMemInstsPerWF;
- Stats::Scalar flatLDSInsts;
- Stats::Formula flatLDSInstsPerWF;
- Stats::Scalar vectorMemWrites;
- Stats::Formula vectorMemWritesPerWF;
- Stats::Scalar vectorMemReads;
- Stats::Formula vectorMemReadsPerWF;
- Stats::Scalar scalarMemWrites;
- Stats::Formula scalarMemWritesPerWF;
- Stats::Scalar scalarMemReads;
- Stats::Formula scalarMemReadsPerWF;
-
- Stats::Formula vectorMemReadsPerKiloInst;
- Stats::Formula vectorMemWritesPerKiloInst;
- Stats::Formula vectorMemInstsPerKiloInst;
- Stats::Formula scalarMemReadsPerKiloInst;
- Stats::Formula scalarMemWritesPerKiloInst;
- Stats::Formula scalarMemInstsPerKiloInst;
-
- // Cycles required to send register source (addr and data) from
- // register files to memory pipeline, per SIMD.
- Stats::Vector instCyclesVMemPerSimd;
- Stats::Vector instCyclesScMemPerSimd;
- Stats::Vector instCyclesLdsPerSimd;
-
- Stats::Scalar globalReads;
- Stats::Scalar globalWrites;
- Stats::Formula globalMemInsts;
- Stats::Scalar argReads;
- Stats::Scalar argWrites;
- Stats::Formula argMemInsts;
- Stats::Scalar spillReads;
- Stats::Scalar spillWrites;
- Stats::Formula spillMemInsts;
- Stats::Scalar groupReads;
- Stats::Scalar groupWrites;
- Stats::Formula groupMemInsts;
- Stats::Scalar privReads;
- Stats::Scalar privWrites;
- Stats::Formula privMemInsts;
- Stats::Scalar readonlyReads;
- Stats::Scalar readonlyWrites;
- Stats::Formula readonlyMemInsts;
- Stats::Scalar kernargReads;
- Stats::Scalar kernargWrites;
- Stats::Formula kernargMemInsts;
-
- int activeWaves;
- Stats::Distribution waveLevelParallelism;
-
- void updateInstStats(GPUDynInstPtr gpuDynInst);
-
- // the following stats compute the avg. TLB accesslatency per
- // uncoalesced request (only for data)
- Stats::Scalar tlbRequests;
- Stats::Scalar tlbCycles;
- Stats::Formula tlbLatency;
- // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
- Stats::Vector hitsPerTLBLevel;
-
- Stats::Scalar ldsBankAccesses;
- Stats::Distribution ldsBankConflictDist;
-
- // over all memory instructions executed over all wavefronts
- // how many touched 0-4 pages, 4-8, ..., 60-64 pages
- Stats::Distribution pageDivergenceDist;
- // count of non-flat global memory vector instructions executed
- Stats::Scalar dynamicGMemInstrCnt;
- // count of flat global memory vector instructions executed
- Stats::Scalar dynamicFlatMemInstrCnt;
- Stats::Scalar dynamicLMemInstrCnt;
-
- Stats::Scalar wgBlockedDueBarrierAllocation;
- Stats::Scalar wgBlockedDueLdsAllocation;
- // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
- // active when the instruction is committed, this number is still
- // incremented by 1
- Stats::Scalar numInstrExecuted;
- // Number of cycles among successive instruction executions across all
- // wavefronts of the same CU
- Stats::Distribution execRateDist;
- // number of individual vector operations executed
- Stats::Scalar numVecOpsExecuted;
- // number of individual f16 vector operations executed
- Stats::Scalar numVecOpsExecutedF16;
- // number of individual f32 vector operations executed
- Stats::Scalar numVecOpsExecutedF32;
- // number of individual f64 vector operations executed
- Stats::Scalar numVecOpsExecutedF64;
- // number of individual FMA 16,32,64 vector operations executed
- Stats::Scalar numVecOpsExecutedFMA16;
- Stats::Scalar numVecOpsExecutedFMA32;
- Stats::Scalar numVecOpsExecutedFMA64;
- // number of individual MAC 16,32,64 vector operations executed
- Stats::Scalar numVecOpsExecutedMAC16;
- Stats::Scalar numVecOpsExecutedMAC32;
- Stats::Scalar numVecOpsExecutedMAC64;
- // number of individual MAD 16,32,64 vector operations executed
- Stats::Scalar numVecOpsExecutedMAD16;
- Stats::Scalar numVecOpsExecutedMAD32;
- Stats::Scalar numVecOpsExecutedMAD64;
- // total number of two op FP vector operations executed
- Stats::Scalar numVecOpsExecutedTwoOpFP;
- // Total cycles that something is running on the GPU
- Stats::Scalar totalCycles;
- Stats::Formula vpc; // vector ops per cycle
- Stats::Formula vpc_f16; // vector ops per cycle
- Stats::Formula vpc_f32; // vector ops per cycle
- Stats::Formula vpc_f64; // vector ops per cycle
- Stats::Formula ipc; // vector instructions per cycle
- Stats::Distribution controlFlowDivergenceDist;
- Stats::Distribution activeLanesPerGMemInstrDist;
- Stats::Distribution activeLanesPerLMemInstrDist;
- // number of vector ALU instructions received
- Stats::Formula numALUInstsExecuted;
- // number of times a WG can not start due to lack of free VGPRs in SIMDs
- Stats::Scalar numTimesWgBlockedDueVgprAlloc;
- // number of times a WG can not start due to lack of free SGPRs in SIMDs
- Stats::Scalar numTimesWgBlockedDueSgprAlloc;
- Stats::Scalar numCASOps;
- Stats::Scalar numFailedCASOps;
- Stats::Scalar completedWfs;
- Stats::Scalar completedWGs;
-
- // distrubtion in latency difference between first and last cache block
- // arrival ticks
- Stats::Distribution headTailLatency;
-
- void
- regStats() override;
-
LdsState &
getLds() const
{
// a particular GPUDynInst. This is used to calculate the difference
// between the first and last cache block arrival times.
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
+
+ public:
+ void updateInstStats(GPUDynInstPtr gpuDynInst);
+ int activeWaves;
+
+ struct ComputeUnitStats : public Stats::Group
+ {
+ ComputeUnitStats(Stats::Group *parent, int n_wf);
+
+ Stats::Scalar vALUInsts;
+ Stats::Formula vALUInstsPerWF;
+ Stats::Scalar sALUInsts;
+ Stats::Formula sALUInstsPerWF;
+ Stats::Scalar instCyclesVALU;
+ Stats::Scalar instCyclesSALU;
+ Stats::Scalar threadCyclesVALU;
+ Stats::Formula vALUUtilization;
+ Stats::Scalar ldsNoFlatInsts;
+ Stats::Formula ldsNoFlatInstsPerWF;
+ Stats::Scalar flatVMemInsts;
+ Stats::Formula flatVMemInstsPerWF;
+ Stats::Scalar flatLDSInsts;
+ Stats::Formula flatLDSInstsPerWF;
+ Stats::Scalar vectorMemWrites;
+ Stats::Formula vectorMemWritesPerWF;
+ Stats::Scalar vectorMemReads;
+ Stats::Formula vectorMemReadsPerWF;
+ Stats::Scalar scalarMemWrites;
+ Stats::Formula scalarMemWritesPerWF;
+ Stats::Scalar scalarMemReads;
+ Stats::Formula scalarMemReadsPerWF;
+
+ Stats::Formula vectorMemReadsPerKiloInst;
+ Stats::Formula vectorMemWritesPerKiloInst;
+ Stats::Formula vectorMemInstsPerKiloInst;
+ Stats::Formula scalarMemReadsPerKiloInst;
+ Stats::Formula scalarMemWritesPerKiloInst;
+ Stats::Formula scalarMemInstsPerKiloInst;
+
+ // Cycles required to send register source (addr and data) from
+ // register files to memory pipeline, per SIMD.
+ Stats::Vector instCyclesVMemPerSimd;
+ Stats::Vector instCyclesScMemPerSimd;
+ Stats::Vector instCyclesLdsPerSimd;
+
+ Stats::Scalar globalReads;
+ Stats::Scalar globalWrites;
+ Stats::Formula globalMemInsts;
+ Stats::Scalar argReads;
+ Stats::Scalar argWrites;
+ Stats::Formula argMemInsts;
+ Stats::Scalar spillReads;
+ Stats::Scalar spillWrites;
+ Stats::Formula spillMemInsts;
+ Stats::Scalar groupReads;
+ Stats::Scalar groupWrites;
+ Stats::Formula groupMemInsts;
+ Stats::Scalar privReads;
+ Stats::Scalar privWrites;
+ Stats::Formula privMemInsts;
+ Stats::Scalar readonlyReads;
+ Stats::Scalar readonlyWrites;
+ Stats::Formula readonlyMemInsts;
+ Stats::Scalar kernargReads;
+ Stats::Scalar kernargWrites;
+ Stats::Formula kernargMemInsts;
+
+ Stats::Distribution waveLevelParallelism;
+
+        // the following stats compute the avg. TLB access latency per
+ // uncoalesced request (only for data)
+ Stats::Scalar tlbRequests;
+ Stats::Scalar tlbCycles;
+ Stats::Formula tlbLatency;
+ // hitsPerTLBLevel[x] are the hits in Level x TLB.
+ // x = 0 is the page table.
+ Stats::Vector hitsPerTLBLevel;
+
+ Stats::Scalar ldsBankAccesses;
+ Stats::Distribution ldsBankConflictDist;
+
+ // over all memory instructions executed over all wavefronts
+ // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+ Stats::Distribution pageDivergenceDist;
+ // count of non-flat global memory vector instructions executed
+ Stats::Scalar dynamicGMemInstrCnt;
+ // count of flat global memory vector instructions executed
+ Stats::Scalar dynamicFlatMemInstrCnt;
+ Stats::Scalar dynamicLMemInstrCnt;
+
+ Stats::Scalar wgBlockedDueBarrierAllocation;
+ Stats::Scalar wgBlockedDueLdsAllocation;
+ // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
+ // active when the instruction is committed, this number is still
+ // incremented by 1
+ Stats::Scalar numInstrExecuted;
+ // Number of cycles among successive instruction executions across all
+ // wavefronts of the same CU
+ Stats::Distribution execRateDist;
+ // number of individual vector operations executed
+ Stats::Scalar numVecOpsExecuted;
+ // number of individual f16 vector operations executed
+ Stats::Scalar numVecOpsExecutedF16;
+ // number of individual f32 vector operations executed
+ Stats::Scalar numVecOpsExecutedF32;
+ // number of individual f64 vector operations executed
+ Stats::Scalar numVecOpsExecutedF64;
+ // number of individual FMA 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedFMA16;
+ Stats::Scalar numVecOpsExecutedFMA32;
+ Stats::Scalar numVecOpsExecutedFMA64;
+ // number of individual MAC 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedMAC16;
+ Stats::Scalar numVecOpsExecutedMAC32;
+ Stats::Scalar numVecOpsExecutedMAC64;
+ // number of individual MAD 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedMAD16;
+ Stats::Scalar numVecOpsExecutedMAD32;
+ Stats::Scalar numVecOpsExecutedMAD64;
+ // total number of two op FP vector operations executed
+ Stats::Scalar numVecOpsExecutedTwoOpFP;
+ // Total cycles that something is running on the GPU
+ Stats::Scalar totalCycles;
+ Stats::Formula vpc; // vector ops per cycle
+ Stats::Formula vpc_f16; // vector ops per cycle
+ Stats::Formula vpc_f32; // vector ops per cycle
+ Stats::Formula vpc_f64; // vector ops per cycle
+ Stats::Formula ipc; // vector instructions per cycle
+ Stats::Distribution controlFlowDivergenceDist;
+ Stats::Distribution activeLanesPerGMemInstrDist;
+ Stats::Distribution activeLanesPerLMemInstrDist;
+ // number of vector ALU instructions received
+ Stats::Formula numALUInstsExecuted;
+ // number of times a WG cannot start due to lack of free VGPRs in SIMDs
+ Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+ // number of times a WG cannot start due to lack of free SGPRs in SIMDs
+ Stats::Scalar numTimesWgBlockedDueSgprAlloc;
+ Stats::Scalar numCASOps;
+ Stats::Scalar numFailedCASOps;
+ Stats::Scalar completedWfs;
+ Stats::Scalar completedWGs;
+
+        // distribution of the latency difference between first and last
+        // cache block arrival ticks
+ Stats::Distribution headTailLatency;
+
+ // Track the amount of interleaving between wavefronts on each SIMD.
+ // This stat is sampled using instExecPerSimd to compute the number
+ // of instructions that have been executed on a SIMD between a WF
+ // executing two successive instructions.
+ Stats::VectorDistribution instInterleave;
+ } stats;
};
#endif // __COMPUTE_UNIT_HH__
: SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
tickEvent([this]{ exec(); },
"GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
- dispatchActive(false)
+ dispatchActive(false), stats(this)
{
schedule(&tickEvent, 0);
}
{
}
-void
-GPUDispatcher::regStats()
-{
- numKernelLaunched
- .name(name() + ".num_kernel_launched")
- .desc("number of kernel launched")
- ;
-
- cyclesWaitingForDispatch
- .name(name() + ".cycles_wait_dispatch")
- .desc("number of cycles with outstanding wavefronts "
- "that are waiting to be dispatched")
- ;
-}
-
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
- ++numKernelLaunched;
+ ++stats.numKernelLaunched;
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
task->kernelName(), task->dispatchId());
DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());
if (execIds.size() > 0) {
- ++cyclesWaitingForDispatch;
+ ++stats.cyclesWaitingForDispatch;
}
/**
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
+
+GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(numKernelLaunched, "number of kernel launched"),
+ ADD_STAT(cyclesWaitingForDispatch, "number of cycles with outstanding "
+ "wavefronts that are waiting to be dispatched")
+{
+}
#include <vector>
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "dev/hsa/hsa_packet.hh"
#include "params/GPUDispatcher.hh"
#include "sim/sim_object.hh"
void serialize(CheckpointOut &cp) const override;
void unserialize(CheckpointIn &cp) override;
- void regStats() override;
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
void setShader(Shader *new_shader);
void exec();
std::queue<int> doneIds;
// is there a kernel in execution?
bool dispatchActive;
- /*statistics*/
- Stats::Scalar numKernelLaunched;
- Stats::Scalar cyclesWaitingForDispatch;
+
+ protected:
+ struct GPUDispatcherStats : public Stats::Group
+ {
+ GPUDispatcherStats(Stats::Group *parent);
+
+ Stats::Scalar numKernelLaunched;
+ Stats::Scalar cyclesWaitingForDispatch;
+ } stats;
};
#endif // __GPU_COMPUTE_DISPATCHER_HH__
: computeUnit(cu), fromSchedule(from_schedule),
lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
- executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
+ executionResourcesUsed(0), _name(cu.name() + ".ExecStage"),
+ stats(&cu)
{
- numTransActiveIdle = 0;
+ stats.numTransActiveIdle = 0;
idle_dur = 0;
}
if (stage == IdleExec) {
// count cycles when no instruction to a specific execution resource
// is executed
- numCyclesWithNoInstrTypeIssued[unitId]++;
+ stats.numCyclesWithNoInstrTypeIssued[unitId]++;
} else if (stage == BusyExec) {
// count the number of cycles an instruction to a specific execution
// resource type was issued
- numCyclesWithInstrTypeIssued[unitId]++;
+ stats.numCyclesWithInstrTypeIssued[unitId]++;
thisTimeInstExecuted = true;
instrExecuted = true;
++executionResourcesUsed;
} else if (stage == PostExec) {
// count the number of transitions from active to idle
if (lastTimeInstExecuted && !thisTimeInstExecuted) {
- ++numTransActiveIdle;
+ ++stats.numTransActiveIdle;
}
if (!lastTimeInstExecuted && thisTimeInstExecuted) {
- idleDur.sample(idle_dur);
+ stats.idleDur.sample(idle_dur);
idle_dur = 0;
} else if (!thisTimeInstExecuted) {
idle_dur++;
// track the number of cycles we either issued at least
// instruction or issued no instructions at all
if (instrExecuted) {
- numCyclesWithInstrIssued++;
+ stats.numCyclesWithInstrIssued++;
} else {
- numCyclesWithNoIssue++;
+ stats.numCyclesWithNoIssue++;
}
- spc.sample(executionResourcesUsed);
+ stats.spc.sample(executionResourcesUsed);
}
}
collectStatistics(PostExec, 0);
}
-void
-ExecStage::regStats()
+ExecStage::ExecStageStats::ExecStageStats(Stats::Group *parent)
+ : Stats::Group(parent, "ExecStage"),
+ ADD_STAT(numTransActiveIdle,
+ "number of CU transitions from active to idle"),
+ ADD_STAT(numCyclesWithNoIssue, "number of cycles the CU issues nothing"),
+ ADD_STAT(numCyclesWithInstrIssued,
+ "number of cycles the CU issued at least one instruction"),
+ ADD_STAT(spc,
+ "Execution units active per cycle (Exec unit=SIMD,MemPipe)"),
+ ADD_STAT(idleDur, "duration of idle periods in cycles"),
+ ADD_STAT(numCyclesWithInstrTypeIssued, "Number of cycles at least one "
+ "instruction issued to execution resource type"),
+    ADD_STAT(numCyclesWithNoInstrTypeIssued, "Number of cycles no "
+             "instructions issued to execution resource type")
{
- numTransActiveIdle
- .name(name() + ".num_transitions_active_to_idle")
- .desc("number of CU transitions from active to idle")
- ;
-
- numCyclesWithNoIssue
- .name(name() + ".num_cycles_with_no_issue")
- .desc("number of cycles the CU issues nothing")
- ;
-
- numCyclesWithInstrIssued
- .name(name() + ".num_cycles_with_instr_issued")
- .desc("number of cycles the CU issued at least one instruction")
- ;
-
- spc
- .init(0, computeUnit.numExeUnits(), 1)
- .name(name() + ".spc")
- .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
- ;
+ ComputeUnit *compute_unit = static_cast<ComputeUnit*>(parent);
- idleDur
- .init(0,75,5)
- .name(name() + ".idle_duration_in_cycles")
- .desc("duration of idle periods in cycles")
- ;
-
- numCyclesWithInstrTypeIssued
- .init(computeUnit.numExeUnits())
- .name(name() + ".num_cycles_issue_exec_rsrc")
- .desc("Number of cycles at least one instruction issued to "
- "execution resource type")
- ;
-
- numCyclesWithNoInstrTypeIssued
- .init(computeUnit.numExeUnits())
- .name(name() + ".num_cycles_no_issue_exec_rsrc")
- .desc("Number of clks no instructions issued to execution "
- "resource type")
- ;
+ spc.init(0, compute_unit->numExeUnits(), 1);
+ idleDur.init(0, 75, 5);
+ numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits());
+ numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits());
int c = 0;
- for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
+    for (int i = 0; i < compute_unit->numVectorALUs; i++, c++) {
std::string s = "VectorALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
- for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
+    for (int i = 0; i < compute_unit->numScalarALUs; i++, c++) {
std::string s = "ScalarALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
-
- numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
- numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
}
#include <utility>
#include <vector>
-#include "sim/stats.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
class ComputeUnit;
class ScheduleToExecute;
void dumpDispList();
const std::string& name() const { return _name; }
- void regStats();
- // number of idle cycles
- Stats::Scalar numCyclesWithNoIssue;
- // number of busy cycles
- Stats::Scalar numCyclesWithInstrIssued;
- // number of cycles during which at least one
- // instruction was issued to an execution resource type
- Stats::Vector numCyclesWithInstrTypeIssued;
- // number of idle cycles during which the scheduler
- // issued no instructions targeting a specific
- // execution resource type
- Stats::Vector numCyclesWithNoInstrTypeIssued;
- // SIMDs active per cycle
- Stats::Distribution spc;
private:
void collectStatistics(enum STAT_STATUS stage, int unitId);
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
- Stats::Scalar numTransActiveIdle;
- Stats::Distribution idleDur;
int executionResourcesUsed;
uint64_t idle_dur;
const std::string _name;
+
+ protected:
+ struct ExecStageStats : public Stats::Group
+ {
+ ExecStageStats(Stats::Group *parent);
+
+ // number of transitions from active to idle
+ Stats::Scalar numTransActiveIdle;
+ // number of idle cycles
+ Stats::Scalar numCyclesWithNoIssue;
+ // number of busy cycles
+ Stats::Scalar numCyclesWithInstrIssued;
+ // SIMDs active per cycle
+ Stats::Distribution spc;
+ // duration of idle periods in cycles
+ Stats::Distribution idleDur;
+ // number of cycles during which at least one
+ // instruction was issued to an execution resource type
+ Stats::Vector numCyclesWithInstrTypeIssued;
+ // number of idle cycles during which the scheduler
+ // issued no instructions targeting a specific
+ // execution resource type
+ Stats::Vector numCyclesWithNoInstrTypeIssued;
+ } stats;
};
#endif // __EXEC_STAGE_HH__
FetchStage::FetchStage(const ComputeUnitParams &p, ComputeUnit &cu)
: numVectorALUs(p.num_SIMDs), computeUnit(cu),
- _name(cu.name() + ".FetchStage")
+ _name(cu.name() + ".FetchStage"), stats(&cu)
{
for (int j = 0; j < numVectorALUs; ++j) {
FetchUnit newFetchUnit(p, cu);
const unsigned num_instructions = pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst);
- instFetchInstReturned.sample(num_instructions);
+ stats.instFetchInstReturned.sample(num_instructions);
uint32_t simdId = wavefront->simdId;
_fetchUnit[simdId].processFetchReturn(pkt);
}
_fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}
-void
-FetchStage::regStats()
+FetchStage::FetchStageStats::FetchStageStats(Stats::Group *parent)
+ : Stats::Group(parent, "FetchStage"),
+    ADD_STAT(instFetchInstReturned, "For each instruction fetch request "
+             "received, record how many instructions were returned")
{
- instFetchInstReturned
- .init(1, 32, 1)
- .name(name() + ".inst_fetch_instr_returned")
- .desc("For each instruction fetch request recieved record how many "
- "instructions you got from it")
- ;
+ instFetchInstReturned.init(1, 32, 1);
}
#include <vector>
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/fetch_unit.hh"
// Instruction fetch stage.
// Stats related variables and methods
const std::string& name() const { return _name; }
- void regStats();
- Stats::Distribution instFetchInstReturned;
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
private:
// instantiated per VALU/SIMD
std::vector<FetchUnit> _fetchUnit;
const std::string _name;
+
+ protected:
+ struct FetchStageStats : public Stats::Group
+ {
+ FetchStageStats(Stats::Group *parent);
+
+ Stats::Distribution instFetchInstReturned;
+ } stats;
};
#endif // __FETCH_STAGE_HH__
: computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
gmQueueSize(p.global_mem_queue_size),
maxWaveRequests(p.max_wave_requests), inflightStores(0),
- inflightLoads(0)
+ inflightLoads(0), stats(&cu)
{
}
mem_req->second.second = true;
}
-void
-GlobalMemPipeline::regStats()
+GlobalMemPipeline::
+GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
+ : Stats::Group(parent, "GlobalMemPipeline"),
+ ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
+ "are delayed before updating the VRF")
{
- loadVrfBankConflictCycles
- .name(name() + ".load_vrf_bank_conflict_cycles")
- .desc("total number of cycles GM data are delayed before updating "
- "the VRF")
- ;
}
#include <queue>
#include <string>
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
}
const std::string &name() const { return _name; }
- void regStats();
void
incLoadVRFBankConflictCycles(int num_cycles)
{
- loadVrfBankConflictCycles += num_cycles;
+ stats.loadVrfBankConflictCycles += num_cycles;
}
bool coalescerReady(GPUDynInstPtr mp) const;
int gmQueueSize;
int maxWaveRequests;
- // number of cycles of delaying the update of a VGPR that is the
- // target of a load instruction (or the load component of an atomic)
- // The delay is due to VRF bank conflicts
- Stats::Scalar loadVrfBankConflictCycles;
// Counters to track the inflight loads and stores
// so that we can provide the proper backpressure
// on the number of inflight memory operations.
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;
+
+ protected:
+ struct GlobalMemPipelineStats : public Stats::Group
+ {
+ GlobalMemPipelineStats(Stats::Group *parent);
+
+ // number of cycles of delaying the update of a VGPR that is the
+ // target of a load instruction (or the load component of an atomic)
+ // The delay is due to VRF bank conflicts
+ Stats::Scalar loadVrfBankConflictCycles;
+ } stats;
};
#endif // __GLOBAL_MEMORY_PIPELINE_HH__
{
if (_staticInst->isLocalMem()) {
// access to LDS (shared) memory
- cu->dynamicLMemInstrCnt++;
+ cu->stats.dynamicLMemInstrCnt++;
} else if (_staticInst->isFlat()) {
- cu->dynamicFlatMemInstrCnt++;
+ cu->stats.dynamicFlatMemInstrCnt++;
} else {
// access to global memory
// update PageDivergence histogram
int number_pages_touched = cu->pagesTouched.size();
assert(number_pages_touched);
- cu->pageDivergenceDist.sample(number_pages_touched);
+ cu->stats.pageDivergenceDist.sample(number_pages_touched);
std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
// total number of memory instructions (dynamic)
// Atomics are counted as a single memory instruction.
// this is # memory instructions per wavefronts, not per workitem
- cu->dynamicGMemInstrCnt++;
+ cu->stats.dynamicGMemInstrCnt++;
}
}
void
execute(T *b)
{
- computeUnit->numCASOps++;
+ computeUnit->stats.numCASOps++;
if (*b == c) {
*b = s;
} else {
- computeUnit->numFailedCASOps++;
+ computeUnit->stats.numFailedCASOps++;
}
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
: ClockedObject(p), configAddress(0), size(p.size),
cleanupEvent([this]{ cleanup(); }, name(), false,
Event::Maximum_Pri),
- exitEvent([this]{ exitCallback(); }, name())
+ exitEvent([this]{ exitCallback(); }, name()), stats(this)
{
assoc = p.assoc;
assert(assoc <= size);
return tlb_hit;
}
- localNumTLBAccesses++;
+ stats.localNumTLBAccesses++;
if (!entry) {
- localNumTLBMisses++;
+ stats.localNumTLBMisses++;
} else {
- localNumTLBHits++;
+ stats.localNumTLBHits++;
}
}
}
DPRINTF(GPUTLB, "Paging enabled.\n");
// The vaddr already has the segment base applied.
TlbEntry *entry = lookup(vaddr);
- localNumTLBAccesses++;
+ stats.localNumTLBAccesses++;
if (!entry) {
- localNumTLBMisses++;
+ stats.localNumTLBMisses++;
if (timing) {
latency = missLatency1;
}
DPRINTF(GPUTLB, "Miss was serviced.\n");
}
} else {
- localNumTLBHits++;
+ stats.localNumTLBHits++;
if (timing) {
latency = hitLatency;
{
}
- void
- GpuTLB::regStats()
- {
- ClockedObject::regStats();
-
- localNumTLBAccesses
- .name(name() + ".local_TLB_accesses")
- .desc("Number of TLB accesses")
- ;
-
- localNumTLBHits
- .name(name() + ".local_TLB_hits")
- .desc("Number of TLB hits")
- ;
-
- localNumTLBMisses
- .name(name() + ".local_TLB_misses")
- .desc("Number of TLB misses")
- ;
-
- localTLBMissRate
- .name(name() + ".local_TLB_miss_rate")
- .desc("TLB miss rate")
- ;
-
- accessCycles
- .name(name() + ".access_cycles")
- .desc("Cycles spent accessing this TLB level")
- ;
-
- pageTableCycles
- .name(name() + ".page_table_cycles")
- .desc("Cycles spent accessing the page table")
- ;
-
- localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
-
- numUniquePages
- .name(name() + ".unique_pages")
- .desc("Number of unique pages touched")
- ;
-
- localCycles
- .name(name() + ".local_cycles")
- .desc("Number of cycles spent in queue for all incoming reqs")
- ;
-
- localLatency
- .name(name() + ".local_latency")
- .desc("Avg. latency over incoming coalesced reqs")
- ;
-
- localLatency = localCycles / localNumTLBAccesses;
-
- globalNumTLBAccesses
- .name(name() + ".global_TLB_accesses")
- .desc("Number of TLB accesses")
- ;
-
- globalNumTLBHits
- .name(name() + ".global_TLB_hits")
- .desc("Number of TLB hits")
- ;
-
- globalNumTLBMisses
- .name(name() + ".global_TLB_misses")
- .desc("Number of TLB misses")
- ;
-
- globalTLBMissRate
- .name(name() + ".global_TLB_miss_rate")
- .desc("TLB miss rate")
- ;
-
- globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
-
- avgReuseDistance
- .name(name() + ".avg_reuse_distance")
- .desc("avg. reuse distance over all pages (in ticks)")
- ;
-
- }
-
/**
* Do the TLB lookup for this coalesced request and schedule
* another event <TLB access latency> cycles later.
int req_cnt = sender_state->reqCnt.back();
if (update_stats) {
- accessCycles -= (curTick() * req_cnt);
- localCycles -= curTick();
+ stats.accessCycles -= (curTick() * req_cnt);
+ stats.localCycles -= curTick();
updatePageFootprint(virt_page_addr);
- globalNumTLBAccesses += req_cnt;
+ stats.globalNumTLBAccesses += req_cnt;
}
tlbOutcome lookup_outcome = TLB_MISS;
// the reqCnt has an entry per level, so its size tells us
// which level we are in
sender_state->hitLevel = sender_state->reqCnt.size();
- globalNumTLBHits += req_cnt;
+ stats.globalNumTLBHits += req_cnt;
}
} else {
if (update_stats)
- globalNumTLBMisses += req_cnt;
+ stats.globalNumTLBMisses += req_cnt;
}
/*
handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
if (update_stats) {
- accessCycles += (req_cnt * curTick());
- localCycles += curTick();
+ stats.accessCycles += (req_cnt * curTick());
+ stats.localCycles += curTick();
}
} else if (outcome == TLB_MISS) {
DPRINTF(GPUTLB, "This is a TLB miss\n");
if (update_stats) {
- accessCycles += (req_cnt*curTick());
- localCycles += curTick();
+ stats.accessCycles += (req_cnt*curTick());
+ stats.localCycles += curTick();
}
if (hasMemSidePort) {
// the reply back till when we propagate it to the coalescer
// above.
if (update_stats) {
- accessCycles += (req_cnt * 1);
- localCycles += 1;
+ stats.accessCycles += (req_cnt * 1);
+ stats.localCycles += 1;
}
/**
"addr %#x\n", virtPageAddr);
if (update_stats)
- pageTableCycles -= (req_cnt*curTick());
+ stats.pageTableCycles -= (req_cnt*curTick());
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
assert(tlb_event);
}
} else if (outcome == PAGE_WALK) {
if (update_stats)
- pageTableCycles += (req_cnt*curTick());
+ stats.pageTableCycles += (req_cnt*curTick());
// Need to access the page table and update the TLB
DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
// functional mode means no coalescing
// global metrics are the same as the local metrics
if (update_stats) {
- tlb->globalNumTLBAccesses++;
+ tlb->stats.globalNumTLBAccesses++;
if (success) {
sender_state->hitLevel = sender_state->reqCnt.size();
- tlb->globalNumTLBHits++;
+ tlb->stats.globalNumTLBHits++;
}
}
if (!success) {
if (update_stats)
- tlb->globalNumTLBMisses++;
+ tlb->stats.globalNumTLBMisses++;
if (tlb->hasMemSidePort) {
// there is a TLB below -> propagate down the TLB hierarchy
tlb->memSidePort[0]->sendFunctional(pkt);
bool first_page_access = ret.second;
if (first_page_access) {
- numUniquePages++;
+ stats.numUniquePages++;
} else {
int accessed_before;
accessed_before = curTick() - ret.first->second.lastTimeAccessed;
if (accessDistance) {
ret.first->second.localTLBAccesses
- .push_back(localNumTLBAccesses.value());
+ .push_back(stats.localNumTLBAccesses.value());
}
}
}
if (!TLBFootprint.empty()) {
- avgReuseDistance =
+ stats.avgReuseDistance =
sum_avg_reuse_distance_per_page / TLBFootprint.size();
}
//clear the TLBFootprint map
TLBFootprint.clear();
}
+
+ GpuTLB::GpuTLBStats::GpuTLBStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"),
+ ADD_STAT(localNumTLBHits, "Number of TLB hits"),
+ ADD_STAT(localNumTLBMisses, "Number of TLB misses"),
+ ADD_STAT(localTLBMissRate, "TLB miss rate"),
+ ADD_STAT(globalNumTLBAccesses, "Number of TLB accesses"),
+ ADD_STAT(globalNumTLBHits, "Number of TLB hits"),
+ ADD_STAT(globalNumTLBMisses, "Number of TLB misses"),
+ ADD_STAT(globalTLBMissRate, "TLB miss rate"),
+ ADD_STAT(accessCycles, "Cycles spent accessing this TLB level"),
+ ADD_STAT(pageTableCycles, "Cycles spent accessing the page table"),
+ ADD_STAT(numUniquePages, "Number of unique pages touched"),
+ ADD_STAT(localCycles, "Number of cycles spent in queue for all "
+ "incoming reqs"),
+ ADD_STAT(localLatency, "Avg. latency over incoming coalesced reqs"),
+ ADD_STAT(avgReuseDistance, "avg. reuse distance over all pages (in "
+ "ticks)")
+ {
+ localLatency = localCycles / localNumTLBAccesses;
+
+ localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
+ globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
+ }
} // namespace X86ISA
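One subtlety in the GpuTLBStats constructor above: a Stats::Formula stores an expression tree rather than a value, so binding localLatency and the miss rates at construction time is safe even though every operand is still zero; the ratios are evaluated when stats are dumped. A minimal sketch of the idiom, with hypothetical names:

    struct RatioStats : public Stats::Group
    {
        RatioStats(Stats::Group *parent)
            : Stats::Group(parent),
              ADD_STAT(hits, "number of hits"),
              ADD_STAT(accesses, "number of accesses"),
              ADD_STAT(hitRate, "hit rate (percent)")
        {
            // Captured as an expression; evaluated at stats-dump time.
            hitRate = 100 * hits / accesses;
        }

        Stats::Scalar hits;
        Stats::Scalar accesses;
        Stats::Formula hitRate;
    };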
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/port.hh"
#include "mem/request.hh"
int missLatency1;
int missLatency2;
- // local_stats are as seen from the TLB
- // without taking into account coalescing
- Stats::Scalar localNumTLBAccesses;
- Stats::Scalar localNumTLBHits;
- Stats::Scalar localNumTLBMisses;
- Stats::Formula localTLBMissRate;
-
- // global_stats are as seen from the
- // CU's perspective taking into account
- // all coalesced requests.
- Stats::Scalar globalNumTLBAccesses;
- Stats::Scalar globalNumTLBHits;
- Stats::Scalar globalNumTLBMisses;
- Stats::Formula globalTLBMissRate;
-
- // from the CU perspective (global)
- Stats::Scalar accessCycles;
- // from the CU perspective (global)
- Stats::Scalar pageTableCycles;
- Stats::Scalar numUniquePages;
- // from the perspective of this TLB
- Stats::Scalar localCycles;
- // from the perspective of this TLB
- Stats::Formula localLatency;
- // I take the avg. per page and then
- // the avg. over all pages.
- Stats::Scalar avgReuseDistance;
-
- void regStats() override;
void updatePageFootprint(Addr virt_page_addr);
void printAccessPattern();
void exitCallback();
EventFunctionWrapper exitEvent;
+
+ protected:
+ struct GpuTLBStats : public Stats::Group
+ {
+ GpuTLBStats(Stats::Group *parent);
+
+ // local_stats are as seen from the TLB
+ // without taking into account coalescing
+ Stats::Scalar localNumTLBAccesses;
+ Stats::Scalar localNumTLBHits;
+ Stats::Scalar localNumTLBMisses;
+ Stats::Formula localTLBMissRate;
+
+ // global_stats are as seen from the
+ // CU's perspective taking into account
+ // all coalesced requests.
+ Stats::Scalar globalNumTLBAccesses;
+ Stats::Scalar globalNumTLBHits;
+ Stats::Scalar globalNumTLBMisses;
+ Stats::Formula globalTLBMissRate;
+
+ // from the CU perspective (global)
+ Stats::Scalar accessCycles;
+ // from the CU perspective (global)
+ Stats::Scalar pageTableCycles;
+ Stats::Scalar numUniquePages;
+ // from the perspective of this TLB
+ Stats::Scalar localCycles;
+ // from the perspective of this TLB
+ Stats::Formula localLatency;
+ // avg. reuse distance is computed per page,
+ // then averaged over all pages.
+ Stats::Scalar avgReuseDistance;
+ } stats;
};
}
// the number of conflicts this packet will have when accessing the LDS
unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
// count the total number of physical LDS bank accessed
- parent->ldsBankAccesses += bankAccesses;
+ parent->stats.ldsBankAccesses += bankAccesses;
// count the LDS bank conflicts. A number set to 1 indicates one
// access per bank maximum so there are no bank conflicts
- parent->ldsBankConflictDist.sample(bankConflicts-1);
+ parent->stats.ldsBankConflictDist.sample(bankConflicts-1);
GPUDynInstPtr dynInst = getDynInstr(packet);
// account for the LDS bank conflict overhead
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
: computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
- lmQueueSize(p.local_mem_queue_size)
+ lmQueueSize(p.local_mem_queue_size), stats(&cu)
{
}
lmIssuedRequests.push(gpuDynInst);
}
-void
-LocalMemPipeline::regStats()
+
+LocalMemPipeline::
+LocalMemPipelineStats::LocalMemPipelineStats(Stats::Group *parent)
+ : Stats::Group(parent, "LocalMemPipeline"),
+ ADD_STAT(loadVrfBankConflictCycles, "total number of cycles LDS data "
+ "are delayed before updating the VRF")
{
- loadVrfBankConflictCycles
- .name(name() + ".load_vrf_bank_conflict_cycles")
- .desc("total number of cycles LDS data are delayed before updating "
- "the VRF")
- ;
}
#include <queue>
#include <string>
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
-#include "sim/stats.hh"
/*
* @file local_memory_pipeline.hh
}
const std::string& name() const { return _name; }
- void regStats();
void
incLoadVRFBankConflictCycles(int num_cycles)
{
- loadVrfBankConflictCycles += num_cycles;
+ stats.loadVrfBankConflictCycles += num_cycles;
}
private:
ComputeUnit &computeUnit;
const std::string _name;
int lmQueueSize;
- Stats::Scalar loadVrfBankConflictCycles;
+
// Local Memory Request Fifo: all shared memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> lmIssuedRequests;
// Local Memory Response Fifo: all responses of shared memory
// requests are sent to this FIFO from LDS
std::queue<GPUDynInstPtr> lmReturnedRequests;
+
+ protected:
+ struct LocalMemPipelineStats : public Stats::Group
+ {
+ LocalMemPipelineStats(Stats::Group *parent);
+
+ Stats::Scalar loadVrfBankConflictCycles;
+ } stats;
};
#endif // __LOCAL_MEMORY_PIPELINE_HH__
#include "params/RegisterFile.hh"
RegisterFile::RegisterFile(const RegisterFileParams &p)
- : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs)
+ : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs), stats(this)
{
fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");
fatal_if(simdId < 0, "Illegal SIMD id for VRF");
{
}
-void
-RegisterFile::regStats()
-{
- registerReads
- .name(name() + ".register_reads")
- .desc("Total number of DWORDs read from register file")
- ;
-
- registerWrites
- .name(name() + ".register_writes")
- .desc("Total number of DWORDS written to register file")
- ;
-
- sramReads
- .name(name() + ".sram_reads")
- .desc("Total number of register file bank SRAM activations for reads")
- ;
-
- sramWrites
- .name(name() + ".sram_writes")
- .desc("Total number of register file bank SRAM activations for writes")
- ;
+RegisterFile::RegisterFileStats::RegisterFileStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(registerReads,
+ "Total number of DWORDs read from register file"),
+ ADD_STAT(registerWrites,
+ "Total number of DWORDS written to register file"),
+ ADD_STAT(sramReads,
+ "Total number of register file bank SRAM activations for reads"),
+ ADD_STAT(sramWrites,
+ "Total number of register file bank SRAM activations for writes")
+{
}
virtual ~RegisterFile();
virtual void setParent(ComputeUnit *_computeUnit);
int numRegs() const { return _numRegs; }
- virtual void regStats() override;
// State functions
// numer of registers in this register file
int _numRegs;
- // Stats
- // Total number of register reads, incremented once per DWORD per thread
- Stats::Scalar registerReads;
- // Total number of register writes, incremented once per DWORD per thread
- Stats::Scalar registerWrites;
-
- // Number of register file SRAM activations for reads.
- // The register file may be implemented with multiple SRAMs. This stat
- // tracks how many times the SRAMs are accessed for reads.
- Stats::Scalar sramReads;
- // Number of register file SRAM activations for writes
- Stats::Scalar sramWrites;
+
+ struct RegisterFileStats : public Stats::Group
+ {
+ RegisterFileStats(Stats::Group *parent);
+
+ // Total number of register reads, incremented once per DWORD per thread
+ Stats::Scalar registerReads;
+ // Total number of register writes, incremented once per DWORD per thread
+ Stats::Scalar registerWrites;
+
+ // Number of register file SRAM activations for reads.
+ // The register file may be implemented with multiple SRAMs. This stat
+ // tracks how many times the SRAMs are accessed for reads.
+ Stats::Scalar sramReads;
+ // Number of register file SRAM activations for writes
+ Stats::Scalar sramWrites;
+ } stats;
};
#endif // __REGISTER_FILE_HH__
{
policy->freeRegisters(w);
}
-
-void
-RegisterManager::regStats()
-{
- policy->regStats();
-}
void setParent(ComputeUnit *cu);
void exec();
- // Stats related variables and methods
- void regStats();
-
// lookup virtual to physical register translation
int mapVgpr(Wavefront* w, int vgprIndex);
int mapSgpr(Wavefront* w, int sgprIndex);
// free all remaining registers held by specified WF
virtual void freeRegisters(Wavefront *w) = 0;
- // stats
- virtual void regStats() = 0;
-
protected:
ComputeUnit *cu;
};
computeUnit.cu_id, mp->simdId, mp->wfSlotId);
}
}
-
-void
-ScalarMemPipeline::regStats()
-{
-}
}
const std::string& name() const { return _name; }
- void regStats();
private:
ComputeUnit &computeUnit;
if (regBusy(pSgpr)) {
if (ii->isDstOperand(i)) {
- w->numTimesBlockedDueWAXDependencies++;
+ w->stats.numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pSgpr);
- w->numTimesBlockedDueRAWDependencies++;
+ w->stats.numTimesBlockedDueRAWDependencies++;
}
return false;
}
if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
int DWORDs = ii->getOperandSize(i) <= 4 ? 1
: ii->getOperandSize(i) / 4;
- registerReads += DWORDs;
+ stats.registerReads += DWORDs;
}
}
enqRegFreeEvent(physReg, tickDelay);
}
- registerWrites += nRegs;
+ stats.registerWrites += nRegs;
}
}
}
enqRegFreeEvent(physReg, computeUnit->clockPeriod());
}
- registerWrites += nRegs;
+ stats.registerWrites += nRegs;
}
}
}
_name(cu.name() + ".ScheduleStage"),
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
- locMemBusRdy(false), locMemIssueRdy(false)
+ locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
for (int j = 0; j < cu.numExeUnits(); ++j) {
scheduler.emplace_back(p);
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
- rdyListEmpty[j]++;
+ stats.rdyListEmpty[j]++;
continue;
}
- rdyListNotEmpty[j]++;
+ stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
- wf->schCycles++;
- addToSchListStalls[j]++;
+ wf->stats.schCycles++;
+ stats.addToSchListStalls[j]++;
} else {
if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
wf->incLGKMInstsIssued();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
- rdyListEmpty[j]++;
+ stats.rdyListEmpty[j]++;
continue;
}
- rdyListNotEmpty[j]++;
+ stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
- wf->schCycles++;
- addToSchListStalls[j]++;
+ wf->stats.schCycles++;
+ stats.addToSchListStalls[j]++;
}
}
computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
return true;
} else {
- rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
if (!accessSrfWr) {
- rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
}
if (!accessVrfWr) {
- rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
}
// Increment stall counts for WF
- wf->schStalls++;
- wf->schRfAccessStalls++;
+ wf->stats.schStalls++;
+ wf->stats.schRfAccessStalls++;
}
return false;
}
return true;
} else {
// Number of stall cycles due to RF access denied
- rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
// Count number of denials due to each reason
// Multiple items may contribute to the denied request
if (!accessVrf) {
- rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
}
if (!accessSrf) {
- rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
}
// Increment stall counts for WF
- wf->schStalls++;
- wf->schRfAccessStalls++;
+ wf->stats.schStalls++;
+ wf->stats.schRfAccessStalls++;
DPRINTF(GPUSched, "schList[%d]: Could not add: "
"SIMD[%d] WV[%d]: %d: %s\n",
exeType, wf->simdId, wf->wfDynId,
// TODO: Scalar NOP does not require SALU in hardware,
// and is executed out of IB directly.
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
- dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
- dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isEndOfKernel()) {
// EndPgm instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
- dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
|| gpu_dyn_inst->isALU()) {
// Barrier, Branch, or ALU instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
- dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
- dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
bool rdy = true;
if (!glbMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
}
if (!rdy) {
return false;
bool rdy = true;
if (!scalarMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
}
if (!scalarMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.scalarMemoryPipe
.isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+ wf->scalarWrGmReqsInPipe))
{
rdy = false;
- dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
bool rdy = true;
if (!locMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
}
if (!locMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
- dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
+ stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
bool rdy = true;
if (!glbMemIssueRdy || !locMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy || !locMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
gpu_dyn_inst->disassemble());
return false;
}
- dispNrdyStalls[SCH_RDY]++;
+ stats.dispNrdyStalls[SCH_RDY]++;
return true;
}
} else {
// Either another wave has been dispatched, or this wave
// was not ready, so it is stalled this cycle
- schIter->first->wavefront()->schStalls++;
+ schIter->first->wavefront()->stats.schStalls++;
if (!dispRdy) {
// not ready for dispatch, increment stall stat
- schIter->first->wavefront()->schResourceStalls++;
+ schIter->first->wavefront()->stats.schResourceStalls++;
}
// Examine next wave for this resource
schIter++;
// Increment stall count if no wave sent to dispatchList for
// current execution resource
if (!dispatched) {
- schListToDispListStalls[j]++;
+ stats.schListToDispListStalls[j]++;
} else {
- schListToDispList[j]++;
+ stats.schListToDispList[j]++;
}
}
}
reinsertToSchList(wf->localMem, toExecute
.readyInst(wf->localMem));
// Increment stall stats for LDS-VRF arbitration
- ldsBusArbStalls++;
+ stats.ldsBusArbStalls++;
toExecute.readyInst(wf->localMem)
- ->wavefront()->schLdsArbStalls++;
+ ->wavefront()->stats.schLdsArbStalls++;
}
// With arbitration of LM pipe complete, transition the
// LM pipe to SKIP state in the dispatchList to inform EX stage
// Increment the number of cycles the wave spends in the
// SCH stage, since this loop visits every wave in SCH.
- wf->schCycles++;
+ wf->stats.schCycles++;
bool vrfRdy = true;
if (!gpu_dyn_inst->isScalar()) {
p.second = RFBUSY;
// Increment stall stats
- wf->schStalls++;
- wf->schOpdNrdyStalls++;
+ wf->stats.schStalls++;
+ wf->stats.schOpdNrdyStalls++;
- opdNrdyStalls[SCH_RF_OPD_NRDY]++;
+ stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
if (!vrfRdy) {
- opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
+ stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
}
if (!srfRdy) {
- opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
+ stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
}
}
}
wavesInSch.erase(w->wfDynId);
}
-void
-ScheduleStage::regStats()
+ScheduleStage::ScheduleStageStats::ScheduleStageStats(Stats::Group *parent,
+ int num_exec_units)
+ : Stats::Group(parent, "ScheduleStage"),
+ ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per "
+ "execution resource"),
+ ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
+ "list per execution resource"),
+ ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
+ "schList per execution resource when ready list is not empty"),
+ ADD_STAT(schListToDispList, "number of cycles a wave is added to "
+ "dispatchList per execution resource"),
+ ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
+ " dispatchList per execution resource"),
+ ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
+ ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
+ "conflicts"),
+ ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
+ "ready"),
+ ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
+ "ready")
{
- rdyListNotEmpty
- .init(computeUnit.numExeUnits())
- .name(name() + ".rdy_list_not_empty")
- .desc("number of cycles one or more wave on ready list per "
- "execution resource")
- ;
-
- rdyListEmpty
- .init(computeUnit.numExeUnits())
- .name(name() + ".rdy_list_empty")
- .desc("number of cycles no wave on ready list per "
- "execution resource")
- ;
-
- addToSchListStalls
- .init(computeUnit.numExeUnits())
- .name(name() + ".sch_list_add_stalls")
- .desc("number of cycles a wave is not added to schList per "
- "execution resource when ready list is not empty")
- ;
-
- schListToDispList
- .init(computeUnit.numExeUnits())
- .name(name() + ".sch_list_to_disp_list")
- .desc("number of cycles a wave is added to dispatchList per "
- "execution resource")
- ;
-
- schListToDispListStalls
- .init(computeUnit.numExeUnits())
- .name(name() + ".sch_list_to_disp_list_stalls")
- .desc("number of cycles no wave is added to dispatchList per "
- "execution resource")
- ;
-
- // Operand Readiness Stall Cycles
- opdNrdyStalls
- .init(SCH_RF_OPD_NRDY_CONDITIONS)
- .name(name() + ".opd_nrdy_stalls")
- .desc("number of stalls in SCH due to operands not ready")
- ;
+ rdyListNotEmpty.init(num_exec_units);
+ rdyListEmpty.init(num_exec_units);
+ addToSchListStalls.init(num_exec_units);
+ schListToDispList.init(num_exec_units);
+ schListToDispListStalls.init(num_exec_units);
+ opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
+ dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
+ rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);
+
opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
- // dispatchReady Stall Cycles
- dispNrdyStalls
- .init(SCH_NRDY_CONDITIONS)
- .name(name() + ".disp_nrdy_stalls")
- .desc("number of stalls in SCH due to resource not ready")
- ;
dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
csprintf("FlatMemFIFO"));
dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
- // RF Access Stall Cycles
- rfAccessStalls
- .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
- .name(name() + ".rf_access_stalls")
- .desc("number of stalls due to RF access denied")
- ;
rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
-
- // Stall cycles due to wave losing LDS bus arbitration
- ldsBusArbStalls
- .name(name() + ".lds_bus_arb_stalls")
- .desc("number of stalls due to VRF->LDS bus conflicts")
- ;
}
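The per-execution-resource and per-condition vectors above keep their init() and subname() calls in the constructor body because ADD_STAT cannot size a Stats::Vector. A minimal sketch of that idiom, with hypothetical condition names:

    enum { COND_A, COND_B, NUM_CONDS };

    struct StallStats : public Stats::Group
    {
        StallStats(Stats::Group *parent)
            : Stats::Group(parent, "StallStats"),
              ADD_STAT(stalls, "stall cycles per condition")
        {
            stalls.init(NUM_CONDS);          // one entry per condition
            stalls.subname(COND_A, "CondA"); // printed as stalls::CondA
            stalls.subname(COND_B, "CondB");
        }

        Stats::Vector stalls;
    };

    // At the detection site: stats.stalls[COND_A]++;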
#include <utility>
#include <vector>
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/scheduler.hh"
SCH_RF_ACCESS_NRDY_CONDITIONS
};
- void regStats();
-
// Called by ExecStage to inform SCH of instruction execution
void deleteFromSch(Wavefront *w);
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
- // Stats
-
- // Number of cycles with empty (or not empty) readyList, per execution
- // resource, when the CU is active (not sleeping)
- Stats::Vector rdyListEmpty;
- Stats::Vector rdyListNotEmpty;
-
- // Number of cycles, per execution resource, when at least one wave
- // was on the readyList and picked by scheduler, but was unable to be
- // added to the schList, when the CU is active (not sleeping)
- Stats::Vector addToSchListStalls;
-
- // Number of cycles, per execution resource, when a wave is selected
- // as candidate for dispatchList from schList
- // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
- Stats::Vector schListToDispList;
-
- // Per execution resource stat, incremented once per cycle if no wave
- // was selected as candidate for dispatch and moved to dispatchList
- Stats::Vector schListToDispListStalls;
-
- // Number of times a wave is selected by the scheduler but cannot
- // be added to the schList due to register files not being able to
- // support reads or writes of operands. RF_ACCESS_NRDY condition is always
- // incremented if at least one read/write not supported, other
- // conditions are incremented independently from each other.
- Stats::Vector rfAccessStalls;
-
- // Number of times a wave is executing FLAT instruction and
- // forces another wave occupying its required local memory resource
- // to be deselected for execution, and placed back on schList
- Stats::Scalar ldsBusArbStalls;
-
- // Count of times VRF and/or SRF blocks waves on schList from
- // performing RFBUSY->RFREADY transition
- Stats::Vector opdNrdyStalls;
-
- // Count of times resource required for dispatch is not ready and
- // blocks wave in RFREADY state on schList from potentially moving
- // to dispatchList
- Stats::Vector dispNrdyStalls;
-
const std::string _name;
// called by exec() to add a wave to schList if the RFs can support it
// the VRF/SRF availability or limits imposed by paremeters (to be added)
// of the SCH stage or CU.
std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
+
+ protected:
+ struct ScheduleStageStats : public Stats::Group
+ {
+ ScheduleStageStats(Stats::Group *parent, int num_exec_units);
+
+ // Number of cycles with empty (or not empty) readyList, per execution
+ // resource, when the CU is active (not sleeping)
+ Stats::Vector rdyListEmpty;
+ Stats::Vector rdyListNotEmpty;
+
+ // Number of cycles, per execution resource, when at least one wave
+ // was on the readyList and picked by scheduler, but was unable to be
+ // added to the schList, when the CU is active (not sleeping)
+ Stats::Vector addToSchListStalls;
+
+ // Number of cycles, per execution resource, when a wave is selected
+ // as candidate for dispatchList from schList
+ // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
+ Stats::Vector schListToDispList;
+
+ // Per execution resource stat, incremented once per cycle if no wave
+ // was selected as candidate for dispatch and moved to dispatchList
+ Stats::Vector schListToDispListStalls;
+
+ // Number of times a wave is selected by the scheduler but cannot
+ // be added to the schList due to register files not being able to
+ // support reads or writes of operands. RF_ACCESS_NRDY condition is
+ // always incremented if at least one read/write not supported, other
+ // conditions are incremented independently from each other.
+ Stats::Vector rfAccessStalls;
+
+ // Number of times a wave is executing FLAT instruction and
+ // forces another wave occupying its required local memory resource
+ // to be deselected for execution, and placed back on schList
+ Stats::Scalar ldsBusArbStalls;
+
+ // Count of times VRF and/or SRF blocks waves on schList from
+ // performing RFBUSY->RFREADY transition
+ Stats::Vector opdNrdyStalls;
+
+ // Count of times resource required for dispatch is not ready and
+ // blocks wave in RFREADY state on schList from potentially moving
+ // to dispatchList
+ Stats::Vector dispNrdyStalls;
+ } stats;
};
#endif // __SCHEDULE_STAGE_HH__
ScoreboardCheckToSchedule
&to_schedule)
: computeUnit(cu), toSchedule(to_schedule),
- _name(cu.name() + ".ScoreboardCheckStage")
+ _name(cu.name() + ".ScoreboardCheckStage"), stats(&cu)
{
}
{
panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
"Instruction ready status %d is illegal!!!", rdyStatus);
- stallCycles[rdyStatus]++;
+ stats.stallCycles[rdyStatus]++;
}
// Return true if this wavefront is ready
}
}
-void
-ScoreboardCheckStage::regStats()
+ScoreboardCheckStage::
+ScoreboardCheckStageStats::ScoreboardCheckStageStats(Stats::Group *parent)
+ : Stats::Group(parent, "ScoreboardCheckStage"),
+ ADD_STAT(stallCycles, "number of cycles wave stalled in SCB")
{
- stallCycles
- .init(NRDY_CONDITIONS)
- .name(name() + ".stall_cycles")
- .desc("number of cycles wave stalled in SCB")
- ;
+ stallCycles.init(NRDY_CONDITIONS);
+
stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
#include <utility>
#include <vector>
-#include "sim/stats.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
class ComputeUnit;
class ScoreboardCheckToSchedule;
// Stats related variables and methods
const std::string& name() const { return _name; }
- void regStats();
private:
void collectStatistics(nonrdytype_e rdyStatus);
*/
ScoreboardCheckToSchedule &toSchedule;
- // Stats
- Stats::Vector stallCycles;
-
const std::string _name;
+
+ protected:
+ struct ScoreboardCheckStageStats : public Stats::Group
+ {
+ ScoreboardCheckStageStats(Stats::Group *parent);
+
+ Stats::Vector stallCycles;
+ } stats;
};
#endif // __SCOREBOARD_CHECK_STAGE_HH__
globalMemSize(p.globalmem),
nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
_dispatcher(*p.dispatcher),
- max_valu_insts(p.max_valu_insts), total_valu_insts(0)
+ max_valu_insts(p.max_valu_insts), total_valu_insts(0),
+ stats(this, p.CUs[0]->wfSize())
{
gpuCmdProc.setShader(this);
_dispatcher.setShader(this);
return scheduledSomething;
}
-void
-Shader::regStats()
-{
- ClockedObject::regStats();
-
- shaderActiveTicks
- .name(name() + ".shader_active_ticks")
- .desc("Total ticks that any CU attached to this shader is active")
- ;
- allLatencyDist
- .init(0, 1600000, 10000)
- .name(name() + ".allLatencyDist")
- .desc("delay distribution for all")
- .flags(Stats::pdf | Stats::oneline);
-
- loadLatencyDist
- .init(0, 1600000, 10000)
- .name(name() + ".loadLatencyDist")
- .desc("delay distribution for loads")
- .flags(Stats::pdf | Stats::oneline);
-
- storeLatencyDist
- .init(0, 1600000, 10000)
- .name(name() + ".storeLatencyDist")
- .desc("delay distribution for stores")
- .flags(Stats::pdf | Stats::oneline);
-
- vectorInstSrcOperand
- .init(4)
- .name(name() + ".vec_inst_src_operand")
- .desc("vector instruction source operand distribution");
-
- vectorInstDstOperand
- .init(4)
- .name(name() + ".vec_inst_dst_operand")
- .desc("vector instruction destination operand distribution");
-
- initToCoalesceLatency
- .init(0, 1600000, 10000)
- .name(name() + ".initToCoalesceLatency")
- .desc("Ticks from vmem inst initiateAcc to coalescer issue")
- .flags(Stats::pdf | Stats::oneline);
-
- rubyNetworkLatency
- .init(0, 1600000, 10000)
- .name(name() + ".rubyNetworkLatency")
- .desc("Ticks from coalescer issue to coalescer hit callback")
- .flags(Stats::pdf | Stats::oneline);
-
- gmEnqueueLatency
- .init(0, 1600000, 10000)
- .name(name() + ".gmEnqueueLatency")
- .desc("Ticks from coalescer hit callback to GM pipe enqueue")
- .flags(Stats::pdf | Stats::oneline);
-
- gmToCompleteLatency
- .init(0, 1600000, 10000)
- .name(name() + ".gmToCompleteLatency")
- .desc("Ticks queued in GM pipes ordered response buffer")
- .flags(Stats::pdf | Stats::oneline);
-
- coalsrLineAddresses
- .init(0, 20, 1)
- .name(name() + ".coalsrLineAddresses")
- .desc("Number of cache lines for coalesced request")
- .flags(Stats::pdf | Stats::oneline);
-
- int wfSize = cuList[0]->wfSize();
- cacheBlockRoundTrip = new Stats::Distribution[wfSize];
- for (int idx = 0; idx < wfSize; ++idx) {
- std::stringstream namestr;
- ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
- cacheBlockRoundTrip[idx]
- .init(0, 1600000, 10000)
- .name(namestr.str())
- .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
- .flags(Stats::pdf | Stats::oneline);
- }
-}
-
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id)
void
Shader::sampleStore(const Tick accessTime)
{
- storeLatencyDist.sample(accessTime);
- allLatencyDist.sample(accessTime);
+ stats.storeLatencyDist.sample(accessTime);
+ stats.allLatencyDist.sample(accessTime);
}
/*
void
Shader::sampleLoad(const Tick accessTime)
{
- loadLatencyDist.sample(accessTime);
- allLatencyDist.sample(accessTime);
+ stats.loadLatencyDist.sample(accessTime);
+ stats.allLatencyDist.sample(accessTime);
}
void
Tick t4 = roundTripTime[3];
Tick t5 = roundTripTime[4];
- initToCoalesceLatency.sample(t2-t1);
- rubyNetworkLatency.sample(t3-t2);
- gmEnqueueLatency.sample(t4-t3);
- gmToCompleteLatency.sample(t5-t4);
+ stats.initToCoalesceLatency.sample(t2-t1);
+ stats.rubyNetworkLatency.sample(t3-t2);
+ stats.gmEnqueueLatency.sample(t4-t3);
+ stats.gmToCompleteLatency.sample(t5-t4);
}
void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
{
- coalsrLineAddresses.sample(lineMap.size());
+ stats.coalsrLineAddresses.sample(lineMap.size());
std::vector<Tick> netTimes;
// For each cache block address generated by a vmem inst, calculate
// Nth distribution.
int idx = 0;
for (auto& time : netTimes) {
- cacheBlockRoundTrip[idx].sample(time);
+ stats.cacheBlockRoundTrip[idx].sample(time);
++idx;
}
}
"Invalid activeCu size\n");
_activeCus--;
if (!_activeCus)
- shaderActiveTicks += curTick() - _lastInactiveTick;
+ stats.shaderActiveTicks += curTick() - _lastInactiveTick;
+}
+
+Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
+ : Stats::Group(parent),
+ ADD_STAT(allLatencyDist, "delay distribution for all"),
+ ADD_STAT(loadLatencyDist, "delay distribution for loads"),
+ ADD_STAT(storeLatencyDist, "delay distribution for stores"),
+ ADD_STAT(initToCoalesceLatency,
+ "Ticks from vmem inst initiateAcc to coalescer issue"),
+ ADD_STAT(rubyNetworkLatency,
+ "Ticks from coalescer issue to coalescer hit callback"),
+ ADD_STAT(gmEnqueueLatency,
+ "Ticks from coalescer hit callback to GM pipe enqueue"),
+ ADD_STAT(gmToCompleteLatency,
+ "Ticks queued in GM pipes ordered response buffer"),
+ ADD_STAT(coalsrLineAddresses,
+ "Number of cache lines for coalesced request"),
+ ADD_STAT(shaderActiveTicks,
+ "Total ticks that any CU attached to this shader is active"),
+ ADD_STAT(vectorInstSrcOperand,
+ "vector instruction source operand distribution"),
+ ADD_STAT(vectorInstDstOperand,
+ "vector instruction destination operand distribution")
+{
+ allLatencyDist
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ loadLatencyDist
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ storeLatencyDist
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ initToCoalesceLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ rubyNetworkLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ gmEnqueueLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ gmToCompleteLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ coalsrLineAddresses
+ .init(0, 20, 1)
+ .flags(Stats::pdf | Stats::oneline);
+
+ vectorInstSrcOperand.init(4);
+ vectorInstDstOperand.init(4);
+
+ cacheBlockRoundTrip = new Stats::Distribution[wf_size];
+ for (int idx = 0; idx < wf_size; ++idx) {
+ std::stringstream namestr;
+ ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
+ static_cast<Shader*>(parent)->name(), idx);
+ cacheBlockRoundTrip[idx]
+ .init(0, 1600000, 10000)
+ .name(namestr.str())
+ .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
+ .flags(Stats::pdf | Stats::oneline);
+ }
}
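The latency distributions above keep their chained init()/flags() configuration; only the name() and desc() calls drop away, since the group now derives both. A minimal sketch, with hypothetical names:

    struct LatencyStats : public Stats::Group
    {
        LatencyStats(Stats::Group *parent)
            : Stats::Group(parent),
              ADD_STAT(latency, "delay distribution in ticks")
        {
            latency
                .init(0, 1600000, 10000)             // min, max, bucket size
                .flags(Stats::pdf | Stats::oneline); // compact pdf output
        }

        Stats::Distribution latency;
    };

    // Sampled when an access completes, e.g.:
    //     stats.latency.sample(curTick() - issueTick);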
#include <string>
#include "arch/isa.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
- // some stats for measuring latency
- Stats::Distribution allLatencyDist;
- Stats::Distribution loadLatencyDist;
- Stats::Distribution storeLatencyDist;
-
- // average ticks from vmem inst initiateAcc to coalescer issue,
- // average ticks from coalescer issue to coalescer hit callback,
- // average ticks from coalescer hit callback to GM pipe enqueue,
- // and average ticks spent in GM pipe's ordered resp buffer.
- Stats::Distribution initToCoalesceLatency;
- Stats::Distribution rubyNetworkLatency;
- Stats::Distribution gmEnqueueLatency;
- Stats::Distribution gmToCompleteLatency;
-
- // average number of cache blocks requested by vmem inst, and
- // average ticks for cache blocks to main memory for the Nth
- // cache block generated by a vmem inst.
- Stats::Distribution coalsrLineAddresses;
- Stats::Distribution *cacheBlockRoundTrip;
-
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
- /**
- * Statistics
- */
- Stats::Scalar shaderActiveTicks;
- Stats::Vector vectorInstSrcOperand;
- Stats::Vector vectorInstDstOperand;
- void regStats();
-
int64_t max_valu_insts;
int64_t total_valu_insts;
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void notifyCuSleep();
+
+ void
+ incVectorInstSrcOperand(int num_operands)
+ {
+ stats.vectorInstSrcOperand[num_operands]++;
+ }
+
+ void
+ incVectorInstDstOperand(int num_operands)
+ {
+ stats.vectorInstDstOperand[num_operands]++;
+ }
+
+ protected:
+ struct ShaderStats : public Stats::Group
+ {
+ ShaderStats(Stats::Group *parent, int wf_size);
+
+ // some stats for measuring latency
+ Stats::Distribution allLatencyDist;
+ Stats::Distribution loadLatencyDist;
+ Stats::Distribution storeLatencyDist;
+
+ // average ticks from vmem inst initiateAcc to coalescer issue
+ Stats::Distribution initToCoalesceLatency;
+
+ // average ticks from coalescer issue to coalescer hit callback
+ Stats::Distribution rubyNetworkLatency;
+
+ // average ticks from coalescer hit callback to GM pipe enqueue
+ Stats::Distribution gmEnqueueLatency;
+
+ // average ticks spent in GM pipe's ordered resp buffer.
+ Stats::Distribution gmToCompleteLatency;
+
+ // average number of cache blocks requested by vmem inst
+ Stats::Distribution coalsrLineAddresses;
+
+ // average ticks for cache blocks to main memory for the Nth
+ // cache block generated by a vmem inst.
+ Stats::Distribution *cacheBlockRoundTrip;
+
+ Stats::Scalar shaderActiveTicks;
+ Stats::Vector vectorInstSrcOperand;
+ Stats::Vector vectorInstDstOperand;
+ } stats;
};
#endif // __SHADER_HH__
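Because the nested stats structs are protected, call sites in other objects go through small accessors such as incVectorInstSrcOperand() above instead of reaching into stats directly. A minimal sketch of that encapsulation pattern, with hypothetical names:

    class Widget
    {
      public:
        Widget(Stats::Group *statParent) : stats(statParent) {}

        // Narrow mutator so collaborators never touch stats directly.
        void incFoo(int n) { stats.foo[n]++; }

      protected:
        struct WidgetStats : public Stats::Group
        {
            WidgetStats(Stats::Group *parent)
                : Stats::Group(parent), ADD_STAT(foo, "foo count per category")
            {
                foo.init(4);
            }

            Stats::Vector foo;
        } stats;
    };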
w->reservedScalarRegs = 0;
w->startSgprIndex = 0;
}
-
-void
-StaticRegisterManagerPolicy::regStats()
-{
-}
int scalarDemand) override;
void freeRegisters(Wavefront *w) override;
-
- void regStats() override;
};
#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__
false, Event::CPU_Tick_Pri),
cleanupEvent([this]{ processCleanupEvent(); },
"Cleanup issuedTranslationsTable hashmap",
- false, Event::Maximum_Pri)
+ false, Event::Maximum_Pri),
+ stats(this)
{
// create the response ports based on the number of connected ports
for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
sender_state->reqCnt.push_back(req_cnt);
// update statistics
- coalescer->uncoalescedAccesses++;
+ coalescer->stats.uncoalescedAccesses++;
req_cnt = sender_state->reqCnt.back();
DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
- coalescer->queuingCycles -= (curTick() * req_cnt);
- coalescer->localqueuingCycles -= curTick();
+ coalescer->stats.queuingCycles -= (curTick() * req_cnt);
+ coalescer->stats.localqueuingCycles -= curTick();
}
// FIXME if you want to coalesce not based on the issueTime
// and make necessary allocations.
if (!coalescedReq_cnt || !didCoalesce) {
if (update_stats)
- coalescer->coalescedAccesses++;
+ coalescer->stats.coalescedAccesses++;
std::vector<PacketPtr> new_array;
new_array.push_back(pkt);
bool update_stats = !sender_state->prefetch;
if (update_stats)
- coalescer->uncoalescedAccesses++;
+ coalescer->stats.uncoalescedAccesses++;
// If there is a pending timing request for this virtual address
// print a warning message. This is a temporary caveat of
// by the one we just sent counting all the way from
// the top of TLB hiearchy (i.e., from the CU)
int req_cnt = tmp_sender_state->reqCnt.back();
- queuingCycles += (curTick() * req_cnt);
+ stats.queuingCycles += (curTick() * req_cnt);
DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
name(), req_cnt);
// pkt_cnt is number of packets we coalesced into the one
// we just sent but only at this coalescer level
int pkt_cnt = iter->second[vector_index].size();
- localqueuingCycles += (curTick() * pkt_cnt);
+ stats.localqueuingCycles += (curTick() * pkt_cnt);
}
DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
}
}
-void
-TLBCoalescer::regStats()
+TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
+ ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
+ ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
+ ADD_STAT(localqueuingCycles,
+ "Number of cycles spent in queue for all incoming reqs"),
+ ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
{
- ClockedObject::regStats();
-
- uncoalescedAccesses
- .name(name() + ".uncoalesced_accesses")
- .desc("Number of uncoalesced TLB accesses")
- ;
-
- coalescedAccesses
- .name(name() + ".coalesced_accesses")
- .desc("Number of coalesced TLB accesses")
- ;
-
- queuingCycles
- .name(name() + ".queuing_cycles")
- .desc("Number of cycles spent in queue")
- ;
-
- localqueuingCycles
- .name(name() + ".local_queuing_cycles")
- .desc("Number of cycles spent in queue for all incoming reqs")
- ;
-
- localLatency
- .name(name() + ".local_latency")
- .desc("Avg. latency over all incoming pkts")
- ;
-
localLatency = localqueuingCycles / uncoalescedAccesses;
}
CoalescingTable issuedTranslationsTable;
- // number of packets the coalescer receives
- Stats::Scalar uncoalescedAccesses;
- // number packets the coalescer send to the TLB
- Stats::Scalar coalescedAccesses;
-
- // Number of cycles the coalesced requests spend waiting in
- // coalescerFIFO. For each packet the coalescer receives we take into
- // account the number of all uncoalesced requests this pkt "represents"
- Stats::Scalar queuingCycles;
-
- // On average how much time a request from the
- // uncoalescedAccesses that reaches the TLB
- // spends waiting?
- Stats::Scalar localqueuingCycles;
- // localqueuingCycles/uncoalescedAccesses
- Stats::Formula localLatency;
-
bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
void updatePhysAddresses(PacketPtr pkt);
- void regStats() override;
class CpuSidePort : public ResponsePort
{
// this FIFO queue keeps track of the virt. page
// addresses that are pending cleanup
std::queue<Addr> cleanupQueue;
+
+ protected:
+ struct TLBCoalescerStats : public Stats::Group
+ {
+ TLBCoalescerStats(Stats::Group *parent);
+
+ // number of packets the coalescer receives
+ Stats::Scalar uncoalescedAccesses;
+ // number of packets the coalescer sends to the TLB
+ Stats::Scalar coalescedAccesses;
+
+ // Number of cycles the coalesced requests spend waiting in
+ // coalescerFIFO. For each packet the coalescer receives we take into
+ // account the number of all uncoalesced requests this pkt "represents"
+ Stats::Scalar queuingCycles;
+
+ // average time a request counted in uncoalescedAccesses
+ // spends waiting before it reaches the TLB
+ Stats::Scalar localqueuingCycles;
+ // localqueuingCycles/uncoalescedAccesses
+ Stats::Formula localLatency;
+ } stats;
};
#endif // __TLB_COALESCER_HH__
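A final note on naming: the identifiers visible in stats.txt change shape under this conversion, since Stats::Group derives them from the owning object rather than from the hand-assembled name() strings deleted above. Roughly, assuming a hypothetical parent object named system.coalescer:

    // Old style:  .name(name() + ".local_queuing_cycles")
    //             -> system.coalescer.local_queuing_cycles
    //
    // New style:  Stats::Group(parent) + ADD_STAT(localqueuingCycles, ...)
    //             -> system.coalescer.localqueuingCycles
    //
    // With a named subgroup, Stats::Group(parent, "SubUnit"):
    //             -> system.coalescer.SubUnit.localqueuingCycles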
->mapVgpr(w, vgprIdx + j);
if (regBusy(pVgpr)) {
if (ii->isDstOperand(i)) {
- w->numTimesBlockedDueWAXDependencies++;
+ w->stats.numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pVgpr);
- w->numTimesBlockedDueRAWDependencies++;
+ w->stats.numTimesBlockedDueRAWDependencies++;
}
return false;
}
{
// increment count of number of DWORDs read from VRF
int DWORDs = ii->numSrcVecDWORDs();
- registerReads += (DWORDs * w->execMask().count());
+ stats.registerReads += (DWORDs * w->execMask().count());
uint64_t mask = w->execMask().to_ullong();
int srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
- sramReads += DWORDs;
+ stats.sramReads += DWORDs;
}
mask = mask >> 4;
}
// increment count of number of DWORDs written to VRF
DWORDs = ii->numDstVecDWORDs();
- registerWrites += (DWORDs * w->execMask().count());
+ stats.registerWrites += (DWORDs * w->execMask().count());
mask = w->execMask().to_ullong();
srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
- sramWrites += DWORDs;
+ stats.sramWrites += DWORDs;
}
mask = mask >> 4;
}
}
// increment count of number of DWORDs written to VRF
int DWORDs = ii->numDstVecDWORDs();
- registerWrites += (DWORDs * ii->exec_mask.count());
+ stats.registerWrites += (DWORDs * ii->exec_mask.count());
uint64_t mask = ii->exec_mask.to_ullong();
int srams = ii->exec_mask.size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
- sramWrites += DWORDs;
+ stats.sramWrites += DWORDs;
}
mask = mask >> 4;
}
maxIbSize(p.max_ib_size), _gpuISA(*this),
vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
- barId(WFBarrier::InvalidID)
+ barId(WFBarrier::InvalidID), stats(this)
{
lastTrace = 0;
execUnitId = -1;
vecReads.clear();
}
-void
-Wavefront::regStats()
-{
- SimObject::regStats();
-
- // FIXME: the name of the WF needs to be unique
- numTimesBlockedDueWAXDependencies
- .name(name() + ".timesBlockedDueWAXDependencies")
- .desc("number of times the wf's instructions are blocked due to WAW "
- "or WAR dependencies")
- ;
-
- // FIXME: the name of the WF needs to be unique
- numTimesBlockedDueRAWDependencies
- .name(name() + ".timesBlockedDueRAWDependencies")
- .desc("number of times the wf's instructions are blocked due to RAW "
- "dependencies")
- ;
-
- numInstrExecuted
- .name(name() + ".num_instr_executed")
- .desc("number of instructions executed by this WF slot")
- ;
-
- schCycles
- .name(name() + ".sch_cycles")
- .desc("number of cycles spent in schedule stage")
- ;
-
- schStalls
- .name(name() + ".sch_stalls")
- .desc("number of cycles WF is stalled in SCH stage")
- ;
-
- schRfAccessStalls
- .name(name() + ".sch_rf_access_stalls")
- .desc("number of cycles wave selected in SCH but RF denied adding "
- "instruction")
- ;
-
- schResourceStalls
- .name(name() + ".sch_resource_stalls")
- .desc("number of cycles stalled in sch by resource not available")
- ;
-
- schOpdNrdyStalls
- .name(name() + ".sch_opd_nrdy_stalls")
- .desc("number of cycles stalled in sch waiting for RF reads to "
- "complete")
- ;
-
- schLdsArbStalls
- .name(name() + ".sch_lds_arb_stalls")
- .desc("number of cycles wave stalled due to LDS-VRF arbitration")
- ;
-
- vecRawDistance
- .init(0,20,1)
- .name(name() + ".vec_raw_distance")
- .desc("Count of RAW distance in dynamic instructions for this WF")
- ;
-
- readsPerWrite
- .init(0,4,1)
- .name(name() + ".vec_reads_per_write")
- .desc("Count of Vector reads per write for this WF")
- ;
-}
-
void
Wavefront::init()
{
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
- computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
- computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
- computeUnit->numInstrExecuted++;
- numInstrExecuted++;
+ computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecOperands());
+ computeUnit->shader->incVectorInstDstOperand(ii->numDstVecOperands());
+ computeUnit->stats.numInstrExecuted++;
+ stats.numInstrExecuted++;
computeUnit->instExecPerSimd[simdId]++;
- computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
- computeUnit->lastExecCycle[simdId]);
- computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
+ computeUnit->stats.execRateDist.sample(
+ computeUnit->stats.totalCycles.value() -
+ computeUnit->lastExecCycle[simdId]);
+ computeUnit->lastExecCycle[simdId] =
+ computeUnit->stats.totalCycles.value();
if (lastInstExec) {
- computeUnit->instInterleave[simdId].
+ computeUnit->stats.instInterleave[simdId].
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
}
lastInstExec = computeUnit->instExecPerSimd[simdId];
if (ii->isSrcOperand(i)) {
// This check should never fail, but to be safe we check
if (rawDist.find(vgpr+n) != rawDist.end()) {
- vecRawDistance.
- sample(numInstrExecuted.value() - rawDist[vgpr+n]);
+ stats.vecRawDistance.sample(
+ stats.numInstrExecuted.value() - rawDist[vgpr+n]);
}
// increment number of reads to this register
vecReads[vgpr+n]++;
// for the first write to each physical register
if (rawDist.find(vgpr+n) != rawDist.end()) {
// sample the number of reads that were performed
- readsPerWrite.sample(vecReads[vgpr+n]);
+ stats.readsPerWrite.sample(vecReads[vgpr+n]);
}
// on a write, reset count of reads to 0
vecReads[vgpr+n] = 0;
- rawDist[vgpr+n] = numInstrExecuted.value();
+ rawDist[vgpr+n] = stats.numInstrExecuted.value();
}
}
}
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
- computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
- computeUnit->numVecOpsExecuted += num_active_lanes;
+ computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
+ computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
"either F32 or F64.");
}
- computeUnit->numVecOpsExecutedF16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
if (ii->isFMA()) {
- computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAC()) {
- computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAD()) {
- computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
fatal("Instruction is tagged as both (1) F32, and (2)"
"either F16 or F64.");
}
- computeUnit->numVecOpsExecutedF32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
if (ii->isFMA()) {
- computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAC()) {
- computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAD()) {
- computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
fatal("Instruction is tagged as both (1) F64, and (2)"
"either F16 or F32.");
}
- computeUnit->numVecOpsExecutedF64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
if (ii->isFMA()) {
- computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAC()) {
- computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAD()) {
- computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
}
if (isGmInstruction(ii)) {
- computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
+ computeUnit->stats.activeLanesPerGMemInstrDist.sample(
+ num_active_lanes);
} else if (isLmInstruction(ii)) {
- computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
+ computeUnit->stats.activeLanesPerLMemInstrDist.sample(
+ num_active_lanes);
}
}
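Note the counting convention in the block above: every numVecOpsExecuted*
counter advances by the number of active lanes in the execution mask, so these
stats measure lane-level operations rather than instructions, and each
FMA/MAC/MAD additionally feeds numVecOpsExecutedTwoOpFP. An illustrative
helper capturing the convention (countVecOp is not part of the patch;
VectorMask is gem5's std::bitset-based lane mask):

    // Advance an op counter by the active-lane count, not by one.
    void
    countVecOp(Stats::Scalar &counter, const VectorMask &mask)
    {
        counter += mask.count();   // std::bitset popcount
    }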
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesVMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
computeUnit->vrf_gm_bus_latency;
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->srf_scm_bus_latency));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesScMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
computeUnit->srf_scm_bus_latency;
}
// GM or Flat as GM Store
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesVMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesScMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
} else if ((ii->isAtomic() || ii->isMemSync()) &&
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesVMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesScMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
// LM or Flat as LM Load
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
computeUnit->vectorSharedMemUnit.
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesLdsPerSimd[simdId] +=
+ computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
computeUnit->vrf_lm_bus_latency;
// LM or Flat as LM Store
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesLdsPerSimd[simdId] +=
+ computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
// LM or Flat as LM, Atomic or MemFence
} else if ((ii->isAtomic() || ii->isMemSync()) &&
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesLdsPerSimd[simdId] +=
+ computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
} else {
panic("Bad instruction type!\n");
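Across the memory-pipe hunks above, the per-SIMD instCyclesVMemPerSimd,
instCyclesScMemPerSimd, and instCyclesLdsPerSimd stats charge one
register-file bus latency for loads and two for stores, atomics, and memory
fences, on whichever bus (vector global, scalar, or LDS) the instruction uses.
A condensed sketch of that charging rule for the vector global bus, with
illustrative locals:

    // One bus transit for a load, two for a store/atomic/fence.
    Cycles busLat = inst->isLoad() ? vrf_gm_bus_latency
                                   : Cycles(2 * vrf_gm_bus_latency);
    stats.instCyclesVMemPerSimd[simdId] += busLat;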
{
barId = WFBarrier::InvalidID;
}
+
+Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(numInstrExecuted,
+ "number of instructions executed by this WF slot"),
+ ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
+ ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
+ ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
+ "RF denied adding instruction"),
+ ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
+ " not available"),
+ ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
+ "RF reads to complete"),
+ ADD_STAT(schLdsArbStalls,
+ "number of cycles wave stalled due to LDS-VRF arbitration"),
+ // FIXME: the name of the WF needs to be unique
+ ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
+ "instructions are blocked due to WAW or WAR dependencies"),
+ // FIXME: the name of the WF needs to be unique
+ ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
+ "instructions are blocked due to RAW dependencies"),
+ ADD_STAT(vecRawDistance,
+ "Count of RAW distance in dynamic instructions for this WF"),
+ ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
+{
+ vecRawDistance.init(0, 20, 1);
+ readsPerWrite.init(0, 4, 1);
+}
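The constructor just added follows the standard Stats::Group recipe: derive
from Stats::Group, forward the parent, name and describe each member with the
ADD_STAT macro from base/stats/group.hh, and configure distribution buckets in
the constructor body now that there is no regStats() hook to do it in. In
isolation, with illustrative names, the recipe looks like:

    #include "base/statistics.hh"
    #include "base/stats/group.hh"

    struct ExampleStats : public Stats::Group
    {
        ExampleStats(Stats::Group *parent)
            : Stats::Group(parent),
              ADD_STAT(numOps, "number of operations executed"),
              ADD_STAT(opDist, "distribution of something per operation")
        {
            // Bucket parameters still come from init(); with regStats()
            // gone, the call moves into the constructor body.
            opDist.init(0, 20, 1);
        }

        Stats::Scalar numOps;
        Stats::Distribution opDist;
    };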
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/compute_unit.hh"
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
- // Wavefront slot stats
-
- // Number of instructions executed by this wavefront slot across all
- // dynamic wavefronts
- Stats::Scalar numInstrExecuted;
-
- // Number of cycles this WF spends in SCH stage
- Stats::Scalar schCycles;
-
- // Number of stall cycles encounterd by this WF in SCH stage
- Stats::Scalar schStalls;
-
- // The following stats sum to the value of schStalls, and record, per
- // WF slot, what the cause of each stall was at a coarse granularity.
-
- // Cycles WF is selected by scheduler, but RFs cannot support instruction
- Stats::Scalar schRfAccessStalls;
- // Cycles spent waiting for execution resources
- Stats::Scalar schResourceStalls;
- // cycles spent waiting for RF reads to complete in SCH stage
- Stats::Scalar schOpdNrdyStalls;
- // LDS arbitration stall cycles. WF attempts to execute LM instruction,
- // but another wave is executing FLAT, which requires LM and GM and forces
- // this WF to stall.
- Stats::Scalar schLdsArbStalls;
-
- // number of times an instruction of a WF is blocked from being issued
- // due to WAR and WAW dependencies
- Stats::Scalar numTimesBlockedDueWAXDependencies;
- // number of times an instruction of a WF is blocked from being issued
- // due to WAR and WAW dependencies
- Stats::Scalar numTimesBlockedDueRAWDependencies;
-
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
- // Distribution to track the distance between producer and consumer
- // for vector register values
- Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
- // Distribution to track the number of times every vector register
- // value produced is consumed.
- Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
- void regStats();
Addr pc() const;
void pc(Addr new_pc);
Addr _pc;
VectorMask _execMask;
int barId;
+
+ public:
+ struct WavefrontStats : public Stats::Group
+ {
+ WavefrontStats(Stats::Group *parent);
+
+ // Number of instructions executed by this wavefront slot across all
+ // dynamic wavefronts
+ Stats::Scalar numInstrExecuted;
+
+ // Number of cycles this WF spends in SCH stage
+ Stats::Scalar schCycles;
+
+        // Number of stall cycles encountered by this WF in SCH stage
+ Stats::Scalar schStalls;
+
+ // The following stats sum to the value of schStalls, and record, per
+ // WF slot, what the cause of each stall was at a coarse granularity.
+
+ // Cycles WF is selected by scheduler, but RFs cannot support
+ // instruction
+ Stats::Scalar schRfAccessStalls;
+ // Cycles spent waiting for execution resources
+ Stats::Scalar schResourceStalls;
+ // cycles spent waiting for RF reads to complete in SCH stage
+ Stats::Scalar schOpdNrdyStalls;
+ // LDS arbitration stall cycles. WF attempts to execute LM instruction,
+ // but another wave is executing FLAT, which requires LM and GM and
+ // forces this WF to stall.
+ Stats::Scalar schLdsArbStalls;
+
+ // number of times an instruction of a WF is blocked from being issued
+ // due to WAR and WAW dependencies
+ Stats::Scalar numTimesBlockedDueWAXDependencies;
+        // number of times an instruction of a WF is blocked from being
+        // issued due to RAW dependencies
+        Stats::Scalar numTimesBlockedDueRAWDependencies;
+
+ // Distribution to track the distance between producer and consumer
+ // for vector register values
+ Stats::Distribution vecRawDistance;
+
+ // Distribution to track the number of times every vector register
+ // value produced is consumed.
+ Stats::Distribution readsPerWrite;
+ } stats;
};
#endif // __GPU_COMPUTE_WAVEFRONT_HH__
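With the group declared as the trailing stats member above, the owning object
constructs it with itself as the parent group and reaches every counter
through it, which is exactly the stats(this) initializer and
stats.numInstrExecuted++ updates elsewhere in this change. A hedged usage
sketch; ExampleObject reuses the illustrative ExampleStats from earlier and is
not part of the patch:

    #include "sim/sim_object.hh"

    class ExampleObject : public SimObject
    {
      public:
        ExampleObject(const Params &p)
            : SimObject(p), stats(this) // reported as "<name>.numOps", etc.
        {}

        void
        tick()
        {
            stats.numOps++;         // formerly a bare member plus regStats()
            stats.opDist.sample(7);
        }

      private:
        ExampleStats stats;
    };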