Convert all gpu-compute stats to Stats::Group style.

Stats that were previously declared as loose members and registered by
hand in regStats() overrides are now collected into per-object
Stats::Group structs (e.g. ComputeUnit::ComputeUnitStats) whose
constructors register each stat via ADD_STAT. The regStats() overrides
and their .name()/.desc() boilerplate are removed; init() calls,
subnames, and formula definitions move into the group constructors.
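For reference, the shape of the conversion (a minimal sketch built
around a hypothetical Widget SimObject; WidgetParams, doWork(), and the
stat names are illustrative, not code from this change):

    #include "base/statistics.hh"
    #include "base/stats/group.hh"
    #include "sim/sim_object.hh"

    class Widget : public SimObject
    {
      public:
        // WidgetParams stands in for the object's generated params type.
        Widget(const WidgetParams &p) : SimObject(p), stats(this) {}

        void
        doWork()
        {
            // Call sites reach the stats through the group member.
            stats.opsCompleted++;
        }

      protected:
        struct WidgetStats : public Stats::Group
        {
            WidgetStats(Stats::Group *parent)
                : Stats::Group(parent),
                  ADD_STAT(opsCompleted, "number of operations completed"),
                  ADD_STAT(activeCycles, "cycles with at least one op"),
                  ADD_STAT(opsPerCycle, "avg. ops per active cycle")
            {
                // Work formerly done in regStats() (init(), subname(),
                // formula wiring) moves into the constructor body.
                opsPerCycle = opsCompleted / activeCycles;
            }

            Stats::Scalar opsCompleted;
            Stats::Scalar activeCycles;
            Stats::Formula opsPerCycle;
        } stats;
    };

ADD_STAT derives each stat's name from the member identifier within the
parent group's scope, which is why the explicit .name() calls go away.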
Change-Id: I29116f1de53ae379210c6cfb5bed3fc74f50cca5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/39135
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matthew Poremba <matthew.poremba@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
wf->computeUnit->cu_id, wf->wgId, refCount);
wf->computeUnit->registerManager->freeRegisters(wf);
- wf->computeUnit->completedWfs++;
+ wf->computeUnit->stats.completedWfs++;
wf->computeUnit->activeWaves--;
panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
for (int i = 0; i < wf->vecReads.size(); i++) {
if (wf->rawDist.find(i) != wf->rawDist.end()) {
- wf->readsPerWrite.sample(wf->vecReads.at(i));
+ wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
}
}
wf->vecReads.clear();
if (!kernelEnd || !relNeeded) {
wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
wf->setStatus(Wavefront::S_STOPPED);
- wf->computeUnit->completedWGs++;
+ wf->computeUnit->stats.completedWGs++;
return;
}
// call shader to prepare the flush operations
wf->computeUnit->shader->prepareFlush(gpuDynInst);
- wf->computeUnit->completedWGs++;
+ wf->computeUnit->stats.completedWGs++;
} else {
wf->computeUnit->shader->dispatcher().scheduleDispatch();
}
_numBarrierSlots(p.num_barrier_slots),
globalSeqNum(0), wavefrontSize(p.wf_size),
scoreboardCheckToSchedule(p),
- scheduleToExecute(p)
+ scheduleToExecute(p),
+ stats(this, p.n_wf)
{
/**
* This check is necessary because std::bitset only provides conversion
w->initRegState(task, w->actualWgSzTotal);
w->start(_n_wave++, task->codeAddr());
- waveLevelParallelism.sample(activeWaves);
+ stats.waveLevelParallelism.sample(activeWaves);
activeWaves++;
}
freeWfSlots, numMappedWfs, vregAvail, sregAvail);
if (!vregAvail) {
- ++numTimesWgBlockedDueVgprAlloc;
+ ++stats.numTimesWgBlockedDueVgprAlloc;
}
if (!sregAvail) {
- ++numTimesWgBlockedDueSgprAlloc;
+ ++stats.numTimesWgBlockedDueSgprAlloc;
}
// Return true if enough WF slots to submit workgroup and if there are
// enough VGPRs to schedule all WFs to their SIMD units
bool ldsAvail = lds.canReserve(task->ldsSize());
if (!ldsAvail) {
- wgBlockedDueLdsAllocation++;
+ stats.wgBlockedDueLdsAllocation++;
}
if (!barrier_avail) {
- wgBlockedDueBarrierAllocation++;
+ stats.wgBlockedDueBarrierAllocation++;
}
// Return true if the following are all true:
scoreboardCheckStage.exec();
fetchStage.exec();
- totalCycles++;
+ stats.totalCycles++;
// Put this CU to sleep if there is no more work to be done.
if (!isDone()) {
fatal("pkt is not a read nor a write\n");
}
- tlbCycles -= curTick();
- ++tlbRequests;
+ stats.tlbCycles -= curTick();
+ ++stats.tlbRequests;
PortID tlbPort_index = perLaneTLB ? index : 0;
// update the hitLevel distribution
int hit_level = translation_state->hitLevel;
assert(hit_level != -1);
- hitsPerTLBLevel[hit_level]++;
+ stats.hitsPerTLBLevel[hit_level]++;
// New SenderState for the memory access
X86ISA::GpuTLB::TranslationState *sender_state =
// for the first cache block.
if (compute_unit->headTailMap.count(gpuDynInst)) {
Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
- compute_unit->headTailLatency.sample(curTick() - headTick);
+ compute_unit->stats.headTailLatency.sample(curTick() - headTick);
compute_unit->headTailMap.erase(gpuDynInst);
}
pkt->req->getVaddr(), line);
assert(pkt->senderState);
- computeUnit->tlbCycles += curTick();
+ computeUnit->stats.tlbCycles += curTick();
// pop off the TLB translation state
X86ISA::GpuTLB::TranslationState *translation_state =
// update the hitLevel distribution
int hit_level = translation_state->hitLevel;
- computeUnit->hitsPerTLBLevel[hit_level]++;
+ computeUnit->stats.hitsPerTLBLevel[hit_level]++;
delete translation_state->tlbEntry;
assert(!translation_state->ports.size());
}
}
-void
-ComputeUnit::regStats()
-{
- ClockedObject::regStats();
-
- vALUInsts
- .name(name() + ".valu_insts")
- .desc("Number of vector ALU insts issued.")
- ;
- vALUInstsPerWF
- .name(name() + ".valu_insts_per_wf")
- .desc("The avg. number of vector ALU insts issued per-wavefront.")
- ;
- sALUInsts
- .name(name() + ".salu_insts")
- .desc("Number of scalar ALU insts issued.")
- ;
- sALUInstsPerWF
- .name(name() + ".salu_insts_per_wf")
- .desc("The avg. number of scalar ALU insts issued per-wavefront.")
- ;
- instCyclesVALU
- .name(name() + ".inst_cycles_valu")
- .desc("Number of cycles needed to execute VALU insts.")
- ;
- instCyclesSALU
- .name(name() + ".inst_cycles_salu")
- .desc("Number of cycles needed to execute SALU insts.")
- ;
- threadCyclesVALU
- .name(name() + ".thread_cycles_valu")
- .desc("Number of thread cycles used to execute vector ALU ops. "
- "Similar to instCyclesVALU but multiplied by the number of "
- "active threads.")
- ;
- vALUUtilization
- .name(name() + ".valu_utilization")
- .desc("Percentage of active vector ALU threads in a wave.")
- ;
- ldsNoFlatInsts
- .name(name() + ".lds_no_flat_insts")
- .desc("Number of LDS insts issued, not including FLAT "
- "accesses that resolve to LDS.")
- ;
- ldsNoFlatInstsPerWF
- .name(name() + ".lds_no_flat_insts_per_wf")
- .desc("The avg. number of LDS insts (not including FLAT "
- "accesses that resolve to LDS) per-wavefront.")
- ;
- flatVMemInsts
- .name(name() + ".flat_vmem_insts")
- .desc("The number of FLAT insts that resolve to vmem issued.")
- ;
- flatVMemInstsPerWF
- .name(name() + ".flat_vmem_insts_per_wf")
- .desc("The average number of FLAT insts that resolve to vmem "
- "issued per-wavefront.")
- ;
- flatLDSInsts
- .name(name() + ".flat_lds_insts")
- .desc("The number of FLAT insts that resolve to LDS issued.")
- ;
- flatLDSInstsPerWF
- .name(name() + ".flat_lds_insts_per_wf")
- .desc("The average number of FLAT insts that resolve to LDS "
- "issued per-wavefront.")
- ;
- vectorMemWrites
- .name(name() + ".vector_mem_writes")
- .desc("Number of vector mem write insts (excluding FLAT insts).")
- ;
- vectorMemWritesPerWF
- .name(name() + ".vector_mem_writes_per_wf")
- .desc("The average number of vector mem write insts "
- "(excluding FLAT insts) per-wavefront.")
- ;
- vectorMemReads
- .name(name() + ".vector_mem_reads")
- .desc("Number of vector mem read insts (excluding FLAT insts).")
- ;
- vectorMemReadsPerWF
- .name(name() + ".vector_mem_reads_per_wf")
- .desc("The avg. number of vector mem read insts (excluding "
- "FLAT insts) per-wavefront.")
- ;
- scalarMemWrites
- .name(name() + ".scalar_mem_writes")
- .desc("Number of scalar mem write insts.")
- ;
- scalarMemWritesPerWF
- .name(name() + ".scalar_mem_writes_per_wf")
- .desc("The average number of scalar mem write insts per-wavefront.")
- ;
- scalarMemReads
- .name(name() + ".scalar_mem_reads")
- .desc("Number of scalar mem read insts.")
- ;
- scalarMemReadsPerWF
- .name(name() + ".scalar_mem_reads_per_wf")
- .desc("The average number of scalar mem read insts per-wavefront.")
- ;
-
- vALUInstsPerWF = vALUInsts / completedWfs;
- sALUInstsPerWF = sALUInsts / completedWfs;
- vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
- ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
- flatVMemInstsPerWF = flatVMemInsts / completedWfs;
- flatLDSInstsPerWF = flatLDSInsts / completedWfs;
- vectorMemWritesPerWF = vectorMemWrites / completedWfs;
- vectorMemReadsPerWF = vectorMemReads / completedWfs;
- scalarMemWritesPerWF = scalarMemWrites / completedWfs;
- scalarMemReadsPerWF = scalarMemReads / completedWfs;
-
- vectorMemReadsPerKiloInst
- .name(name() + ".vector_mem_reads_per_kilo_inst")
- .desc("Number of vector mem reads per kilo-instruction")
- ;
- vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
- vectorMemWritesPerKiloInst
- .name(name() + ".vector_mem_writes_per_kilo_inst")
- .desc("Number of vector mem writes per kilo-instruction")
- ;
- vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
- vectorMemInstsPerKiloInst
- .name(name() + ".vector_mem_insts_per_kilo_inst")
- .desc("Number of vector mem insts per kilo-instruction")
- ;
- vectorMemInstsPerKiloInst =
- ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
- scalarMemReadsPerKiloInst
- .name(name() + ".scalar_mem_reads_per_kilo_inst")
- .desc("Number of scalar mem reads per kilo-instruction")
- ;
- scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
- scalarMemWritesPerKiloInst
- .name(name() + ".scalar_mem_writes_per_kilo_inst")
- .desc("Number of scalar mem writes per kilo-instruction")
- ;
- scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
- scalarMemInstsPerKiloInst
- .name(name() + ".scalar_mem_insts_per_kilo_inst")
- .desc("Number of scalar mem insts per kilo-instruction")
- ;
- scalarMemInstsPerKiloInst =
- ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
-
- instCyclesVMemPerSimd
- .init(numVectorALUs)
- .name(name() + ".inst_cycles_vector_memory")
- .desc("Number of cycles to send address, command, data from VRF to "
- "vector memory unit, per SIMD")
- ;
-
- instCyclesScMemPerSimd
- .init(numVectorALUs)
- .name(name() + ".inst_cycles_scalar_memory")
- .desc("Number of cycles to send address, command, data from SRF to "
- "scalar memory unit, per SIMD")
- ;
-
- instCyclesLdsPerSimd
- .init(numVectorALUs)
- .name(name() + ".inst_cycles_lds")
- .desc("Number of cycles to send address, command, data from VRF to "
- "LDS unit, per SIMD")
- ;
-
- globalReads
- .name(name() + ".global_mem_reads")
- .desc("Number of reads to the global segment")
- ;
- globalWrites
- .name(name() + ".global_mem_writes")
- .desc("Number of writes to the global segment")
- ;
- globalMemInsts
- .name(name() + ".global_mem_insts")
- .desc("Number of memory instructions sent to the global segment")
- ;
- globalMemInsts = globalReads + globalWrites;
- argReads
- .name(name() + ".arg_reads")
- .desc("Number of reads to the arg segment")
- ;
- argWrites
- .name(name() + ".arg_writes")
- .desc("NUmber of writes to the arg segment")
- ;
- argMemInsts
- .name(name() + ".arg_mem_insts")
- .desc("Number of memory instructions sent to the arg segment")
- ;
- argMemInsts = argReads + argWrites;
- spillReads
- .name(name() + ".spill_reads")
- .desc("Number of reads to the spill segment")
- ;
- spillWrites
- .name(name() + ".spill_writes")
- .desc("Number of writes to the spill segment")
- ;
- spillMemInsts
- .name(name() + ".spill_mem_insts")
- .desc("Number of memory instructions sent to the spill segment")
- ;
- spillMemInsts = spillReads + spillWrites;
- groupReads
- .name(name() + ".group_reads")
- .desc("Number of reads to the group segment")
- ;
- groupWrites
- .name(name() + ".group_writes")
- .desc("Number of writes to the group segment")
- ;
- groupMemInsts
- .name(name() + ".group_mem_insts")
- .desc("Number of memory instructions sent to the group segment")
- ;
- groupMemInsts = groupReads + groupWrites;
- privReads
- .name(name() + ".private_reads")
- .desc("Number of reads to the private segment")
- ;
- privWrites
- .name(name() + ".private_writes")
- .desc("Number of writes to the private segment")
- ;
- privMemInsts
- .name(name() + ".private_mem_insts")
- .desc("Number of memory instructions sent to the private segment")
- ;
- privMemInsts = privReads + privWrites;
- readonlyReads
- .name(name() + ".readonly_reads")
- .desc("Number of reads to the readonly segment")
- ;
- readonlyWrites
- .name(name() + ".readonly_writes")
- .desc("Number of memory instructions sent to the readonly segment")
- ;
- readonlyMemInsts
- .name(name() + ".readonly_mem_insts")
- .desc("Number of memory instructions sent to the readonly segment")
- ;
- readonlyMemInsts = readonlyReads + readonlyWrites;
- kernargReads
- .name(name() + ".kernarg_reads")
- .desc("Number of reads sent to the kernarg segment")
- ;
- kernargWrites
- .name(name() + ".kernarg_writes")
- .desc("Number of memory instructions sent to the kernarg segment")
- ;
- kernargMemInsts
- .name(name() + ".kernarg_mem_insts")
- .desc("Number of memory instructions sent to the kernarg segment")
- ;
- kernargMemInsts = kernargReads + kernargWrites;
-
- tlbCycles
- .name(name() + ".tlb_cycles")
- .desc("total number of cycles for all uncoalesced requests")
- ;
-
- tlbRequests
- .name(name() + ".tlb_requests")
- .desc("number of uncoalesced requests")
- ;
-
- tlbLatency
- .name(name() + ".avg_translation_latency")
- .desc("Avg. translation latency for data translations")
- ;
-
- tlbLatency = tlbCycles / tlbRequests;
-
- hitsPerTLBLevel
- .init(4)
- .name(name() + ".TLB_hits_distribution")
- .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
- ;
-
- // fixed number of TLB levels
- for (int i = 0; i < 4; ++i) {
- if (!i)
- hitsPerTLBLevel.subname(i,"page_table");
- else
- hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
- }
-
- execRateDist
- .init(0, 10, 2)
- .name(name() + ".inst_exec_rate")
- .desc("Instruction Execution Rate: Number of executed vector "
- "instructions per cycle")
- ;
-
- ldsBankConflictDist
- .init(0, wfSize(), 2)
- .name(name() + ".lds_bank_conflicts")
- .desc("Number of bank conflicts per LDS memory packet")
- ;
-
- ldsBankAccesses
- .name(name() + ".lds_bank_access_cnt")
- .desc("Total number of LDS bank accesses")
- ;
-
- pageDivergenceDist
- // A wavefront can touch up to N pages per memory instruction where
- // N is equal to the wavefront size
- // The number of pages per bin can be configured (here it's 4).
- .init(1, wfSize(), 4)
- .name(name() + ".page_divergence_dist")
- .desc("pages touched per wf (over all mem. instr.)")
- ;
-
- controlFlowDivergenceDist
- .init(1, wfSize(), 4)
- .name(name() + ".warp_execution_dist")
- .desc("number of lanes active per instruction (oval all instructions)")
- ;
-
- activeLanesPerGMemInstrDist
- .init(1, wfSize(), 4)
- .name(name() + ".gmem_lanes_execution_dist")
- .desc("number of active lanes per global memory instruction")
- ;
-
- activeLanesPerLMemInstrDist
- .init(1, wfSize(), 4)
- .name(name() + ".lmem_lanes_execution_dist")
- .desc("number of active lanes per local memory instruction")
- ;
-
- numInstrExecuted
- .name(name() + ".num_instr_executed")
- .desc("number of instructions executed")
- ;
-
- numVecOpsExecuted
- .name(name() + ".num_vec_ops_executed")
- .desc("number of vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedF16
- .name(name() + ".num_vec_ops_f16_executed")
- .desc("number of f16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedF32
- .name(name() + ".num_vec_ops_f32_executed")
- .desc("number of f32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedF64
- .name(name() + ".num_vec_ops_f64_executed")
- .desc("number of f64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedFMA16
- .name(name() + ".num_vec_ops_fma16_executed")
- .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedFMA32
- .name(name() + ".num_vec_ops_fma32_executed")
- .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedFMA64
- .name(name() + ".num_vec_ops_fma64_executed")
- .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAD16
- .name(name() + ".num_vec_ops_mad16_executed")
- .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAD32
- .name(name() + ".num_vec_ops_mad32_executed")
- .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAD64
- .name(name() + ".num_vec_ops_mad64_executed")
- .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAC16
- .name(name() + ".num_vec_ops_mac16_executed")
- .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAC32
- .name(name() + ".num_vec_ops_mac32_executed")
- .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedMAC64
- .name(name() + ".num_vec_ops_mac64_executed")
- .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
- ;
-
- numVecOpsExecutedTwoOpFP
- .name(name() + ".num_vec_ops_two_op_fp_executed")
- .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
- ;
-
- totalCycles
- .name(name() + ".num_total_cycles")
- .desc("number of cycles the CU ran for")
- ;
-
- ipc
- .name(name() + ".ipc")
- .desc("Instructions per cycle (this CU only)")
- ;
-
- vpc
- .name(name() + ".vpc")
- .desc("Vector Operations per cycle (this CU only)")
- ;
-
- vpc_f16
- .name(name() + ".vpc_f16")
- .desc("F16 Vector Operations per cycle (this CU only)")
- ;
-
- vpc_f32
- .name(name() + ".vpc_f32")
- .desc("F32 Vector Operations per cycle (this CU only)")
- ;
-
- vpc_f64
- .name(name() + ".vpc_f64")
- .desc("F64 Vector Operations per cycle (this CU only)")
- ;
-
- numALUInstsExecuted
- .name(name() + ".num_alu_insts_executed")
- .desc("Number of dynamic non-GM memory insts executed")
- ;
-
- wgBlockedDueBarrierAllocation
- .name(name() + ".wg_blocked_due_barrier_alloc")
- .desc("WG dispatch was blocked due to lack of barrier resources")
- ;
-
- wgBlockedDueLdsAllocation
- .name(name() + ".wg_blocked_due_lds_alloc")
- .desc("Workgroup blocked due to LDS capacity")
- ;
-
- ipc = numInstrExecuted / totalCycles;
- vpc = numVecOpsExecuted / totalCycles;
- vpc_f16 = numVecOpsExecutedF16 / totalCycles;
- vpc_f32 = numVecOpsExecutedF32 / totalCycles;
- vpc_f64 = numVecOpsExecutedF64 / totalCycles;
-
- numTimesWgBlockedDueVgprAlloc
- .name(name() + ".times_wg_blocked_due_vgpr_alloc")
- .desc("Number of times WGs are blocked due to VGPR allocation per "
- "SIMD")
- ;
-
- numTimesWgBlockedDueSgprAlloc
- .name(name() + ".times_wg_blocked_due_sgpr_alloc")
- .desc("Number of times WGs are blocked due to SGPR allocation per "
- "SIMD")
- ;
-
- dynamicGMemInstrCnt
- .name(name() + ".global_mem_instr_cnt")
- .desc("dynamic non-flat global memory instruction count")
- ;
-
- dynamicFlatMemInstrCnt
- .name(name() + ".flat_global_mem_instr_cnt")
- .desc("dynamic flat global memory instruction count")
- ;
-
- dynamicLMemInstrCnt
- .name(name() + ".local_mem_instr_cnt")
- .desc("dynamic local memory intruction count")
- ;
-
- numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
- dynamicLMemInstrCnt;
-
- completedWfs
- .name(name() + ".num_completed_wfs")
- .desc("number of completed wavefronts")
- ;
-
- completedWGs
- .name(name() + ".num_completed_wgs")
- .desc("number of completed workgroups")
- ;
-
- numCASOps
- .name(name() + ".num_CAS_ops")
- .desc("number of compare and swap operations")
- ;
-
- numFailedCASOps
- .name(name() + ".num_failed_CAS_ops")
- .desc("number of compare and swap operations that failed")
- ;
-
- headTailLatency
- .init(0, 1000000, 10000)
- .name(name() + ".head_tail_latency")
- .desc("ticks between first and last cache block arrival at coalescer")
- .flags(Stats::pdf | Stats::oneline)
- ;
-
- waveLevelParallelism
- .init(0, shader->n_wf * numVectorALUs, 1)
- .name(name() + ".wlp")
- .desc("wave level parallelism: count of active waves at wave launch")
- ;
-
- instInterleave
- .init(numVectorALUs, 0, 20, 1)
- .name(name() + ".interleaving")
- .desc("Measure of instruction interleaving per SIMD")
- ;
-
- // register stats of pipeline stages
- fetchStage.regStats();
- scoreboardCheckStage.regStats();
- scheduleStage.regStats();
- execStage.regStats();
-
- // register stats of memory pipelines
- globalMemoryPipe.regStats();
- localMemoryPipe.regStats();
- scalarMemoryPipe.regStats();
-
- registerManager->regStats();
-}
-
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->isScalar()) {
if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
- sALUInsts++;
- instCyclesSALU++;
+ stats.sALUInsts++;
+ stats.instCyclesSALU++;
} else if (gpuDynInst->isLoad()) {
- scalarMemReads++;
+ stats.scalarMemReads++;
} else if (gpuDynInst->isStore()) {
- scalarMemWrites++;
+ stats.scalarMemWrites++;
}
} else {
if (gpuDynInst->isALU()) {
if (shader->total_valu_insts == shader->max_valu_insts) {
exitSimLoop("max vALU insts");
}
- vALUInsts++;
- instCyclesVALU++;
- threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
+ stats.vALUInsts++;
+ stats.instCyclesVALU++;
+ stats.threadCyclesVALU
+ += gpuDynInst->wavefront()->execMask().count();
} else if (gpuDynInst->isFlat()) {
if (gpuDynInst->isLocalMem()) {
- flatLDSInsts++;
+ stats.flatLDSInsts++;
} else {
- flatVMemInsts++;
+ stats.flatVMemInsts++;
}
} else if (gpuDynInst->isLocalMem()) {
- ldsNoFlatInsts++;
+ stats.ldsNoFlatInsts++;
} else if (gpuDynInst->isLoad()) {
- vectorMemReads++;
+ stats.vectorMemReads++;
} else if (gpuDynInst->isStore()) {
- vectorMemWrites++;
+ stats.vectorMemWrites++;
}
if (gpuDynInst->isLoad()) {
switch (gpuDynInst->executedAs()) {
case Enums::SC_SPILL:
- spillReads++;
+ stats.spillReads++;
break;
case Enums::SC_GLOBAL:
- globalReads++;
+ stats.globalReads++;
break;
case Enums::SC_GROUP:
- groupReads++;
+ stats.groupReads++;
break;
case Enums::SC_PRIVATE:
- privReads++;
+ stats.privReads++;
break;
case Enums::SC_READONLY:
- readonlyReads++;
+ stats.readonlyReads++;
break;
case Enums::SC_KERNARG:
- kernargReads++;
+ stats.kernargReads++;
break;
case Enums::SC_ARG:
- argReads++;
+ stats.argReads++;
break;
case Enums::SC_NONE:
/**
} else if (gpuDynInst->isStore()) {
switch (gpuDynInst->executedAs()) {
case Enums::SC_SPILL:
- spillWrites++;
+ stats.spillWrites++;
break;
case Enums::SC_GLOBAL:
- globalWrites++;
+ stats.globalWrites++;
break;
case Enums::SC_GROUP:
- groupWrites++;
+ stats.groupWrites++;
break;
case Enums::SC_PRIVATE:
- privWrites++;
+ stats.privWrites++;
break;
case Enums::SC_READONLY:
- readonlyWrites++;
+ stats.readonlyWrites++;
break;
case Enums::SC_KERNARG:
- kernargWrites++;
+ stats.kernargWrites++;
break;
case Enums::SC_ARG:
- argWrites++;
+ stats.argWrites++;
break;
case Enums::SC_NONE:
/**
}
}
}
+
+ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
+ : Stats::Group(parent),
+ ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
+ ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
+ "per-wavefront."),
+ ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
+ ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
+ "per-wavefront."),
+ ADD_STAT(instCyclesVALU,
+ "Number of cycles needed to execute VALU insts."),
+ ADD_STAT(instCyclesSALU,
+ "Number of cycles needed to execute SALU insts."),
+ ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
+ "vector ALU ops. Similar to instCyclesVALU but multiplied by "
+ "the number of active threads."),
+ ADD_STAT(vALUUtilization,
+ "Percentage of active vector ALU threads in a wave."),
+ ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
+ " accesses that resolve to LDS."),
+ ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
+ "including FLAT accesses that resolve to LDS) per-wavefront."),
+ ADD_STAT(flatVMemInsts,
+ "The number of FLAT insts that resolve to vmem issued."),
+ ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
+ "resolve to vmem issued per-wavefront."),
+ ADD_STAT(flatLDSInsts,
+ "The number of FLAT insts that resolve to LDS issued."),
+ ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
+ "resolve to LDS issued per-wavefront."),
+ ADD_STAT(vectorMemWrites,
+ "Number of vector mem write insts (excluding FLAT insts)."),
+ ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
+ "insts (excluding FLAT insts) per-wavefront."),
+ ADD_STAT(vectorMemReads,
+ "Number of vector mem read insts (excluding FLAT insts)."),
+ ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
+ "(excluding FLAT insts) per-wavefront."),
+ ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
+ ADD_STAT(scalarMemWritesPerWF,
+ "The average number of scalar mem write insts per-wavefront."),
+ ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
+ ADD_STAT(scalarMemReadsPerWF,
+ "The average number of scalar mem read insts per-wavefront."),
+ ADD_STAT(vectorMemReadsPerKiloInst,
+ "Number of vector mem reads per kilo-instruction"),
+ ADD_STAT(vectorMemWritesPerKiloInst,
+ "Number of vector mem writes per kilo-instruction"),
+ ADD_STAT(vectorMemInstsPerKiloInst,
+ "Number of vector mem insts per kilo-instruction"),
+ ADD_STAT(scalarMemReadsPerKiloInst,
+ "Number of scalar mem reads per kilo-instruction"),
+ ADD_STAT(scalarMemWritesPerKiloInst,
+ "Number of scalar mem writes per kilo-instruction"),
+ ADD_STAT(scalarMemInstsPerKiloInst,
+ "Number of scalar mem insts per kilo-instruction"),
+ ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
+ "command, data from VRF to vector memory unit, per SIMD"),
+ ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
+ "command, data from SRF to scalar memory unit, per SIMD"),
+ ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
+ "command, data from VRF to LDS unit, per SIMD"),
+ ADD_STAT(globalReads, "Number of reads to the global segment"),
+ ADD_STAT(globalWrites, "Number of writes to the global segment"),
+ ADD_STAT(globalMemInsts,
+ "Number of memory instructions sent to the global segment"),
+ ADD_STAT(argReads, "Number of reads to the arg segment"),
+    ADD_STAT(argWrites, "Number of writes to the arg segment"),
+ ADD_STAT(argMemInsts,
+ "Number of memory instructions sent to the arg segment"),
+ ADD_STAT(spillReads, "Number of reads to the spill segment"),
+ ADD_STAT(spillWrites, "Number of writes to the spill segment"),
+ ADD_STAT(spillMemInsts,
+ "Number of memory instructions sent to the spill segment"),
+ ADD_STAT(groupReads, "Number of reads to the group segment"),
+ ADD_STAT(groupWrites, "Number of writes to the group segment"),
+ ADD_STAT(groupMemInsts,
+ "Number of memory instructions sent to the group segment"),
+ ADD_STAT(privReads, "Number of reads to the private segment"),
+ ADD_STAT(privWrites, "Number of writes to the private segment"),
+ ADD_STAT(privMemInsts,
+ "Number of memory instructions sent to the private segment"),
+ ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
+    ADD_STAT(readonlyWrites, "Number of writes to the readonly segment"),
+ ADD_STAT(readonlyMemInsts,
+ "Number of memory instructions sent to the readonly segment"),
+    ADD_STAT(kernargReads, "Number of reads to the kernarg segment"),
+    ADD_STAT(kernargWrites, "Number of writes to the kernarg segment"),
+ ADD_STAT(kernargMemInsts,
+ "Number of memory instructions sent to the kernarg segment"),
+ ADD_STAT(waveLevelParallelism,
+ "wave level parallelism: count of active waves at wave launch"),
+ ADD_STAT(tlbRequests, "number of uncoalesced requests"),
+ ADD_STAT(tlbCycles,
+ "total number of cycles for all uncoalesced requests"),
+ ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
+ ADD_STAT(hitsPerTLBLevel,
+ "TLB hits distribution (0 for page table, x for Lx-TLB)"),
+ ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
+ ADD_STAT(ldsBankConflictDist,
+ "Number of bank conflicts per LDS memory packet"),
+ ADD_STAT(pageDivergenceDist,
+ "pages touched per wf (over all mem. instr.)"),
+ ADD_STAT(dynamicGMemInstrCnt,
+ "dynamic non-flat global memory instruction count"),
+ ADD_STAT(dynamicFlatMemInstrCnt,
+ "dynamic flat global memory instruction count"),
+    ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
+ ADD_STAT(wgBlockedDueBarrierAllocation,
+ "WG dispatch was blocked due to lack of barrier resources"),
+ ADD_STAT(wgBlockedDueLdsAllocation,
+ "Workgroup blocked due to LDS capacity"),
+ ADD_STAT(numInstrExecuted, "number of instructions executed"),
+ ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
+ "vector instructions per cycle"),
+ ADD_STAT(numVecOpsExecuted,
+ "number of vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedF16,
+ "number of f16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedF32,
+ "number of f32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedF64,
+ "number of f64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedFMA16,
+ "number of fma16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedFMA32,
+ "number of fma32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedFMA64,
+ "number of fma64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAC16,
+ "number of mac16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAC32,
+ "number of mac32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAC64,
+ "number of mac64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAD16,
+ "number of mad16 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAD32,
+ "number of mad32 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedMAD64,
+ "number of mad64 vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(numVecOpsExecutedTwoOpFP,
+ "number of two op FP vec ops executed (e.g. WF size/inst)"),
+ ADD_STAT(totalCycles, "number of cycles the CU ran for"),
+ ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
+ ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
+ ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
+ ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
+ ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
+ ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
+ "instruction (over all instructions)"),
+ ADD_STAT(activeLanesPerGMemInstrDist,
+ "number of active lanes per global memory instruction"),
+ ADD_STAT(activeLanesPerLMemInstrDist,
+ "number of active lanes per local memory instruction"),
+ ADD_STAT(numALUInstsExecuted,
+ "Number of dynamic non-GM memory insts executed"),
+ ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
+ "blocked due to VGPR allocation per SIMD"),
+ ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
+ "blocked due to SGPR allocation per SIMD"),
+ ADD_STAT(numCASOps, "number of compare and swap operations"),
+ ADD_STAT(numFailedCASOps,
+ "number of compare and swap operations that failed"),
+ ADD_STAT(completedWfs, "number of completed wavefronts"),
+ ADD_STAT(completedWGs, "number of completed workgroups"),
+ ADD_STAT(headTailLatency, "ticks between first and last cache block "
+ "arrival at coalescer"),
+ ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
+{
+ ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
+
+ instCyclesVMemPerSimd.init(cu->numVectorALUs);
+ instCyclesScMemPerSimd.init(cu->numVectorALUs);
+ instCyclesLdsPerSimd.init(cu->numVectorALUs);
+
+ hitsPerTLBLevel.init(4);
+ execRateDist.init(0, 10, 2);
+ ldsBankConflictDist.init(0, cu->wfSize(), 2);
+
+ pageDivergenceDist.init(1, cu->wfSize(), 4);
+ controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
+ activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
+ activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
+
+ headTailLatency.init(0, 1000000, 10000).flags(Stats::pdf | Stats::oneline);
+ waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
+ instInterleave.init(cu->numVectorALUs, 0, 20, 1);
+
+ vALUInstsPerWF = vALUInsts / completedWfs;
+ sALUInstsPerWF = sALUInsts / completedWfs;
+ vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
+ ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
+ flatVMemInstsPerWF = flatVMemInsts / completedWfs;
+ flatLDSInstsPerWF = flatLDSInsts / completedWfs;
+ vectorMemWritesPerWF = vectorMemWrites / completedWfs;
+ vectorMemReadsPerWF = vectorMemReads / completedWfs;
+ scalarMemWritesPerWF = scalarMemWrites / completedWfs;
+ scalarMemReadsPerWF = scalarMemReads / completedWfs;
+
+ vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
+ vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
+ vectorMemInstsPerKiloInst =
+ ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
+ scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
+ scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
+ scalarMemInstsPerKiloInst =
+ ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
+
+ globalMemInsts = globalReads + globalWrites;
+ argMemInsts = argReads + argWrites;
+ spillMemInsts = spillReads + spillWrites;
+ groupMemInsts = groupReads + groupWrites;
+ privMemInsts = privReads + privWrites;
+ readonlyMemInsts = readonlyReads + readonlyWrites;
+ kernargMemInsts = kernargReads + kernargWrites;
+
+ tlbLatency = tlbCycles / tlbRequests;
+
+ // fixed number of TLB levels
+ for (int i = 0; i < 4; ++i) {
+ if (!i)
+ hitsPerTLBLevel.subname(i,"page_table");
+            hitsPerTLBLevel.subname(i, "page_table");
+        else
+            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
+
+ ipc = numInstrExecuted / totalCycles;
+ vpc = numVecOpsExecuted / totalCycles;
+ vpc_f16 = numVecOpsExecutedF16 / totalCycles;
+ vpc_f32 = numVecOpsExecutedF32 / totalCycles;
+ vpc_f64 = numVecOpsExecutedF64 / totalCycles;
+
+ numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
+ dynamicLMemInstrCnt;
+}
#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
- // Track the amount of interleaving between wavefronts on each SIMD.
- // This stat is sampled using instExecPerSimd to compute the number of
- // instructions that have been executed on a SIMD between a WF executing
- // two successive instructions.
- Stats::VectorDistribution instInterleave;
-
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
LdsState &lds;
public:
- Stats::Scalar vALUInsts;
- Stats::Formula vALUInstsPerWF;
- Stats::Scalar sALUInsts;
- Stats::Formula sALUInstsPerWF;
- Stats::Scalar instCyclesVALU;
- Stats::Scalar instCyclesSALU;
- Stats::Scalar threadCyclesVALU;
- Stats::Formula vALUUtilization;
- Stats::Scalar ldsNoFlatInsts;
- Stats::Formula ldsNoFlatInstsPerWF;
- Stats::Scalar flatVMemInsts;
- Stats::Formula flatVMemInstsPerWF;
- Stats::Scalar flatLDSInsts;
- Stats::Formula flatLDSInstsPerWF;
- Stats::Scalar vectorMemWrites;
- Stats::Formula vectorMemWritesPerWF;
- Stats::Scalar vectorMemReads;
- Stats::Formula vectorMemReadsPerWF;
- Stats::Scalar scalarMemWrites;
- Stats::Formula scalarMemWritesPerWF;
- Stats::Scalar scalarMemReads;
- Stats::Formula scalarMemReadsPerWF;
-
- Stats::Formula vectorMemReadsPerKiloInst;
- Stats::Formula vectorMemWritesPerKiloInst;
- Stats::Formula vectorMemInstsPerKiloInst;
- Stats::Formula scalarMemReadsPerKiloInst;
- Stats::Formula scalarMemWritesPerKiloInst;
- Stats::Formula scalarMemInstsPerKiloInst;
-
- // Cycles required to send register source (addr and data) from
- // register files to memory pipeline, per SIMD.
- Stats::Vector instCyclesVMemPerSimd;
- Stats::Vector instCyclesScMemPerSimd;
- Stats::Vector instCyclesLdsPerSimd;
-
- Stats::Scalar globalReads;
- Stats::Scalar globalWrites;
- Stats::Formula globalMemInsts;
- Stats::Scalar argReads;
- Stats::Scalar argWrites;
- Stats::Formula argMemInsts;
- Stats::Scalar spillReads;
- Stats::Scalar spillWrites;
- Stats::Formula spillMemInsts;
- Stats::Scalar groupReads;
- Stats::Scalar groupWrites;
- Stats::Formula groupMemInsts;
- Stats::Scalar privReads;
- Stats::Scalar privWrites;
- Stats::Formula privMemInsts;
- Stats::Scalar readonlyReads;
- Stats::Scalar readonlyWrites;
- Stats::Formula readonlyMemInsts;
- Stats::Scalar kernargReads;
- Stats::Scalar kernargWrites;
- Stats::Formula kernargMemInsts;
-
- int activeWaves;
- Stats::Distribution waveLevelParallelism;
-
- void updateInstStats(GPUDynInstPtr gpuDynInst);
-
- // the following stats compute the avg. TLB accesslatency per
- // uncoalesced request (only for data)
- Stats::Scalar tlbRequests;
- Stats::Scalar tlbCycles;
- Stats::Formula tlbLatency;
- // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
- Stats::Vector hitsPerTLBLevel;
-
- Stats::Scalar ldsBankAccesses;
- Stats::Distribution ldsBankConflictDist;
-
- // over all memory instructions executed over all wavefronts
- // how many touched 0-4 pages, 4-8, ..., 60-64 pages
- Stats::Distribution pageDivergenceDist;
- // count of non-flat global memory vector instructions executed
- Stats::Scalar dynamicGMemInstrCnt;
- // count of flat global memory vector instructions executed
- Stats::Scalar dynamicFlatMemInstrCnt;
- Stats::Scalar dynamicLMemInstrCnt;
-
- Stats::Scalar wgBlockedDueBarrierAllocation;
- Stats::Scalar wgBlockedDueLdsAllocation;
- // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
- // active when the instruction is committed, this number is still
- // incremented by 1
- Stats::Scalar numInstrExecuted;
- // Number of cycles among successive instruction executions across all
- // wavefronts of the same CU
- Stats::Distribution execRateDist;
- // number of individual vector operations executed
- Stats::Scalar numVecOpsExecuted;
- // number of individual f16 vector operations executed
- Stats::Scalar numVecOpsExecutedF16;
- // number of individual f32 vector operations executed
- Stats::Scalar numVecOpsExecutedF32;
- // number of individual f64 vector operations executed
- Stats::Scalar numVecOpsExecutedF64;
- // number of individual FMA 16,32,64 vector operations executed
- Stats::Scalar numVecOpsExecutedFMA16;
- Stats::Scalar numVecOpsExecutedFMA32;
- Stats::Scalar numVecOpsExecutedFMA64;
- // number of individual MAC 16,32,64 vector operations executed
- Stats::Scalar numVecOpsExecutedMAC16;
- Stats::Scalar numVecOpsExecutedMAC32;
- Stats::Scalar numVecOpsExecutedMAC64;
- // number of individual MAD 16,32,64 vector operations executed
- Stats::Scalar numVecOpsExecutedMAD16;
- Stats::Scalar numVecOpsExecutedMAD32;
- Stats::Scalar numVecOpsExecutedMAD64;
- // total number of two op FP vector operations executed
- Stats::Scalar numVecOpsExecutedTwoOpFP;
- // Total cycles that something is running on the GPU
- Stats::Scalar totalCycles;
- Stats::Formula vpc; // vector ops per cycle
- Stats::Formula vpc_f16; // vector ops per cycle
- Stats::Formula vpc_f32; // vector ops per cycle
- Stats::Formula vpc_f64; // vector ops per cycle
- Stats::Formula ipc; // vector instructions per cycle
- Stats::Distribution controlFlowDivergenceDist;
- Stats::Distribution activeLanesPerGMemInstrDist;
- Stats::Distribution activeLanesPerLMemInstrDist;
- // number of vector ALU instructions received
- Stats::Formula numALUInstsExecuted;
- // number of times a WG can not start due to lack of free VGPRs in SIMDs
- Stats::Scalar numTimesWgBlockedDueVgprAlloc;
- // number of times a WG can not start due to lack of free SGPRs in SIMDs
- Stats::Scalar numTimesWgBlockedDueSgprAlloc;
- Stats::Scalar numCASOps;
- Stats::Scalar numFailedCASOps;
- Stats::Scalar completedWfs;
- Stats::Scalar completedWGs;
-
- // distrubtion in latency difference between first and last cache block
- // arrival ticks
- Stats::Distribution headTailLatency;
-
- void
- regStats() override;
-
LdsState &
getLds() const
{
// a particular GPUDynInst. This is used to calculate the difference
// between the first and last cache block arrival times.
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
+
+ public:
+ void updateInstStats(GPUDynInstPtr gpuDynInst);
+ int activeWaves;
+
+ struct ComputeUnitStats : public Stats::Group
+ {
+ ComputeUnitStats(Stats::Group *parent, int n_wf);
+
+ Stats::Scalar vALUInsts;
+ Stats::Formula vALUInstsPerWF;
+ Stats::Scalar sALUInsts;
+ Stats::Formula sALUInstsPerWF;
+ Stats::Scalar instCyclesVALU;
+ Stats::Scalar instCyclesSALU;
+ Stats::Scalar threadCyclesVALU;
+ Stats::Formula vALUUtilization;
+ Stats::Scalar ldsNoFlatInsts;
+ Stats::Formula ldsNoFlatInstsPerWF;
+ Stats::Scalar flatVMemInsts;
+ Stats::Formula flatVMemInstsPerWF;
+ Stats::Scalar flatLDSInsts;
+ Stats::Formula flatLDSInstsPerWF;
+ Stats::Scalar vectorMemWrites;
+ Stats::Formula vectorMemWritesPerWF;
+ Stats::Scalar vectorMemReads;
+ Stats::Formula vectorMemReadsPerWF;
+ Stats::Scalar scalarMemWrites;
+ Stats::Formula scalarMemWritesPerWF;
+ Stats::Scalar scalarMemReads;
+ Stats::Formula scalarMemReadsPerWF;
+
+ Stats::Formula vectorMemReadsPerKiloInst;
+ Stats::Formula vectorMemWritesPerKiloInst;
+ Stats::Formula vectorMemInstsPerKiloInst;
+ Stats::Formula scalarMemReadsPerKiloInst;
+ Stats::Formula scalarMemWritesPerKiloInst;
+ Stats::Formula scalarMemInstsPerKiloInst;
+
+ // Cycles required to send register source (addr and data) from
+ // register files to memory pipeline, per SIMD.
+ Stats::Vector instCyclesVMemPerSimd;
+ Stats::Vector instCyclesScMemPerSimd;
+ Stats::Vector instCyclesLdsPerSimd;
+
+ Stats::Scalar globalReads;
+ Stats::Scalar globalWrites;
+ Stats::Formula globalMemInsts;
+ Stats::Scalar argReads;
+ Stats::Scalar argWrites;
+ Stats::Formula argMemInsts;
+ Stats::Scalar spillReads;
+ Stats::Scalar spillWrites;
+ Stats::Formula spillMemInsts;
+ Stats::Scalar groupReads;
+ Stats::Scalar groupWrites;
+ Stats::Formula groupMemInsts;
+ Stats::Scalar privReads;
+ Stats::Scalar privWrites;
+ Stats::Formula privMemInsts;
+ Stats::Scalar readonlyReads;
+ Stats::Scalar readonlyWrites;
+ Stats::Formula readonlyMemInsts;
+ Stats::Scalar kernargReads;
+ Stats::Scalar kernargWrites;
+ Stats::Formula kernargMemInsts;
+
+ Stats::Distribution waveLevelParallelism;
+
+        // the following stats compute the avg. TLB access latency per
+ // uncoalesced request (only for data)
+ Stats::Scalar tlbRequests;
+ Stats::Scalar tlbCycles;
+ Stats::Formula tlbLatency;
+ // hitsPerTLBLevel[x] are the hits in Level x TLB.
+ // x = 0 is the page table.
+ Stats::Vector hitsPerTLBLevel;
+
+ Stats::Scalar ldsBankAccesses;
+ Stats::Distribution ldsBankConflictDist;
+
+ // over all memory instructions executed over all wavefronts
+ // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+ Stats::Distribution pageDivergenceDist;
+ // count of non-flat global memory vector instructions executed
+ Stats::Scalar dynamicGMemInstrCnt;
+ // count of flat global memory vector instructions executed
+ Stats::Scalar dynamicFlatMemInstrCnt;
+ Stats::Scalar dynamicLMemInstrCnt;
+
+ Stats::Scalar wgBlockedDueBarrierAllocation;
+ Stats::Scalar wgBlockedDueLdsAllocation;
+ // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
+ // active when the instruction is committed, this number is still
+ // incremented by 1
+ Stats::Scalar numInstrExecuted;
+ // Number of cycles among successive instruction executions across all
+ // wavefronts of the same CU
+ Stats::Distribution execRateDist;
+ // number of individual vector operations executed
+ Stats::Scalar numVecOpsExecuted;
+ // number of individual f16 vector operations executed
+ Stats::Scalar numVecOpsExecutedF16;
+ // number of individual f32 vector operations executed
+ Stats::Scalar numVecOpsExecutedF32;
+ // number of individual f64 vector operations executed
+ Stats::Scalar numVecOpsExecutedF64;
+ // number of individual FMA 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedFMA16;
+ Stats::Scalar numVecOpsExecutedFMA32;
+ Stats::Scalar numVecOpsExecutedFMA64;
+ // number of individual MAC 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedMAC16;
+ Stats::Scalar numVecOpsExecutedMAC32;
+ Stats::Scalar numVecOpsExecutedMAC64;
+ // number of individual MAD 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedMAD16;
+ Stats::Scalar numVecOpsExecutedMAD32;
+ Stats::Scalar numVecOpsExecutedMAD64;
+ // total number of two op FP vector operations executed
+ Stats::Scalar numVecOpsExecutedTwoOpFP;
+ // Total cycles that something is running on the GPU
+ Stats::Scalar totalCycles;
+ Stats::Formula vpc; // vector ops per cycle
+ Stats::Formula vpc_f16; // vector ops per cycle
+ Stats::Formula vpc_f32; // vector ops per cycle
+ Stats::Formula vpc_f64; // vector ops per cycle
+ Stats::Formula ipc; // vector instructions per cycle
+ Stats::Distribution controlFlowDivergenceDist;
+ Stats::Distribution activeLanesPerGMemInstrDist;
+ Stats::Distribution activeLanesPerLMemInstrDist;
+ // number of vector ALU instructions received
+ Stats::Formula numALUInstsExecuted;
+ // number of times a WG cannot start due to lack of free VGPRs in SIMDs
+ Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+ // number of times a WG cannot start due to lack of free SGPRs in SIMDs
+ Stats::Scalar numTimesWgBlockedDueSgprAlloc;
+ Stats::Scalar numCASOps;
+ Stats::Scalar numFailedCASOps;
+ Stats::Scalar completedWfs;
+ Stats::Scalar completedWGs;
+
+        // distribution of the latency difference between first and last
+        // cache block arrival ticks
+ Stats::Distribution headTailLatency;
+
+ // Track the amount of interleaving between wavefronts on each SIMD.
+ // This stat is sampled using instExecPerSimd to compute the number
+ // of instructions that have been executed on a SIMD between a WF
+ // executing two successive instructions.
+ Stats::VectorDistribution instInterleave;
+ } stats;
};
#endif // __COMPUTE_UNIT_HH__
: SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
tickEvent([this]{ exec(); },
"GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
- dispatchActive(false)
+ dispatchActive(false), stats(this)
{
schedule(&tickEvent, 0);
}
{
}
-void
-GPUDispatcher::regStats()
-{
- numKernelLaunched
- .name(name() + ".num_kernel_launched")
- .desc("number of kernel launched")
- ;
-
- cyclesWaitingForDispatch
- .name(name() + ".cycles_wait_dispatch")
- .desc("number of cycles with outstanding wavefronts "
- "that are waiting to be dispatched")
- ;
-}
-
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
- ++numKernelLaunched;
+ ++stats.numKernelLaunched;
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
task->kernelName(), task->dispatchId());
DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());
if (execIds.size() > 0) {
- ++cyclesWaitingForDispatch;
+ ++stats.cyclesWaitingForDispatch;
}
/**
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
+
+GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(numKernelLaunched, "number of kernel launched"),
+ ADD_STAT(cyclesWaitingForDispatch, "number of cycles with outstanding "
+ "wavefronts that are waiting to be dispatched")
+{
+}
#include <vector>
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "dev/hsa/hsa_packet.hh"
#include "params/GPUDispatcher.hh"
#include "sim/sim_object.hh"
void serialize(CheckpointOut &cp) const override;
void unserialize(CheckpointIn &cp) override;
- void regStats() override;
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
void setShader(Shader *new_shader);
void exec();
std::queue<int> doneIds;
// is there a kernel in execution?
bool dispatchActive;
- /*statistics*/
- Stats::Scalar numKernelLaunched;
- Stats::Scalar cyclesWaitingForDispatch;
+
+ protected:
+ struct GPUDispatcherStats : public Stats::Group
+ {
+ GPUDispatcherStats(Stats::Group *parent);
+
+ Stats::Scalar numKernelLaunched;
+ Stats::Scalar cyclesWaitingForDispatch;
+ } stats;
};
#endif // __GPU_COMPUTE_DISPATCHER_HH__
: computeUnit(cu), fromSchedule(from_schedule),
lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
- executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
+ executionResourcesUsed(0), _name(cu.name() + ".ExecStage"),
+ stats(&cu)
{
- numTransActiveIdle = 0;
+ stats.numTransActiveIdle = 0;
idle_dur = 0;
}
if (stage == IdleExec) {
// count cycles when no instruction to a specific execution resource
// is executed
- numCyclesWithNoInstrTypeIssued[unitId]++;
+ stats.numCyclesWithNoInstrTypeIssued[unitId]++;
} else if (stage == BusyExec) {
// count the number of cycles an instruction to a specific execution
// resource type was issued
- numCyclesWithInstrTypeIssued[unitId]++;
+ stats.numCyclesWithInstrTypeIssued[unitId]++;
thisTimeInstExecuted = true;
instrExecuted = true;
++executionResourcesUsed;
} else if (stage == PostExec) {
// count the number of transitions from active to idle
if (lastTimeInstExecuted && !thisTimeInstExecuted) {
- ++numTransActiveIdle;
+ ++stats.numTransActiveIdle;
}
if (!lastTimeInstExecuted && thisTimeInstExecuted) {
- idleDur.sample(idle_dur);
+ stats.idleDur.sample(idle_dur);
idle_dur = 0;
} else if (!thisTimeInstExecuted) {
idle_dur++;
// track the number of cycles we either issued at least
// instruction or issued no instructions at all
if (instrExecuted) {
- numCyclesWithInstrIssued++;
+ stats.numCyclesWithInstrIssued++;
} else {
- numCyclesWithNoIssue++;
+ stats.numCyclesWithNoIssue++;
}
- spc.sample(executionResourcesUsed);
+ stats.spc.sample(executionResourcesUsed);
}
}
collectStatistics(PostExec, 0);
}
-void
-ExecStage::regStats()
+ExecStage::ExecStageStats::ExecStageStats(Stats::Group *parent)
+ : Stats::Group(parent, "ExecStage"),
+ ADD_STAT(numTransActiveIdle,
+ "number of CU transitions from active to idle"),
+ ADD_STAT(numCyclesWithNoIssue, "number of cycles the CU issues nothing"),
+ ADD_STAT(numCyclesWithInstrIssued,
+ "number of cycles the CU issued at least one instruction"),
+ ADD_STAT(spc,
+ "Execution units active per cycle (Exec unit=SIMD,MemPipe)"),
+ ADD_STAT(idleDur, "duration of idle periods in cycles"),
+ ADD_STAT(numCyclesWithInstrTypeIssued, "Number of cycles at least one "
+ "instruction issued to execution resource type"),
+    ADD_STAT(numCyclesWithNoInstrTypeIssued, "Number of cycles no "
+             "instructions issued to execution resource type")
{
- numTransActiveIdle
- .name(name() + ".num_transitions_active_to_idle")
- .desc("number of CU transitions from active to idle")
- ;
-
- numCyclesWithNoIssue
- .name(name() + ".num_cycles_with_no_issue")
- .desc("number of cycles the CU issues nothing")
- ;
-
- numCyclesWithInstrIssued
- .name(name() + ".num_cycles_with_instr_issued")
- .desc("number of cycles the CU issued at least one instruction")
- ;
-
- spc
- .init(0, computeUnit.numExeUnits(), 1)
- .name(name() + ".spc")
- .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
- ;
+ ComputeUnit *compute_unit = static_cast<ComputeUnit*>(parent);
- idleDur
- .init(0,75,5)
- .name(name() + ".idle_duration_in_cycles")
- .desc("duration of idle periods in cycles")
- ;
-
- numCyclesWithInstrTypeIssued
- .init(computeUnit.numExeUnits())
- .name(name() + ".num_cycles_issue_exec_rsrc")
- .desc("Number of cycles at least one instruction issued to "
- "execution resource type")
- ;
-
- numCyclesWithNoInstrTypeIssued
- .init(computeUnit.numExeUnits())
- .name(name() + ".num_cycles_no_issue_exec_rsrc")
- .desc("Number of clks no instructions issued to execution "
- "resource type")
- ;
+ spc.init(0, compute_unit->numExeUnits(), 1);
+ idleDur.init(0, 75, 5);
+ numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits());
+ numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits());
int c = 0;
- for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
+    for (int i = 0; i < compute_unit->numVectorALUs; i++, c++) {
std::string s = "VectorALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
- for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
+    for (int i = 0; i < compute_unit->numScalarALUs; i++, c++) {
std::string s = "ScalarALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
-
- numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
- numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
}
#include <utility>
#include <vector>
-#include "sim/stats.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
class ComputeUnit;
class ScheduleToExecute;
void dumpDispList();
const std::string& name() const { return _name; }
- void regStats();
- // number of idle cycles
- Stats::Scalar numCyclesWithNoIssue;
- // number of busy cycles
- Stats::Scalar numCyclesWithInstrIssued;
- // number of cycles during which at least one
- // instruction was issued to an execution resource type
- Stats::Vector numCyclesWithInstrTypeIssued;
- // number of idle cycles during which the scheduler
- // issued no instructions targeting a specific
- // execution resource type
- Stats::Vector numCyclesWithNoInstrTypeIssued;
- // SIMDs active per cycle
- Stats::Distribution spc;
private:
void collectStatistics(enum STAT_STATUS stage, int unitId);
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
- Stats::Scalar numTransActiveIdle;
- Stats::Distribution idleDur;
int executionResourcesUsed;
uint64_t idle_dur;
const std::string _name;
+
+ protected:
+ struct ExecStageStats : public Stats::Group
+ {
+ ExecStageStats(Stats::Group *parent);
+
+ // number of transitions from active to idle
+ Stats::Scalar numTransActiveIdle;
+ // number of idle cycles
+ Stats::Scalar numCyclesWithNoIssue;
+ // number of busy cycles
+ Stats::Scalar numCyclesWithInstrIssued;
+ // SIMDs active per cycle
+ Stats::Distribution spc;
+ // duration of idle periods in cycles
+ Stats::Distribution idleDur;
+ // number of cycles during which at least one
+ // instruction was issued to an execution resource type
+ Stats::Vector numCyclesWithInstrTypeIssued;
+ // number of idle cycles during which the scheduler
+ // issued no instructions targeting a specific
+ // execution resource type
+ Stats::Vector numCyclesWithNoInstrTypeIssued;
+ } stats;
};
#endif // __EXEC_STAGE_HH__
FetchStage::FetchStage(const ComputeUnitParams &p, ComputeUnit &cu)
: numVectorALUs(p.num_SIMDs), computeUnit(cu),
- _name(cu.name() + ".FetchStage")
+ _name(cu.name() + ".FetchStage"), stats(&cu)
{
for (int j = 0; j < numVectorALUs; ++j) {
FetchUnit newFetchUnit(p, cu);
const unsigned num_instructions = pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst);
- instFetchInstReturned.sample(num_instructions);
+ stats.instFetchInstReturned.sample(num_instructions);
uint32_t simdId = wavefront->simdId;
_fetchUnit[simdId].processFetchReturn(pkt);
}
_fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}
-void
-FetchStage::regStats()
+FetchStage::FetchStageStats::FetchStageStats(Stats::Group *parent)
+ : Stats::Group(parent, "FetchStage"),
+    ADD_STAT(instFetchInstReturned, "For each instruction fetch request "
+             "received, record how many instructions were returned")
{
- instFetchInstReturned
- .init(1, 32, 1)
- .name(name() + ".inst_fetch_instr_returned")
- .desc("For each instruction fetch request recieved record how many "
- "instructions you got from it")
- ;
+ instFetchInstReturned.init(1, 32, 1);
}
#include <vector>
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/fetch_unit.hh"
// Instruction fetch stage.
// Stats related variables and methods
const std::string& name() const { return _name; }
- void regStats();
- Stats::Distribution instFetchInstReturned;
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
private:
// instantiated per VALU/SIMD
std::vector<FetchUnit> _fetchUnit;
const std::string _name;
+
+ protected:
+ struct FetchStageStats : public Stats::Group
+ {
+ FetchStageStats(Stats::Group *parent);
+
+ Stats::Distribution instFetchInstReturned;
+ } stats;
};
#endif // __FETCH_STAGE_HH__
: computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
gmQueueSize(p.global_mem_queue_size),
maxWaveRequests(p.max_wave_requests), inflightStores(0),
- inflightLoads(0)
+ inflightLoads(0), stats(&cu)
{
}
mem_req->second.second = true;
}
-void
-GlobalMemPipeline::regStats()
+GlobalMemPipeline::
+GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
+ : Stats::Group(parent, "GlobalMemPipeline"),
+ ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
+ "are delayed before updating the VRF")
{
- loadVrfBankConflictCycles
- .name(name() + ".load_vrf_bank_conflict_cycles")
- .desc("total number of cycles GM data are delayed before updating "
- "the VRF")
- ;
}
#include <queue>
#include <string>
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
}
const std::string &name() const { return _name; }
- void regStats();
void
incLoadVRFBankConflictCycles(int num_cycles)
{
- loadVrfBankConflictCycles += num_cycles;
+ stats.loadVrfBankConflictCycles += num_cycles;
}
bool coalescerReady(GPUDynInstPtr mp) const;
int gmQueueSize;
int maxWaveRequests;
- // number of cycles of delaying the update of a VGPR that is the
- // target of a load instruction (or the load component of an atomic)
- // The delay is due to VRF bank conflicts
- Stats::Scalar loadVrfBankConflictCycles;
// Counters to track the inflight loads and stores
// so that we can provide the proper backpressure
// on the number of inflight memory operations.
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;
+
+ protected:
+ struct GlobalMemPipelineStats : public Stats::Group
+ {
+ GlobalMemPipelineStats(Stats::Group *parent);
+
+ // number of cycles of delaying the update of a VGPR that is the
+ // target of a load instruction (or the load component of an atomic)
+ // The delay is due to VRF bank conflicts
+ Stats::Scalar loadVrfBankConflictCycles;
+ } stats;
};
#endif // __GLOBAL_MEMORY_PIPELINE_HH__
{
if (_staticInst->isLocalMem()) {
// access to LDS (shared) memory
- cu->dynamicLMemInstrCnt++;
+ cu->stats.dynamicLMemInstrCnt++;
} else if (_staticInst->isFlat()) {
- cu->dynamicFlatMemInstrCnt++;
+ cu->stats.dynamicFlatMemInstrCnt++;
} else {
// access to global memory
// update PageDivergence histogram
int number_pages_touched = cu->pagesTouched.size();
assert(number_pages_touched);
- cu->pageDivergenceDist.sample(number_pages_touched);
+ cu->stats.pageDivergenceDist.sample(number_pages_touched);
std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
// total number of memory instructions (dynamic)
// Atomics are counted as a single memory instruction.
// this is # memory instructions per wavefronts, not per workitem
- cu->dynamicGMemInstrCnt++;
+ cu->stats.dynamicGMemInstrCnt++;
}
}
void
execute(T *b)
{
- computeUnit->numCASOps++;
+ computeUnit->stats.numCASOps++;
if (*b == c) {
*b = s;
} else {
- computeUnit->numFailedCASOps++;
+ computeUnit->stats.numFailedCASOps++;
}
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
: ClockedObject(p), configAddress(0), size(p.size),
cleanupEvent([this]{ cleanup(); }, name(), false,
Event::Maximum_Pri),
- exitEvent([this]{ exitCallback(); }, name())
+ exitEvent([this]{ exitCallback(); }, name()), stats(this)
{
assoc = p.assoc;
assert(assoc <= size);
return tlb_hit;
}
- localNumTLBAccesses++;
+ stats.localNumTLBAccesses++;
if (!entry) {
- localNumTLBMisses++;
+ stats.localNumTLBMisses++;
} else {
- localNumTLBHits++;
+ stats.localNumTLBHits++;
}
}
}
DPRINTF(GPUTLB, "Paging enabled.\n");
// The vaddr already has the segment base applied.
TlbEntry *entry = lookup(vaddr);
- localNumTLBAccesses++;
+ stats.localNumTLBAccesses++;
if (!entry) {
- localNumTLBMisses++;
+ stats.localNumTLBMisses++;
if (timing) {
latency = missLatency1;
}
DPRINTF(GPUTLB, "Miss was serviced.\n");
}
} else {
- localNumTLBHits++;
+ stats.localNumTLBHits++;
if (timing) {
latency = hitLatency;
{
}
- void
- GpuTLB::regStats()
- {
- ClockedObject::regStats();
-
- localNumTLBAccesses
- .name(name() + ".local_TLB_accesses")
- .desc("Number of TLB accesses")
- ;
-
- localNumTLBHits
- .name(name() + ".local_TLB_hits")
- .desc("Number of TLB hits")
- ;
-
- localNumTLBMisses
- .name(name() + ".local_TLB_misses")
- .desc("Number of TLB misses")
- ;
-
- localTLBMissRate
- .name(name() + ".local_TLB_miss_rate")
- .desc("TLB miss rate")
- ;
-
- accessCycles
- .name(name() + ".access_cycles")
- .desc("Cycles spent accessing this TLB level")
- ;
-
- pageTableCycles
- .name(name() + ".page_table_cycles")
- .desc("Cycles spent accessing the page table")
- ;
-
- localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
-
- numUniquePages
- .name(name() + ".unique_pages")
- .desc("Number of unique pages touched")
- ;
-
- localCycles
- .name(name() + ".local_cycles")
- .desc("Number of cycles spent in queue for all incoming reqs")
- ;
-
- localLatency
- .name(name() + ".local_latency")
- .desc("Avg. latency over incoming coalesced reqs")
- ;
-
- localLatency = localCycles / localNumTLBAccesses;
-
- globalNumTLBAccesses
- .name(name() + ".global_TLB_accesses")
- .desc("Number of TLB accesses")
- ;
-
- globalNumTLBHits
- .name(name() + ".global_TLB_hits")
- .desc("Number of TLB hits")
- ;
-
- globalNumTLBMisses
- .name(name() + ".global_TLB_misses")
- .desc("Number of TLB misses")
- ;
-
- globalTLBMissRate
- .name(name() + ".global_TLB_miss_rate")
- .desc("TLB miss rate")
- ;
-
- globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
-
- avgReuseDistance
- .name(name() + ".avg_reuse_distance")
- .desc("avg. reuse distance over all pages (in ticks)")
- ;
-
- }
-
/**
* Do the TLB lookup for this coalesced request and schedule
* another event <TLB access latency> cycles later.
int req_cnt = sender_state->reqCnt.back();
if (update_stats) {
- accessCycles -= (curTick() * req_cnt);
- localCycles -= curTick();
+ stats.accessCycles -= (curTick() * req_cnt);
+ stats.localCycles -= curTick();
updatePageFootprint(virt_page_addr);
- globalNumTLBAccesses += req_cnt;
+ stats.globalNumTLBAccesses += req_cnt;
}
tlbOutcome lookup_outcome = TLB_MISS;
// the reqCnt has an entry per level, so its size tells us
// which level we are in
sender_state->hitLevel = sender_state->reqCnt.size();
- globalNumTLBHits += req_cnt;
+ stats.globalNumTLBHits += req_cnt;
}
} else {
if (update_stats)
- globalNumTLBMisses += req_cnt;
+ stats.globalNumTLBMisses += req_cnt;
}
/*
handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
if (update_stats) {
- accessCycles += (req_cnt * curTick());
- localCycles += curTick();
+ stats.accessCycles += (req_cnt * curTick());
+ stats.localCycles += curTick();
}
} else if (outcome == TLB_MISS) {
DPRINTF(GPUTLB, "This is a TLB miss\n");
if (update_stats) {
- accessCycles += (req_cnt*curTick());
- localCycles += curTick();
+ stats.accessCycles += (req_cnt*curTick());
+ stats.localCycles += curTick();
}
if (hasMemSidePort) {
// the reply back till when we propagate it to the coalescer
// above.
if (update_stats) {
- accessCycles += (req_cnt * 1);
- localCycles += 1;
+ stats.accessCycles += (req_cnt * 1);
+ stats.localCycles += 1;
}
/**
"addr %#x\n", virtPageAddr);
if (update_stats)
- pageTableCycles -= (req_cnt*curTick());
+ stats.pageTableCycles -= (req_cnt*curTick());
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
assert(tlb_event);
}
} else if (outcome == PAGE_WALK) {
if (update_stats)
- pageTableCycles += (req_cnt*curTick());
+ stats.pageTableCycles += (req_cnt*curTick());
// Need to access the page table and update the TLB
DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
// functional mode means no coalescing
// global metrics are the same as the local metrics
if (update_stats) {
- tlb->globalNumTLBAccesses++;
+ tlb->stats.globalNumTLBAccesses++;
if (success) {
sender_state->hitLevel = sender_state->reqCnt.size();
- tlb->globalNumTLBHits++;
+ tlb->stats.globalNumTLBHits++;
}
}
if (!success) {
if (update_stats)
- tlb->globalNumTLBMisses++;
+ tlb->stats.globalNumTLBMisses++;
if (tlb->hasMemSidePort) {
// there is a TLB below -> propagate down the TLB hierarchy
tlb->memSidePort[0]->sendFunctional(pkt);
bool first_page_access = ret.second;
if (first_page_access) {
- numUniquePages++;
+ stats.numUniquePages++;
} else {
int accessed_before;
accessed_before = curTick() - ret.first->second.lastTimeAccessed;
if (accessDistance) {
ret.first->second.localTLBAccesses
- .push_back(localNumTLBAccesses.value());
+ .push_back(stats.localNumTLBAccesses.value());
}
}
}
if (!TLBFootprint.empty()) {
- avgReuseDistance =
+ stats.avgReuseDistance =
sum_avg_reuse_distance_per_page / TLBFootprint.size();
}
//clear the TLBFootprint map
TLBFootprint.clear();
}
+
+ GpuTLB::GpuTLBStats::GpuTLBStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"),
+ ADD_STAT(localNumTLBHits, "Number of TLB hits"),
+ ADD_STAT(localNumTLBMisses, "Number of TLB misses"),
+ ADD_STAT(localTLBMissRate, "TLB miss rate"),
+ ADD_STAT(globalNumTLBAccesses, "Number of TLB accesses"),
+ ADD_STAT(globalNumTLBHits, "Number of TLB hits"),
+ ADD_STAT(globalNumTLBMisses, "Number of TLB misses"),
+ ADD_STAT(globalTLBMissRate, "TLB miss rate"),
+ ADD_STAT(accessCycles, "Cycles spent accessing this TLB level"),
+ ADD_STAT(pageTableCycles, "Cycles spent accessing the page table"),
+ ADD_STAT(numUniquePages, "Number of unique pages touched"),
+ ADD_STAT(localCycles, "Number of cycles spent in queue for all "
+ "incoming reqs"),
+ ADD_STAT(localLatency, "Avg. latency over incoming coalesced reqs"),
+ ADD_STAT(avgReuseDistance, "avg. reuse distance over all pages (in "
+ "ticks)")
+ {
+ localLatency = localCycles / localNumTLBAccesses;
+
+ localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
+ globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
+ }
} // namespace X86ISA
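One subtlety in the GpuTLBStats constructor above: a Stats::Formula stores an expression tree rather than a value, so binding localLatency and the miss rates at construction time is safe even though every operand is still zero; the ratios are evaluated when stats are dumped. A minimal sketch of the idiom, with hypothetical names:

    struct RatioStats : public Stats::Group
    {
        RatioStats(Stats::Group *parent)
            : Stats::Group(parent),
              ADD_STAT(hits, "number of hits"),
              ADD_STAT(accesses, "number of accesses"),
              ADD_STAT(hitRate, "hit rate (percent)")
        {
            // Captured as an expression; evaluated at stats-dump time.
            hitRate = 100 * hits / accesses;
        }

        Stats::Scalar hits;
        Stats::Scalar accesses;
        Stats::Formula hitRate;
    };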
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/port.hh"
#include "mem/request.hh"
int missLatency1;
int missLatency2;
- // local_stats are as seen from the TLB
- // without taking into account coalescing
- Stats::Scalar localNumTLBAccesses;
- Stats::Scalar localNumTLBHits;
- Stats::Scalar localNumTLBMisses;
- Stats::Formula localTLBMissRate;
-
- // global_stats are as seen from the
- // CU's perspective taking into account
- // all coalesced requests.
- Stats::Scalar globalNumTLBAccesses;
- Stats::Scalar globalNumTLBHits;
- Stats::Scalar globalNumTLBMisses;
- Stats::Formula globalTLBMissRate;
-
- // from the CU perspective (global)
- Stats::Scalar accessCycles;
- // from the CU perspective (global)
- Stats::Scalar pageTableCycles;
- Stats::Scalar numUniquePages;
- // from the perspective of this TLB
- Stats::Scalar localCycles;
- // from the perspective of this TLB
- Stats::Formula localLatency;
- // I take the avg. per page and then
- // the avg. over all pages.
- Stats::Scalar avgReuseDistance;
-
- void regStats() override;
void updatePageFootprint(Addr virt_page_addr);
void printAccessPattern();
void exitCallback();
EventFunctionWrapper exitEvent;
+
+ protected:
+ struct GpuTLBStats : public Stats::Group
+ {
+ GpuTLBStats(Stats::Group *parent);
+
+ // local_stats are as seen from the TLB
+ // without taking into account coalescing
+ Stats::Scalar localNumTLBAccesses;
+ Stats::Scalar localNumTLBHits;
+ Stats::Scalar localNumTLBMisses;
+ Stats::Formula localTLBMissRate;
+
+ // global_stats are as seen from the
+ // CU's perspective taking into account
+ // all coalesced requests.
+ Stats::Scalar globalNumTLBAccesses;
+ Stats::Scalar globalNumTLBHits;
+ Stats::Scalar globalNumTLBMisses;
+ Stats::Formula globalTLBMissRate;
+
+ // from the CU perspective (global)
+ Stats::Scalar accessCycles;
+ // from the CU perspective (global)
+ Stats::Scalar pageTableCycles;
+ Stats::Scalar numUniquePages;
+ // from the perspective of this TLB
+ Stats::Scalar localCycles;
+ // from the perspective of this TLB
+ Stats::Formula localLatency;
+ // avg. reuse distance is computed per page,
+ // then averaged over all pages.
+ Stats::Scalar avgReuseDistance;
+ } stats;
};
}
// the number of conflicts this packet will have when accessing the LDS
unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
// count the total number of physical LDS bank accessed
- parent->ldsBankAccesses += bankAccesses;
+ parent->stats.ldsBankAccesses += bankAccesses;
// count the LDS bank conflicts. A number set to 1 indicates one
// access per bank maximum so there are no bank conflicts
- parent->ldsBankConflictDist.sample(bankConflicts-1);
+ parent->stats.ldsBankConflictDist.sample(bankConflicts-1);
GPUDynInstPtr dynInst = getDynInstr(packet);
// account for the LDS bank conflict overhead
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
: computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
- lmQueueSize(p.local_mem_queue_size)
+ lmQueueSize(p.local_mem_queue_size), stats(&cu)
{
}
lmIssuedRequests.push(gpuDynInst);
}
-void
-LocalMemPipeline::regStats()
+
+LocalMemPipeline::
+LocalMemPipelineStats::LocalMemPipelineStats(Stats::Group *parent)
+ : Stats::Group(parent, "LocalMemPipeline"),
+ ADD_STAT(loadVrfBankConflictCycles, "total number of cycles LDS data "
+ "are delayed before updating the VRF")
{
- loadVrfBankConflictCycles
- .name(name() + ".load_vrf_bank_conflict_cycles")
- .desc("total number of cycles LDS data are delayed before updating "
- "the VRF")
- ;
}
#include <queue>
#include <string>
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
-#include "sim/stats.hh"
/*
* @file local_memory_pipeline.hh
}
const std::string& name() const { return _name; }
- void regStats();
void
incLoadVRFBankConflictCycles(int num_cycles)
{
- loadVrfBankConflictCycles += num_cycles;
+ stats.loadVrfBankConflictCycles += num_cycles;
}
private:
ComputeUnit &computeUnit;
const std::string _name;
int lmQueueSize;
- Stats::Scalar loadVrfBankConflictCycles;
+
// Local Memory Request Fifo: all shared memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> lmIssuedRequests;
// Local Memory Response Fifo: all responses of shared memory
// requests are sent to this FIFO from LDS
std::queue<GPUDynInstPtr> lmReturnedRequests;
+
+ protected:
+ struct LocalMemPipelineStats : public Stats::Group
+ {
+ LocalMemPipelineStats(Stats::Group *parent);
+
+ Stats::Scalar loadVrfBankConflictCycles;
+ } stats;
};
#endif // __LOCAL_MEMORY_PIPELINE_HH__
#include "params/RegisterFile.hh"
RegisterFile::RegisterFile(const RegisterFileParams &p)
- : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs)
+ : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs), stats(this)
{
fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");
fatal_if(simdId < 0, "Illegal SIMD id for VRF");
{
}
-void
-RegisterFile::regStats()
-{
- registerReads
- .name(name() + ".register_reads")
- .desc("Total number of DWORDs read from register file")
- ;
-
- registerWrites
- .name(name() + ".register_writes")
- .desc("Total number of DWORDS written to register file")
- ;
-
- sramReads
- .name(name() + ".sram_reads")
- .desc("Total number of register file bank SRAM activations for reads")
- ;
-
- sramWrites
- .name(name() + ".sram_writes")
- .desc("Total number of register file bank SRAM activations for writes")
- ;
+RegisterFile::RegisterFileStats::RegisterFileStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(registerReads,
+ "Total number of DWORDs read from register file"),
+ ADD_STAT(registerWrites,
+ "Total number of DWORDS written to register file"),
+ ADD_STAT(sramReads,
+ "Total number of register file bank SRAM activations for reads"),
+ ADD_STAT(sramWrites,
+ "Total number of register file bank SRAM activations for writes")
+{
}
virtual ~RegisterFile();
virtual void setParent(ComputeUnit *_computeUnit);
int numRegs() const { return _numRegs; }
- virtual void regStats() override;
// State functions
// numer of registers in this register file
int _numRegs;
- // Stats
- // Total number of register reads, incremented once per DWORD per thread
- Stats::Scalar registerReads;
- // Total number of register writes, incremented once per DWORD per thread
- Stats::Scalar registerWrites;
-
- // Number of register file SRAM activations for reads.
- // The register file may be implemented with multiple SRAMs. This stat
- // tracks how many times the SRAMs are accessed for reads.
- Stats::Scalar sramReads;
- // Number of register file SRAM activations for writes
- Stats::Scalar sramWrites;
+
+ struct RegisterFileStats : public Stats::Group
+ {
+ RegisterFileStats(Stats::Group *parent);
+
+ // Total number of register reads, incremented once per DWORD per thread
+ Stats::Scalar registerReads;
+ // Total number of register writes, incremented once per DWORD per thread
+ Stats::Scalar registerWrites;
+
+ // Number of register file SRAM activations for reads.
+ // The register file may be implemented with multiple SRAMs. This stat
+ // tracks how many times the SRAMs are accessed for reads.
+ Stats::Scalar sramReads;
+ // Number of register file SRAM activations for writes
+ Stats::Scalar sramWrites;
+ } stats;
};
#endif // __REGISTER_FILE_HH__
{
policy->freeRegisters(w);
}
-
-void
-RegisterManager::regStats()
-{
- policy->regStats();
-}
void setParent(ComputeUnit *cu);
void exec();
- // Stats related variables and methods
- void regStats();
-
// lookup virtual to physical register translation
int mapVgpr(Wavefront* w, int vgprIndex);
int mapSgpr(Wavefront* w, int sgprIndex);
// free all remaining registers held by specified WF
virtual void freeRegisters(Wavefront *w) = 0;
- // stats
- virtual void regStats() = 0;
-
protected:
ComputeUnit *cu;
};
computeUnit.cu_id, mp->simdId, mp->wfSlotId);
}
}
-
-void
-ScalarMemPipeline::regStats()
-{
-}
}
const std::string& name() const { return _name; }
- void regStats();
private:
ComputeUnit &computeUnit;
if (regBusy(pSgpr)) {
if (ii->isDstOperand(i)) {
- w->numTimesBlockedDueWAXDependencies++;
+ w->stats.numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pSgpr);
- w->numTimesBlockedDueRAWDependencies++;
+ w->stats.numTimesBlockedDueRAWDependencies++;
}
return false;
}
if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
int DWORDs = ii->getOperandSize(i) <= 4 ? 1
: ii->getOperandSize(i) / 4;
- registerReads += DWORDs;
+ stats.registerReads += DWORDs;
}
}
enqRegFreeEvent(physReg, tickDelay);
}
- registerWrites += nRegs;
+ stats.registerWrites += nRegs;
}
}
}
enqRegFreeEvent(physReg, computeUnit->clockPeriod());
}
- registerWrites += nRegs;
+ stats.registerWrites += nRegs;
}
}
}
_name(cu.name() + ".ScheduleStage"),
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
- locMemBusRdy(false), locMemIssueRdy(false)
+ locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
for (int j = 0; j < cu.numExeUnits(); ++j) {
scheduler.emplace_back(p);
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
- rdyListEmpty[j]++;
+ stats.rdyListEmpty[j]++;
continue;
}
- rdyListNotEmpty[j]++;
+ stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
- wf->schCycles++;
- addToSchListStalls[j]++;
+ wf->stats.schCycles++;
+ stats.addToSchListStalls[j]++;
} else {
if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
wf->incLGKMInstsIssued();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
- rdyListEmpty[j]++;
+ stats.rdyListEmpty[j]++;
continue;
}
- rdyListNotEmpty[j]++;
+ stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
- wf->schCycles++;
- addToSchListStalls[j]++;
+ wf->stats.schCycles++;
+ stats.addToSchListStalls[j]++;
}
}
computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
return true;
} else {
- rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
if (!accessSrfWr) {
- rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
}
if (!accessVrfWr) {
- rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
}
// Increment stall counts for WF
- wf->schStalls++;
- wf->schRfAccessStalls++;
+ wf->stats.schStalls++;
+ wf->stats.schRfAccessStalls++;
}
return false;
}
return true;
} else {
// Number of stall cycles due to RF access denied
- rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
// Count number of denials due to each reason
// Multiple items may contribute to the denied request
if (!accessVrf) {
- rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
}
if (!accessSrf) {
- rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
+ stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
}
// Increment stall counts for WF
- wf->schStalls++;
- wf->schRfAccessStalls++;
+ wf->stats.schStalls++;
+ wf->stats.schRfAccessStalls++;
DPRINTF(GPUSched, "schList[%d]: Could not add: "
"SIMD[%d] WV[%d]: %d: %s\n",
exeType, wf->simdId, wf->wfDynId,
// TODO: Scalar NOP does not require SALU in hardware,
// and is executed out of IB directly.
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
- dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
- dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isEndOfKernel()) {
// EndPgm instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
- dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
|| gpu_dyn_inst->isALU()) {
// Barrier, Branch, or ALU instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
- dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
- dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
bool rdy = true;
if (!glbMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
+ stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
}
if (!rdy) {
return false;
bool rdy = true;
if (!scalarMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
}
if (!scalarMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.scalarMemoryPipe
.isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+ wf->scalarWrGmReqsInPipe))
{
rdy = false;
- dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
+ stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
bool rdy = true;
if (!locMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
}
if (!locMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
- dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
+ stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
bool rdy = true;
if (!glbMemIssueRdy || !locMemIssueRdy) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy || !locMemBusRdy) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
- dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
+ stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
gpu_dyn_inst->disassemble());
return false;
}
- dispNrdyStalls[SCH_RDY]++;
+ stats.dispNrdyStalls[SCH_RDY]++;
return true;
}
} else {
// Either another wave has been dispatched, or this wave
// was not ready, so it is stalled this cycle
- schIter->first->wavefront()->schStalls++;
+ schIter->first->wavefront()->stats.schStalls++;
if (!dispRdy) {
// not ready for dispatch, increment stall stat
- schIter->first->wavefront()->schResourceStalls++;
+ schIter->first->wavefront()->stats.schResourceStalls++;
}
// Examine next wave for this resource
schIter++;
// Increment stall count if no wave sent to dispatchList for
// current execution resource
if (!dispatched) {
- schListToDispListStalls[j]++;
+ stats.schListToDispListStalls[j]++;
} else {
- schListToDispList[j]++;
+ stats.schListToDispList[j]++;
}
}
}
reinsertToSchList(wf->localMem, toExecute
.readyInst(wf->localMem));
// Increment stall stats for LDS-VRF arbitration
- ldsBusArbStalls++;
+ stats.ldsBusArbStalls++;
toExecute.readyInst(wf->localMem)
- ->wavefront()->schLdsArbStalls++;
+ ->wavefront()->stats.schLdsArbStalls++;
}
// With arbitration of LM pipe complete, transition the
// LM pipe to SKIP state in the dispatchList to inform EX stage
// Increment the number of cycles the wave spends in the
// SCH stage, since this loop visits every wave in SCH.
- wf->schCycles++;
+ wf->stats.schCycles++;
bool vrfRdy = true;
if (!gpu_dyn_inst->isScalar()) {
p.second = RFBUSY;
// Increment stall stats
- wf->schStalls++;
- wf->schOpdNrdyStalls++;
+ wf->stats.schStalls++;
+ wf->stats.schOpdNrdyStalls++;
- opdNrdyStalls[SCH_RF_OPD_NRDY]++;
+ stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
if (!vrfRdy) {
- opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
+ stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
}
if (!srfRdy) {
- opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
+ stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
}
}
}
wavesInSch.erase(w->wfDynId);
}
-void
-ScheduleStage::regStats()
+ScheduleStage::ScheduleStageStats::ScheduleStageStats(Stats::Group *parent,
+ int num_exec_units)
+ : Stats::Group(parent, "ScheduleStage"),
+ ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per "
+ "execution resource"),
+ ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
+ "list per execution resource"),
+ ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
+ "schList per execution resource when ready list is not empty"),
+ ADD_STAT(schListToDispList, "number of cycles a wave is added to "
+ "dispatchList per execution resource"),
+ ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
+ " dispatchList per execution resource"),
+ ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
+ ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
+ "conflicts"),
+ ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
+ "ready"),
+ ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
+ "ready")
{
- rdyListNotEmpty
- .init(computeUnit.numExeUnits())
- .name(name() + ".rdy_list_not_empty")
- .desc("number of cycles one or more wave on ready list per "
- "execution resource")
- ;
-
- rdyListEmpty
- .init(computeUnit.numExeUnits())
- .name(name() + ".rdy_list_empty")
- .desc("number of cycles no wave on ready list per "
- "execution resource")
- ;
-
- addToSchListStalls
- .init(computeUnit.numExeUnits())
- .name(name() + ".sch_list_add_stalls")
- .desc("number of cycles a wave is not added to schList per "
- "execution resource when ready list is not empty")
- ;
-
- schListToDispList
- .init(computeUnit.numExeUnits())
- .name(name() + ".sch_list_to_disp_list")
- .desc("number of cycles a wave is added to dispatchList per "
- "execution resource")
- ;
-
- schListToDispListStalls
- .init(computeUnit.numExeUnits())
- .name(name() + ".sch_list_to_disp_list_stalls")
- .desc("number of cycles no wave is added to dispatchList per "
- "execution resource")
- ;
-
- // Operand Readiness Stall Cycles
- opdNrdyStalls
- .init(SCH_RF_OPD_NRDY_CONDITIONS)
- .name(name() + ".opd_nrdy_stalls")
- .desc("number of stalls in SCH due to operands not ready")
- ;
+ rdyListNotEmpty.init(num_exec_units);
+ rdyListEmpty.init(num_exec_units);
+ addToSchListStalls.init(num_exec_units);
+ schListToDispList.init(num_exec_units);
+ schListToDispListStalls.init(num_exec_units);
+ opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
+ dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
+ rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);
+
opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
- // dispatchReady Stall Cycles
- dispNrdyStalls
- .init(SCH_NRDY_CONDITIONS)
- .name(name() + ".disp_nrdy_stalls")
- .desc("number of stalls in SCH due to resource not ready")
- ;
dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
csprintf("FlatMemFIFO"));
dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
- // RF Access Stall Cycles
- rfAccessStalls
- .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
- .name(name() + ".rf_access_stalls")
- .desc("number of stalls due to RF access denied")
- ;
rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
-
- // Stall cycles due to wave losing LDS bus arbitration
- ldsBusArbStalls
- .name(name() + ".lds_bus_arb_stalls")
- .desc("number of stalls due to VRF->LDS bus conflicts")
- ;
}
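The per-execution-resource and per-condition vectors above keep their init() and subname() calls in the constructor body because ADD_STAT cannot size a Stats::Vector. A minimal sketch of that idiom, with hypothetical condition names:

    enum { COND_A, COND_B, NUM_CONDS };

    struct StallStats : public Stats::Group
    {
        StallStats(Stats::Group *parent)
            : Stats::Group(parent, "StallStats"),
              ADD_STAT(stalls, "stall cycles per condition")
        {
            stalls.init(NUM_CONDS);          // one entry per condition
            stalls.subname(COND_A, "CondA"); // printed as stalls::CondA
            stalls.subname(COND_B, "CondB");
        }

        Stats::Vector stalls;
    };

    // At the detection site: stats.stalls[COND_A]++;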
#include <utility>
#include <vector>
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/scheduler.hh"
SCH_RF_ACCESS_NRDY_CONDITIONS
};
- void regStats();
-
// Called by ExecStage to inform SCH of instruction execution
void deleteFromSch(Wavefront *w);
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
- // Stats
-
- // Number of cycles with empty (or not empty) readyList, per execution
- // resource, when the CU is active (not sleeping)
- Stats::Vector rdyListEmpty;
- Stats::Vector rdyListNotEmpty;
-
- // Number of cycles, per execution resource, when at least one wave
- // was on the readyList and picked by scheduler, but was unable to be
- // added to the schList, when the CU is active (not sleeping)
- Stats::Vector addToSchListStalls;
-
- // Number of cycles, per execution resource, when a wave is selected
- // as candidate for dispatchList from schList
- // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
- Stats::Vector schListToDispList;
-
- // Per execution resource stat, incremented once per cycle if no wave
- // was selected as candidate for dispatch and moved to dispatchList
- Stats::Vector schListToDispListStalls;
-
- // Number of times a wave is selected by the scheduler but cannot
- // be added to the schList due to register files not being able to
- // support reads or writes of operands. RF_ACCESS_NRDY condition is always
- // incremented if at least one read/write not supported, other
- // conditions are incremented independently from each other.
- Stats::Vector rfAccessStalls;
-
- // Number of times a wave is executing FLAT instruction and
- // forces another wave occupying its required local memory resource
- // to be deselected for execution, and placed back on schList
- Stats::Scalar ldsBusArbStalls;
-
- // Count of times VRF and/or SRF blocks waves on schList from
- // performing RFBUSY->RFREADY transition
- Stats::Vector opdNrdyStalls;
-
- // Count of times resource required for dispatch is not ready and
- // blocks wave in RFREADY state on schList from potentially moving
- // to dispatchList
- Stats::Vector dispNrdyStalls;
-
const std::string _name;
// called by exec() to add a wave to schList if the RFs can support it
// the VRF/SRF availability or limits imposed by paremeters (to be added)
// of the SCH stage or CU.
std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
+
+ protected:
+ struct ScheduleStageStats : public Stats::Group
+ {
+ ScheduleStageStats(Stats::Group *parent, int num_exec_units);
+
+ // Number of cycles with empty (or not empty) readyList, per execution
+ // resource, when the CU is active (not sleeping)
+ Stats::Vector rdyListEmpty;
+ Stats::Vector rdyListNotEmpty;
+
+ // Number of cycles, per execution resource, when at least one wave
+ // was on the readyList and picked by scheduler, but was unable to be
+ // added to the schList, when the CU is active (not sleeping)
+ Stats::Vector addToSchListStalls;
+
+ // Number of cycles, per execution resource, when a wave is selected
+ // as candidate for dispatchList from schList
+ // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
+ Stats::Vector schListToDispList;
+
+ // Per execution resource stat, incremented once per cycle if no wave
+ // was selected as candidate for dispatch and moved to dispatchList
+ Stats::Vector schListToDispListStalls;
+
+ // Number of times a wave is selected by the scheduler but cannot
+ // be added to the schList due to register files not being able to
+ // support reads or writes of operands. RF_ACCESS_NRDY condition is
+ // always incremented if at least one read/write not supported, other
+ // conditions are incremented independently from each other.
+ Stats::Vector rfAccessStalls;
+
+ // Number of times a wave is executing FLAT instruction and
+ // forces another wave occupying its required local memory resource
+ // to be deselected for execution, and placed back on schList
+ Stats::Scalar ldsBusArbStalls;
+
+ // Count of times VRF and/or SRF blocks waves on schList from
+ // performing RFBUSY->RFREADY transition
+ Stats::Vector opdNrdyStalls;
+
+ // Count of times resource required for dispatch is not ready and
+ // blocks wave in RFREADY state on schList from potentially moving
+ // to dispatchList
+ Stats::Vector dispNrdyStalls;
+ } stats;
};
#endif // __SCHEDULE_STAGE_HH__
ScoreboardCheckToSchedule
&to_schedule)
: computeUnit(cu), toSchedule(to_schedule),
- _name(cu.name() + ".ScoreboardCheckStage")
+ _name(cu.name() + ".ScoreboardCheckStage"), stats(&cu)
{
}
{
panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
"Instruction ready status %d is illegal!!!", rdyStatus);
- stallCycles[rdyStatus]++;
+ stats.stallCycles[rdyStatus]++;
}
// Return true if this wavefront is ready
}
}
-void
-ScoreboardCheckStage::regStats()
+ScoreboardCheckStage::
+ScoreboardCheckStageStats::ScoreboardCheckStageStats(Stats::Group *parent)
+ : Stats::Group(parent, "ScoreboardCheckStage"),
+ ADD_STAT(stallCycles, "number of cycles wave stalled in SCB")
{
- stallCycles
- .init(NRDY_CONDITIONS)
- .name(name() + ".stall_cycles")
- .desc("number of cycles wave stalled in SCB")
- ;
+ stallCycles.init(NRDY_CONDITIONS);
+
stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
#include <utility>
#include <vector>
-#include "sim/stats.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
class ComputeUnit;
class ScoreboardCheckToSchedule;
// Stats related variables and methods
const std::string& name() const { return _name; }
- void regStats();
private:
void collectStatistics(nonrdytype_e rdyStatus);
*/
ScoreboardCheckToSchedule &toSchedule;
- // Stats
- Stats::Vector stallCycles;
-
const std::string _name;
+
+ protected:
+ struct ScoreboardCheckStageStats : public Stats::Group
+ {
+ ScoreboardCheckStageStats(Stats::Group *parent);
+
+ Stats::Vector stallCycles;
+ } stats;
};
#endif // __SCOREBOARD_CHECK_STAGE_HH__
globalMemSize(p.globalmem),
nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
_dispatcher(*p.dispatcher),
- max_valu_insts(p.max_valu_insts), total_valu_insts(0)
+ max_valu_insts(p.max_valu_insts), total_valu_insts(0),
+ stats(this, p.CUs[0]->wfSize())
{
gpuCmdProc.setShader(this);
_dispatcher.setShader(this);
return scheduledSomething;
}
-void
-Shader::regStats()
-{
- ClockedObject::regStats();
-
- shaderActiveTicks
- .name(name() + ".shader_active_ticks")
- .desc("Total ticks that any CU attached to this shader is active")
- ;
- allLatencyDist
- .init(0, 1600000, 10000)
- .name(name() + ".allLatencyDist")
- .desc("delay distribution for all")
- .flags(Stats::pdf | Stats::oneline);
-
- loadLatencyDist
- .init(0, 1600000, 10000)
- .name(name() + ".loadLatencyDist")
- .desc("delay distribution for loads")
- .flags(Stats::pdf | Stats::oneline);
-
- storeLatencyDist
- .init(0, 1600000, 10000)
- .name(name() + ".storeLatencyDist")
- .desc("delay distribution for stores")
- .flags(Stats::pdf | Stats::oneline);
-
- vectorInstSrcOperand
- .init(4)
- .name(name() + ".vec_inst_src_operand")
- .desc("vector instruction source operand distribution");
-
- vectorInstDstOperand
- .init(4)
- .name(name() + ".vec_inst_dst_operand")
- .desc("vector instruction destination operand distribution");
-
- initToCoalesceLatency
- .init(0, 1600000, 10000)
- .name(name() + ".initToCoalesceLatency")
- .desc("Ticks from vmem inst initiateAcc to coalescer issue")
- .flags(Stats::pdf | Stats::oneline);
-
- rubyNetworkLatency
- .init(0, 1600000, 10000)
- .name(name() + ".rubyNetworkLatency")
- .desc("Ticks from coalescer issue to coalescer hit callback")
- .flags(Stats::pdf | Stats::oneline);
-
- gmEnqueueLatency
- .init(0, 1600000, 10000)
- .name(name() + ".gmEnqueueLatency")
- .desc("Ticks from coalescer hit callback to GM pipe enqueue")
- .flags(Stats::pdf | Stats::oneline);
-
- gmToCompleteLatency
- .init(0, 1600000, 10000)
- .name(name() + ".gmToCompleteLatency")
- .desc("Ticks queued in GM pipes ordered response buffer")
- .flags(Stats::pdf | Stats::oneline);
-
- coalsrLineAddresses
- .init(0, 20, 1)
- .name(name() + ".coalsrLineAddresses")
- .desc("Number of cache lines for coalesced request")
- .flags(Stats::pdf | Stats::oneline);
-
- int wfSize = cuList[0]->wfSize();
- cacheBlockRoundTrip = new Stats::Distribution[wfSize];
- for (int idx = 0; idx < wfSize; ++idx) {
- std::stringstream namestr;
- ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
- cacheBlockRoundTrip[idx]
- .init(0, 1600000, 10000)
- .name(namestr.str())
- .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
- .flags(Stats::pdf | Stats::oneline);
- }
-}
-
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id)
void
Shader::sampleStore(const Tick accessTime)
{
- storeLatencyDist.sample(accessTime);
- allLatencyDist.sample(accessTime);
+ stats.storeLatencyDist.sample(accessTime);
+ stats.allLatencyDist.sample(accessTime);
}
/*
void
Shader::sampleLoad(const Tick accessTime)
{
- loadLatencyDist.sample(accessTime);
- allLatencyDist.sample(accessTime);
+ stats.loadLatencyDist.sample(accessTime);
+ stats.allLatencyDist.sample(accessTime);
}
void
Tick t4 = roundTripTime[3];
Tick t5 = roundTripTime[4];
- initToCoalesceLatency.sample(t2-t1);
- rubyNetworkLatency.sample(t3-t2);
- gmEnqueueLatency.sample(t4-t3);
- gmToCompleteLatency.sample(t5-t4);
+ stats.initToCoalesceLatency.sample(t2-t1);
+ stats.rubyNetworkLatency.sample(t3-t2);
+ stats.gmEnqueueLatency.sample(t4-t3);
+ stats.gmToCompleteLatency.sample(t5-t4);
}
void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
{
- coalsrLineAddresses.sample(lineMap.size());
+ stats.coalsrLineAddresses.sample(lineMap.size());
std::vector<Tick> netTimes;
// For each cache block address generated by a vmem inst, calculate
// Nth distribution.
int idx = 0;
for (auto& time : netTimes) {
- cacheBlockRoundTrip[idx].sample(time);
+ stats.cacheBlockRoundTrip[idx].sample(time);
++idx;
}
}
"Invalid activeCu size\n");
_activeCus--;
if (!_activeCus)
- shaderActiveTicks += curTick() - _lastInactiveTick;
+ stats.shaderActiveTicks += curTick() - _lastInactiveTick;
+}
+
+Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
+ : Stats::Group(parent),
+ ADD_STAT(allLatencyDist, "delay distribution for all"),
+ ADD_STAT(loadLatencyDist, "delay distribution for loads"),
+ ADD_STAT(storeLatencyDist, "delay distribution for stores"),
+ ADD_STAT(initToCoalesceLatency,
+ "Ticks from vmem inst initiateAcc to coalescer issue"),
+ ADD_STAT(rubyNetworkLatency,
+ "Ticks from coalescer issue to coalescer hit callback"),
+ ADD_STAT(gmEnqueueLatency,
+ "Ticks from coalescer hit callback to GM pipe enqueue"),
+ ADD_STAT(gmToCompleteLatency,
+ "Ticks queued in GM pipes ordered response buffer"),
+ ADD_STAT(coalsrLineAddresses,
+ "Number of cache lines for coalesced request"),
+ ADD_STAT(shaderActiveTicks,
+ "Total ticks that any CU attached to this shader is active"),
+ ADD_STAT(vectorInstSrcOperand,
+ "vector instruction source operand distribution"),
+ ADD_STAT(vectorInstDstOperand,
+ "vector instruction destination operand distribution")
+{
+ allLatencyDist
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ loadLatencyDist
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ storeLatencyDist
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ initToCoalesceLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ rubyNetworkLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ gmEnqueueLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ gmToCompleteLatency
+ .init(0, 1600000, 10000)
+ .flags(Stats::pdf | Stats::oneline);
+
+ coalsrLineAddresses
+ .init(0, 20, 1)
+ .flags(Stats::pdf | Stats::oneline);
+
+ vectorInstSrcOperand.init(4);
+ vectorInstDstOperand.init(4);
+
+ cacheBlockRoundTrip = new Stats::Distribution[wf_size];
+ for (int idx = 0; idx < wf_size; ++idx) {
+ std::stringstream namestr;
+ ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
+ static_cast<Shader*>(parent)->name(), idx);
+ cacheBlockRoundTrip[idx]
+ .init(0, 1600000, 10000)
+ .name(namestr.str())
+ .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
+ .flags(Stats::pdf | Stats::oneline);
+ }
}
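The latency distributions above keep their chained init()/flags() configuration; only the name() and desc() calls drop away, since the group now derives both. A minimal sketch, with hypothetical names:

    struct LatencyStats : public Stats::Group
    {
        LatencyStats(Stats::Group *parent)
            : Stats::Group(parent),
              ADD_STAT(latency, "delay distribution in ticks")
        {
            latency
                .init(0, 1600000, 10000)             // min, max, bucket size
                .flags(Stats::pdf | Stats::oneline); // compact pdf output
        }

        Stats::Distribution latency;
    };

    // Sampled when an access completes, e.g.:
    //     stats.latency.sample(curTick() - issueTick);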
#include <string>
#include "arch/isa.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
- // some stats for measuring latency
- Stats::Distribution allLatencyDist;
- Stats::Distribution loadLatencyDist;
- Stats::Distribution storeLatencyDist;
-
- // average ticks from vmem inst initiateAcc to coalescer issue,
- // average ticks from coalescer issue to coalescer hit callback,
- // average ticks from coalescer hit callback to GM pipe enqueue,
- // and average ticks spent in GM pipe's ordered resp buffer.
- Stats::Distribution initToCoalesceLatency;
- Stats::Distribution rubyNetworkLatency;
- Stats::Distribution gmEnqueueLatency;
- Stats::Distribution gmToCompleteLatency;
-
- // average number of cache blocks requested by vmem inst, and
- // average ticks for cache blocks to main memory for the Nth
- // cache block generated by a vmem inst.
- Stats::Distribution coalsrLineAddresses;
- Stats::Distribution *cacheBlockRoundTrip;
-
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
- /**
- * Statistics
- */
- Stats::Scalar shaderActiveTicks;
- Stats::Vector vectorInstSrcOperand;
- Stats::Vector vectorInstDstOperand;
- void regStats();
-
int64_t max_valu_insts;
int64_t total_valu_insts;
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void notifyCuSleep();
+
+ void
+ incVectorInstSrcOperand(int num_operands)
+ {
+ stats.vectorInstSrcOperand[num_operands]++;
+ }
+
+ void
+ incVectorInstDstOperand(int num_operands)
+ {
+ stats.vectorInstDstOperand[num_operands]++;
+ }
+
+ protected:
+ struct ShaderStats : public Stats::Group
+ {
+ ShaderStats(Stats::Group *parent, int wf_size);
+
+ // some stats for measuring latency
+ Stats::Distribution allLatencyDist;
+ Stats::Distribution loadLatencyDist;
+ Stats::Distribution storeLatencyDist;
+
+ // average ticks from vmem inst initiateAcc to coalescer issue
+ Stats::Distribution initToCoalesceLatency;
+
+ // average ticks from coalescer issue to coalescer hit callback
+ Stats::Distribution rubyNetworkLatency;
+
+ // average ticks from coalescer hit callback to GM pipe enqueue
+ Stats::Distribution gmEnqueueLatency;
+
+ // average ticks spent in GM pipe's ordered resp buffer.
+ Stats::Distribution gmToCompleteLatency;
+
+ // average number of cache blocks requested by vmem inst
+ Stats::Distribution coalsrLineAddresses;
+
+ // average ticks for cache blocks to main memory for the Nth
+ // cache block generated by a vmem inst.
+ Stats::Distribution *cacheBlockRoundTrip;
+
+ Stats::Scalar shaderActiveTicks;
+ Stats::Vector vectorInstSrcOperand;
+ Stats::Vector vectorInstDstOperand;
+ } stats;
};
#endif // __SHADER_HH__
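Because the nested stats structs are protected, call sites in other objects go through small accessors such as incVectorInstSrcOperand() above instead of reaching into stats directly. A minimal sketch of that encapsulation pattern, with hypothetical names:

    class Widget
    {
      public:
        Widget(Stats::Group *statParent) : stats(statParent) {}

        // Narrow mutator so collaborators never touch stats directly.
        void incFoo(int n) { stats.foo[n]++; }

      protected:
        struct WidgetStats : public Stats::Group
        {
            WidgetStats(Stats::Group *parent)
                : Stats::Group(parent), ADD_STAT(foo, "foo count per category")
            {
                foo.init(4);
            }

            Stats::Vector foo;
        } stats;
    };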
w->reservedScalarRegs = 0;
w->startSgprIndex = 0;
}
-
-void
-StaticRegisterManagerPolicy::regStats()
-{
-}
int scalarDemand) override;
void freeRegisters(Wavefront *w) override;
-
- void regStats() override;
};
#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__
false, Event::CPU_Tick_Pri),
cleanupEvent([this]{ processCleanupEvent(); },
"Cleanup issuedTranslationsTable hashmap",
- false, Event::Maximum_Pri)
+ false, Event::Maximum_Pri),
+ stats(this)
{
// create the response ports based on the number of connected ports
for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
sender_state->reqCnt.push_back(req_cnt);
// update statistics
- coalescer->uncoalescedAccesses++;
+ coalescer->stats.uncoalescedAccesses++;
req_cnt = sender_state->reqCnt.back();
DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
- coalescer->queuingCycles -= (curTick() * req_cnt);
- coalescer->localqueuingCycles -= curTick();
+ coalescer->stats.queuingCycles -= (curTick() * req_cnt);
+ coalescer->stats.localqueuingCycles -= curTick();
}
// FIXME if you want to coalesce not based on the issueTime
// and make necessary allocations.
if (!coalescedReq_cnt || !didCoalesce) {
if (update_stats)
- coalescer->coalescedAccesses++;
+ coalescer->stats.coalescedAccesses++;
std::vector<PacketPtr> new_array;
new_array.push_back(pkt);
bool update_stats = !sender_state->prefetch;
if (update_stats)
- coalescer->uncoalescedAccesses++;
+ coalescer->stats.uncoalescedAccesses++;
// If there is a pending timing request for this virtual address
// print a warning message. This is a temporary caveat of
// by the one we just sent counting all the way from
// the top of TLB hiearchy (i.e., from the CU)
int req_cnt = tmp_sender_state->reqCnt.back();
- queuingCycles += (curTick() * req_cnt);
+ stats.queuingCycles += (curTick() * req_cnt);
DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
name(), req_cnt);
// pkt_cnt is number of packets we coalesced into the one
// we just sent but only at this coalescer level
int pkt_cnt = iter->second[vector_index].size();
- localqueuingCycles += (curTick() * pkt_cnt);
+ stats.localqueuingCycles += (curTick() * pkt_cnt);
}
DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
}
}
-void
-TLBCoalescer::regStats()
+TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
+ ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
+ ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
+ ADD_STAT(localqueuingCycles,
+ "Number of cycles spent in queue for all incoming reqs"),
+ ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
{
- ClockedObject::regStats();
-
- uncoalescedAccesses
- .name(name() + ".uncoalesced_accesses")
- .desc("Number of uncoalesced TLB accesses")
- ;
-
- coalescedAccesses
- .name(name() + ".coalesced_accesses")
- .desc("Number of coalesced TLB accesses")
- ;
-
- queuingCycles
- .name(name() + ".queuing_cycles")
- .desc("Number of cycles spent in queue")
- ;
-
- localqueuingCycles
- .name(name() + ".local_queuing_cycles")
- .desc("Number of cycles spent in queue for all incoming reqs")
- ;
-
- localLatency
- .name(name() + ".local_latency")
- .desc("Avg. latency over all incoming pkts")
- ;
-
localLatency = localqueuingCycles / uncoalescedAccesses;
}
CoalescingTable issuedTranslationsTable;
- // number of packets the coalescer receives
- Stats::Scalar uncoalescedAccesses;
- // number packets the coalescer send to the TLB
- Stats::Scalar coalescedAccesses;
-
- // Number of cycles the coalesced requests spend waiting in
- // coalescerFIFO. For each packet the coalescer receives we take into
- // account the number of all uncoalesced requests this pkt "represents"
- Stats::Scalar queuingCycles;
-
- // On average how much time a request from the
- // uncoalescedAccesses that reaches the TLB
- // spends waiting?
- Stats::Scalar localqueuingCycles;
- // localqueuingCycles/uncoalescedAccesses
- Stats::Formula localLatency;
-
bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
void updatePhysAddresses(PacketPtr pkt);
- void regStats() override;
class CpuSidePort : public ResponsePort
{
// this FIFO queue keeps track of the virt. page
// addresses that are pending cleanup
std::queue<Addr> cleanupQueue;
+
+ protected:
+ struct TLBCoalescerStats : public Stats::Group
+ {
+ TLBCoalescerStats(Stats::Group *parent);
+
+ // number of packets the coalescer receives
+ Stats::Scalar uncoalescedAccesses;
+ // number of packets the coalescer sends to the TLB
+ Stats::Scalar coalescedAccesses;
+
+ // Number of cycles the coalesced requests spend waiting in
+ // coalescerFIFO. For each packet the coalescer receives we take into
+ // account the number of all uncoalesced requests this pkt "represents"
+ Stats::Scalar queuingCycles;
+
+ // average time a request counted in uncoalescedAccesses
+ // spends waiting before it reaches the TLB
+ Stats::Scalar localqueuingCycles;
+ // localqueuingCycles/uncoalescedAccesses
+ Stats::Formula localLatency;
+ } stats;
};
#endif // __TLB_COALESCER_HH__
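A final note on naming: the identifiers visible in stats.txt change shape under this conversion, since Stats::Group derives them from the owning object rather than from the hand-assembled name() strings deleted above. Roughly, assuming a hypothetical parent object named system.coalescer:

    // Old style:  .name(name() + ".local_queuing_cycles")
    //             -> system.coalescer.local_queuing_cycles
    //
    // New style:  Stats::Group(parent) + ADD_STAT(localqueuingCycles, ...)
    //             -> system.coalescer.localqueuingCycles
    //
    // With a named subgroup, Stats::Group(parent, "SubUnit"):
    //             -> system.coalescer.SubUnit.localqueuingCycles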
->mapVgpr(w, vgprIdx + j);
if (regBusy(pVgpr)) {
if (ii->isDstOperand(i)) {
- w->numTimesBlockedDueWAXDependencies++;
+ w->stats.numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pVgpr);
- w->numTimesBlockedDueRAWDependencies++;
+ w->stats.numTimesBlockedDueRAWDependencies++;
}
return false;
}
{
// increment count of number of DWORDs read from VRF
int DWORDs = ii->numSrcVecDWORDs();
- registerReads += (DWORDs * w->execMask().count());
+ stats.registerReads += (DWORDs * w->execMask().count());
uint64_t mask = w->execMask().to_ullong();
int srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
- sramReads += DWORDs;
+ stats.sramReads += DWORDs;
}
mask = mask >> 4;
}
// increment count of number of DWORDs written to VRF
DWORDs = ii->numDstVecDWORDs();
- registerWrites += (DWORDs * w->execMask().count());
+ stats.registerWrites += (DWORDs * w->execMask().count());
mask = w->execMask().to_ullong();
srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
- sramWrites += DWORDs;
+ stats.sramWrites += DWORDs;
}
mask = mask >> 4;
}
}
// increment count of number of DWORDs written to VRF
int DWORDs = ii->numDstVecDWORDs();
- registerWrites += (DWORDs * ii->exec_mask.count());
+ stats.registerWrites += (DWORDs * ii->exec_mask.count());
uint64_t mask = ii->exec_mask.to_ullong();
int srams = ii->exec_mask.size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
- sramWrites += DWORDs;
+ stats.sramWrites += DWORDs;
}
mask = mask >> 4;
}
maxIbSize(p.max_ib_size), _gpuISA(*this),
vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
- barId(WFBarrier::InvalidID)
+ barId(WFBarrier::InvalidID), stats(this)
{
lastTrace = 0;
execUnitId = -1;
vecReads.clear();
}
-void
-Wavefront::regStats()
-{
- SimObject::regStats();
-
- // FIXME: the name of the WF needs to be unique
- numTimesBlockedDueWAXDependencies
- .name(name() + ".timesBlockedDueWAXDependencies")
- .desc("number of times the wf's instructions are blocked due to WAW "
- "or WAR dependencies")
- ;
-
- // FIXME: the name of the WF needs to be unique
- numTimesBlockedDueRAWDependencies
- .name(name() + ".timesBlockedDueRAWDependencies")
- .desc("number of times the wf's instructions are blocked due to RAW "
- "dependencies")
- ;
-
- numInstrExecuted
- .name(name() + ".num_instr_executed")
- .desc("number of instructions executed by this WF slot")
- ;
-
- schCycles
- .name(name() + ".sch_cycles")
- .desc("number of cycles spent in schedule stage")
- ;
-
- schStalls
- .name(name() + ".sch_stalls")
- .desc("number of cycles WF is stalled in SCH stage")
- ;
-
- schRfAccessStalls
- .name(name() + ".sch_rf_access_stalls")
- .desc("number of cycles wave selected in SCH but RF denied adding "
- "instruction")
- ;
-
- schResourceStalls
- .name(name() + ".sch_resource_stalls")
- .desc("number of cycles stalled in sch by resource not available")
- ;
-
- schOpdNrdyStalls
- .name(name() + ".sch_opd_nrdy_stalls")
- .desc("number of cycles stalled in sch waiting for RF reads to "
- "complete")
- ;
-
- schLdsArbStalls
- .name(name() + ".sch_lds_arb_stalls")
- .desc("number of cycles wave stalled due to LDS-VRF arbitration")
- ;
-
- vecRawDistance
- .init(0,20,1)
- .name(name() + ".vec_raw_distance")
- .desc("Count of RAW distance in dynamic instructions for this WF")
- ;
-
- readsPerWrite
- .init(0,4,1)
- .name(name() + ".vec_reads_per_write")
- .desc("Count of Vector reads per write for this WF")
- ;
-}
-
void
Wavefront::init()
{
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
- computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
- computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
- computeUnit->numInstrExecuted++;
- numInstrExecuted++;
+ computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecOperands());
+ computeUnit->shader->incVectorInstDstOperand(ii->numDstVecOperands());
+ computeUnit->stats.numInstrExecuted++;
+ stats.numInstrExecuted++;
computeUnit->instExecPerSimd[simdId]++;
- computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
- computeUnit->lastExecCycle[simdId]);
- computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
+ computeUnit->stats.execRateDist.sample(
+ computeUnit->stats.totalCycles.value() -
+ computeUnit->lastExecCycle[simdId]);
+ computeUnit->lastExecCycle[simdId] =
+ computeUnit->stats.totalCycles.value();
if (lastInstExec) {
- computeUnit->instInterleave[simdId].
+ computeUnit->stats.instInterleave[simdId].
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
}
lastInstExec = computeUnit->instExecPerSimd[simdId];
if (ii->isSrcOperand(i)) {
// This check should never fail, but to be safe we check
if (rawDist.find(vgpr+n) != rawDist.end()) {
- vecRawDistance.
- sample(numInstrExecuted.value() - rawDist[vgpr+n]);
+ stats.vecRawDistance.sample(
+ stats.numInstrExecuted.value() - rawDist[vgpr+n]);
}
// increment number of reads to this register
vecReads[vgpr+n]++;
// for the first write to each physical register
if (rawDist.find(vgpr+n) != rawDist.end()) {
// sample the number of reads that were performed
- readsPerWrite.sample(vecReads[vgpr+n]);
+ stats.readsPerWrite.sample(vecReads[vgpr+n]);
}
// on a write, reset count of reads to 0
vecReads[vgpr+n] = 0;
- rawDist[vgpr+n] = numInstrExecuted.value();
+ rawDist[vgpr+n] = stats.numInstrExecuted.value();
}
}
}
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
- computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
- computeUnit->numVecOpsExecuted += num_active_lanes;
+ computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
+ computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
"either F32 or F64.");
}
- computeUnit->numVecOpsExecutedF16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
if (ii->isFMA()) {
- computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAC()) {
- computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAD()) {
- computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
fatal("Instruction is tagged as both (1) F32, and (2)"
"either F16 or F64.");
}
- computeUnit->numVecOpsExecutedF32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
if (ii->isFMA()) {
- computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAC()) {
- computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAD()) {
- computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
fatal("Instruction is tagged as both (1) F64, and (2)"
"either F16 or F32.");
}
- computeUnit->numVecOpsExecutedF64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
if (ii->isFMA()) {
- computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAC()) {
- computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
else if (ii->isMAD()) {
- computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
- computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
+ computeUnit->stats.numVecOpsExecutedTwoOpFP
+ += num_active_lanes;
}
}
if (isGmInstruction(ii)) {
- computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
+ computeUnit->stats.activeLanesPerGMemInstrDist.sample(
+ num_active_lanes);
} else if (isLmInstruction(ii)) {
- computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
+ computeUnit->stats.activeLanesPerLMemInstrDist.sample(
+ num_active_lanes);
}
}
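Note the counting convention in the block above: every numVecOpsExecuted*
counter advances by the number of active lanes in the execution mask, so these
stats measure lane-level operations rather than instructions, and each
FMA/MAC/MAD additionally feeds numVecOpsExecutedTwoOpFP. An illustrative
helper capturing the convention (countVecOp is not part of the patch;
VectorMask is gem5's std::bitset-based lane mask):

    // Advance an op counter by the active-lane count, not by one.
    void
    countVecOp(Stats::Scalar &counter, const VectorMask &mask)
    {
        counter += mask.count();   // std::bitset popcount
    }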
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesVMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
computeUnit->vrf_gm_bus_latency;
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->srf_scm_bus_latency));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesScMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
computeUnit->srf_scm_bus_latency;
}
// GM or Flat as GM Store
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesVMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesScMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
} else if ((ii->isAtomic() || ii->isMemSync()) &&
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesVMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesScMemPerSimd[simdId] +=
+ computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
// LM or Flat as LM Load
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
computeUnit->vectorSharedMemUnit.
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesLdsPerSimd[simdId] +=
+ computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
computeUnit->vrf_lm_bus_latency;
// LM or Flat as LM Store
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesLdsPerSimd[simdId] +=
+ computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
// LM or Flat as LM, Atomic or MemFence
} else if ((ii->isAtomic() || ii->isMemSync()) &&
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
- computeUnit->instCyclesLdsPerSimd[simdId] +=
+ computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
} else {
panic("Bad instruction type!\n");
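Across the memory-pipe hunks above, the per-SIMD instCyclesVMemPerSimd,
instCyclesScMemPerSimd, and instCyclesLdsPerSimd stats charge one
register-file bus latency for loads and two for stores, atomics, and memory
fences, on whichever bus (vector global, scalar, or LDS) the instruction uses.
A condensed sketch of that charging rule for the vector global bus, with
illustrative locals:

    // One bus transit for a load, two for a store/atomic/fence.
    Cycles busLat = inst->isLoad() ? vrf_gm_bus_latency
                                   : Cycles(2 * vrf_gm_bus_latency);
    stats.instCyclesVMemPerSimd[simdId] += busLat;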
{
barId = WFBarrier::InvalidID;
}
+
+Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent)
+ : Stats::Group(parent),
+ ADD_STAT(numInstrExecuted,
+ "number of instructions executed by this WF slot"),
+ ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
+ ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
+ ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
+ "RF denied adding instruction"),
+ ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
+ " not available"),
+ ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
+ "RF reads to complete"),
+ ADD_STAT(schLdsArbStalls,
+ "number of cycles wave stalled due to LDS-VRF arbitration"),
+ // FIXME: the name of the WF needs to be unique
+ ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
+ "instructions are blocked due to WAW or WAR dependencies"),
+ // FIXME: the name of the WF needs to be unique
+ ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
+ "instructions are blocked due to RAW dependencies"),
+ ADD_STAT(vecRawDistance,
+ "Count of RAW distance in dynamic instructions for this WF"),
+ ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
+{
+ vecRawDistance.init(0, 20, 1);
+ readsPerWrite.init(0, 4, 1);
+}
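The constructor just added follows the standard Stats::Group recipe: derive
from Stats::Group, forward the parent, name and describe each member with the
ADD_STAT macro from base/stats/group.hh, and configure distribution buckets in
the constructor body now that there is no regStats() hook to do it in. In
isolation, with illustrative names, the recipe looks like:

    #include "base/statistics.hh"
    #include "base/stats/group.hh"

    struct ExampleStats : public Stats::Group
    {
        ExampleStats(Stats::Group *parent)
            : Stats::Group(parent),
              ADD_STAT(numOps, "number of operations executed"),
              ADD_STAT(opDist, "distribution of something per operation")
        {
            // Bucket parameters still come from init(); with regStats()
            // gone, the call moves into the constructor body.
            opDist.init(0, 20, 1);
        }

        Stats::Scalar numOps;
        Stats::Distribution opDist;
    };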
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/compute_unit.hh"
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
- // Wavefront slot stats
-
- // Number of instructions executed by this wavefront slot across all
- // dynamic wavefronts
- Stats::Scalar numInstrExecuted;
-
- // Number of cycles this WF spends in SCH stage
- Stats::Scalar schCycles;
-
- // Number of stall cycles encounterd by this WF in SCH stage
- Stats::Scalar schStalls;
-
- // The following stats sum to the value of schStalls, and record, per
- // WF slot, what the cause of each stall was at a coarse granularity.
-
- // Cycles WF is selected by scheduler, but RFs cannot support instruction
- Stats::Scalar schRfAccessStalls;
- // Cycles spent waiting for execution resources
- Stats::Scalar schResourceStalls;
- // cycles spent waiting for RF reads to complete in SCH stage
- Stats::Scalar schOpdNrdyStalls;
- // LDS arbitration stall cycles. WF attempts to execute LM instruction,
- // but another wave is executing FLAT, which requires LM and GM and forces
- // this WF to stall.
- Stats::Scalar schLdsArbStalls;
-
- // number of times an instruction of a WF is blocked from being issued
- // due to WAR and WAW dependencies
- Stats::Scalar numTimesBlockedDueWAXDependencies;
- // number of times an instruction of a WF is blocked from being issued
- // due to WAR and WAW dependencies
- Stats::Scalar numTimesBlockedDueRAWDependencies;
-
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
- // Distribution to track the distance between producer and consumer
- // for vector register values
- Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
- // Distribution to track the number of times every vector register
- // value produced is consumed.
- Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
- void regStats();
Addr pc() const;
void pc(Addr new_pc);
Addr _pc;
VectorMask _execMask;
int barId;
+
+ public:
+ struct WavefrontStats : public Stats::Group
+ {
+ WavefrontStats(Stats::Group *parent);
+
+ // Number of instructions executed by this wavefront slot across all
+ // dynamic wavefronts
+ Stats::Scalar numInstrExecuted;
+
+ // Number of cycles this WF spends in SCH stage
+ Stats::Scalar schCycles;
+
+        // Number of stall cycles encountered by this WF in SCH stage
+ Stats::Scalar schStalls;
+
+ // The following stats sum to the value of schStalls, and record, per
+ // WF slot, what the cause of each stall was at a coarse granularity.
+
+ // Cycles WF is selected by scheduler, but RFs cannot support
+ // instruction
+ Stats::Scalar schRfAccessStalls;
+ // Cycles spent waiting for execution resources
+ Stats::Scalar schResourceStalls;
+ // cycles spent waiting for RF reads to complete in SCH stage
+ Stats::Scalar schOpdNrdyStalls;
+ // LDS arbitration stall cycles. WF attempts to execute LM instruction,
+ // but another wave is executing FLAT, which requires LM and GM and
+ // forces this WF to stall.
+ Stats::Scalar schLdsArbStalls;
+
+ // number of times an instruction of a WF is blocked from being issued
+ // due to WAR and WAW dependencies
+ Stats::Scalar numTimesBlockedDueWAXDependencies;
+        // number of times an instruction of a WF is blocked from being
+        // issued due to RAW dependencies
+        Stats::Scalar numTimesBlockedDueRAWDependencies;
+
+ // Distribution to track the distance between producer and consumer
+ // for vector register values
+ Stats::Distribution vecRawDistance;
+
+ // Distribution to track the number of times every vector register
+ // value produced is consumed.
+ Stats::Distribution readsPerWrite;
+ } stats;
};
#endif // __GPU_COMPUTE_WAVEFRONT_HH__
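With the group declared as the trailing stats member above, the owning object
constructs it with itself as the parent group and reaches every counter
through it, which is exactly the stats(this) initializer and
stats.numInstrExecuted++ updates elsewhere in this change. A hedged usage
sketch; ExampleObject reuses the illustrative ExampleStats from earlier and is
not part of the patch:

    #include "sim/sim_object.hh"

    class ExampleObject : public SimObject
    {
      public:
        ExampleObject(const Params &p)
            : SimObject(p), stats(this) // reported as "<name>.numOps", etc.
        {}

        void
        tick()
        {
            stats.numOps++;         // formerly a bare member plus regStats()
            stats.opDist.sample(7);
        }

      private:
        ExampleStats stats;
    };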