previousCycle(0), previousState(CPU_STATE_SLEEP),
       functionTraceStream(nullptr), currentFunctionStart(0),
       currentFunctionEnd(0), functionEntryTick(0),
+      baseStats(this),
       addressMonitor(p.numThreads),
       syscallRetryLatency(p.syscallRetryLatency),
       pwrGatingLatency(p.pwr_gating_latency),
         ppRetiredBranches->notify(1);
 }
 
+BaseCPU:: // Constructor of the BaseCPUStats group nested in BaseCPU.
+BaseCPUStats::BaseCPUStats(Stats::Group *parent)
+    : Stats::Group(parent), // forward `parent` to the Stats::Group base
+      ADD_STAT(numCycles, "Number of cpu cycles simulated"),
+      ADD_STAT(numWorkItemsStarted, "Number of work items this cpu started"),
+      ADD_STAT(numWorkItemsCompleted,
+               "Number of work items this cpu completed")
+{ // Empty body: the three scalars are initialized via ADD_STAT above.
+}
+
 void
 BaseCPU::regStats()
 {
 
     using namespace Stats;
 
-    numCycles
-        .name(name() + ".numCycles")
-        .desc("number of cpu cycles simulated")
-        ;
-
-    numWorkItemsStarted
-        .name(name() + ".numWorkItemsStarted")
-        .desc("number of work items this cpu started")
-        ;
-
-    numWorkItemsCompleted
-        .name(name() + ".numWorkItemsCompleted")
-        .desc("number of work items this cpu completed")
-        ;
-
     int size = threadContexts.size();
     if (size > 1) {
         for (int i = 0; i < size; ++i) {
 
     uint32_t getPid() const { return _pid; }
     void setPid(uint32_t pid) { _pid = pid; }
 
-    inline void workItemBegin() { numWorkItemsStarted++; }
-    inline void workItemEnd() { numWorkItemsCompleted++; }
+    /** Count one more work item started by this CPU. */
+    void workItemBegin() { ++baseStats.numWorkItemsStarted; }
+    /** Count one more work item completed by this CPU. */
+    void workItemEnd() { ++baseStats.numWorkItemsCompleted; }
     // @todo remove me after debugging with legion done
     Tick instCount() { return instCnt; }
 
     }
 
   public:
-    // Number of CPU cycles simulated
-    Stats::Scalar numCycles;
-    Stats::Scalar numWorkItemsStarted;
-    Stats::Scalar numWorkItemsCompleted;
+    struct BaseCPUStats : public Stats::Group // BaseCPU's own stat group
+    {
+        BaseCPUStats(Stats::Group *parent); // parent goes to Stats::Group
+        // Number of CPU cycles simulated
+        Stats::Scalar numCycles;
+        Stats::Scalar numWorkItemsStarted; // Work items this CPU started
+        Stats::Scalar numWorkItemsCompleted; // Work items this CPU completed
+    } baseStats; // Instance held by BaseCPU, constructed as baseStats(this)
 
   private:
     std::vector<AddressMonitor> addressMonitor;
 
         while (!result.empty()) {
             result.pop();
         }
-        numCycles++;
+        baseStats.numCycles++;
 
         Fault fault = NoFault;
 
 
     assert(_status == Idle);
     assert(!tickEvent.scheduled());
 
-    numCycles += ticksToCycles(thread->lastActivate - thread->lastSuspend);
+    baseStats.numCycles +=
+        ticksToCycles(thread->lastActivate - thread->lastSuspend);
 
     schedule(tickEvent, clockEdge(Cycles(0)));
     _status = Running;
         ticksExecuted = runTimer->ticksFromHostCycles(hostCyclesExecuted);
 
         /* Update statistics */
-        numCycles += simCyclesExecuted;;
+        baseStats.numCycles += simCyclesExecuted;
         stats.committedInsts += instsExecuted;
         ctrInsts += instsExecuted;
         system->totalNumInsts += instsExecuted;
 
 {
 
 Pipeline::Pipeline(MinorCPU &cpu_, const MinorCPUParams ¶ms) :
-    Ticked(cpu_, &(cpu_.BaseCPU::numCycles)),
+    Ticked(cpu_, &(cpu_.BaseCPU::baseStats.numCycles)),
     cpu(cpu_),
     allow_idling(params.enableIdling),
     f1ToF2(cpu.name() + ".f1ToF2", "lines",
 
     quiesceCycles.prereq(quiesceCycles);
 
     cpi.precision(6);
-    cpi = base_cpu->numCycles / numInsts;
+    cpi = base_cpu->baseStats.numCycles / numInsts;
 
     ipc.precision(6);
-    ipc = numInsts / base_cpu->numCycles;
+    ipc = numInsts / base_cpu->baseStats.numCycles;
 
     committedInstType
         .init(base_cpu->numThreads, Enums::Num_OpClass)
 
 
     cpi
         .precision(6);
-    cpi = cpu->numCycles / committedInsts;
+    cpi = cpu->baseStats.numCycles / committedInsts;
 
     totalCpi
         .precision(6);
-    totalCpi = cpu->numCycles / sum(committedInsts);
+    totalCpi = cpu->baseStats.numCycles / sum(committedInsts);
 
     ipc
         .precision(6);
-    ipc =  committedInsts / cpu->numCycles;
+    ipc = committedInsts / cpu->baseStats.numCycles;
 
     totalIpc
         .precision(6);
-    totalIpc =  sum(committedInsts) / cpu->numCycles;
+    totalIpc = sum(committedInsts) / cpu->baseStats.numCycles;
 
     intRegfileReads
         .prereq(intRegfileReads);
     assert(!switchedOut());
     assert(drainState() != DrainState::Drained);
 
-    ++numCycles;
+    ++baseStats.numCycles;
     updateCycleCounters(BaseCPU::CPU_STATE_ON);
 
 //    activity = false;
     if (cycles > 1) {
         --cycles;
         cpuStats.idleCycles += cycles;
-        numCycles += cycles;
+        baseStats.numCycles += cycles;
     }
 
     schedule(tickEvent, clockEdge());
 
 FetchStatGroup::FetchStatGroup(O3CPU *cpu, DefaultFetch *fetch)
     : Stats::Group(cpu, "fetch"),
     ADD_STAT(icacheStallCycles,
-     "Number of cycles fetch is stalled on an Icache miss"),
+             "Number of cycles fetch is stalled on an Icache miss"),
     ADD_STAT(insts, "Number of instructions fetch has processed"),
     ADD_STAT(branches, "Number of branches that fetch encountered"),
     ADD_STAT(predictedBranches,
-     "Number of branches that fetch has predicted taken"),
+             "Number of branches that fetch has predicted taken"),
     ADD_STAT(cycles,
-     "Number of cycles fetch has run and was not squashing or blocked"),
+             "Number of cycles fetch has run and was not squashing or "
+             "blocked"),
     ADD_STAT(squashCycles, "Number of cycles fetch has spent squashing"),
     ADD_STAT(tlbCycles,
-     "Number of cycles fetch has spent waiting for tlb"),
+             "Number of cycles fetch has spent waiting for tlb"),
     ADD_STAT(idleCycles, "Number of cycles fetch was idle"),
     ADD_STAT(blockedCycles, "Number of cycles fetch has spent blocked"),
     ADD_STAT(miscStallCycles,
-     "Number of cycles fetch has spent waiting on interrupts,"
-      "or bad addresses, or out of MSHRs"),
+             "Number of cycles fetch has spent waiting on interrupts, "
+             "or bad addresses, or out of MSHRs"),
     ADD_STAT(pendingDrainCycles,
-     "Number of cycles fetch has spent waiting on pipes to drain"),
+             "Number of cycles fetch has spent waiting on pipes to drain"),
     ADD_STAT(noActiveThreadStallCycles,
-     "Number of stall cycles due to no active thread to fetch from"),
+             "Number of stall cycles due to no active thread to fetch from"),
     ADD_STAT(pendingTrapStallCycles,
-     "Number of stall cycles due to pending traps"),
+             "Number of stall cycles due to pending traps"),
     ADD_STAT(pendingQuiesceStallCycles,
-     "Number of stall cycles due to pending quiesce instructions"),
+             "Number of stall cycles due to pending quiesce instructions"),
     ADD_STAT(icacheWaitRetryStallCycles,
-     "Number of stall cycles due to full MSHR"),
+             "Number of stall cycles due to full MSHR"),
     ADD_STAT(cacheLines, "Number of cache lines fetched"),
     ADD_STAT(icacheSquashes,
-     "Number of outstanding Icache misses that were squashed"),
+             "Number of outstanding Icache misses that were squashed"),
     ADD_STAT(tlbSquashes,
-     "Number of outstanding ITLB misses that were squashed"),
+             "Number of outstanding ITLB misses that were squashed"),
     ADD_STAT(nisnDist,
-     "Number of instructions fetched each cycle (Total)"),
+             "Number of instructions fetched each cycle (Total)"),
     ADD_STAT(idleRate, "Percent of cycles fetch was idle",
-     idleCycles * 100 / cpu->numCycles),
+             idleCycles * 100 / cpu->baseStats.numCycles),
     ADD_STAT(branchRate, "Number of branch fetches per cycle",
-     branches / cpu->numCycles),
+             branches / cpu->baseStats.numCycles),
     ADD_STAT(rate, "Number of inst fetches per cycle",
-     insts / cpu->numCycles)
+             insts / cpu->baseStats.numCycles)
 {
         icacheStallCycles
             .prereq(icacheStallCycles);
 
 
     wbRate
         .flags(Stats::total);
-    wbRate = writebackCount / cpu->numCycles;
+    wbRate = writebackCount / cpu->baseStats.numCycles;
 
     wbFanout
         .flags(Stats::total);
     ADD_STAT(numRefs, "Number of memory reference insts executed"),
     ADD_STAT(numBranches, "Number of branches executed"),
     ADD_STAT(numStoreInsts, "Number of stores executed"),
-    ADD_STAT(numRate, "Inst execution rate", numInsts / cpu->numCycles)
+    ADD_STAT(numRate, "Inst execution rate",
+             numInsts / cpu->baseStats.numCycles)
 {
     numLoadInsts
         .init(cpu->numThreads)
 
     ADD_STAT(numIssuedDist, "Number of insts issued each cycle"),
     ADD_STAT(statFuBusy, "attempts to use FU when none available"),
     ADD_STAT(statIssuedInstType, "Type of FU issued"),
-    ADD_STAT(issueRate, "Inst issue rate", instsIssued / cpu->numCycles),
+    ADD_STAT(issueRate, "Inst issue rate",
+             instsIssued / cpu->baseStats.numCycles),
     ADD_STAT(fuBusy, "FU busy when requested"),
     ADD_STAT(fuBusyRate, "FU busy rate (busy events/executed inst)")
 {
 
     threadInfo[thread_num]->execContextStats.notIdleFraction = 1;
     Cycles delta = ticksToCycles(threadInfo[thread_num]->thread->lastActivate -
                                  threadInfo[thread_num]->thread->lastSuspend);
-    numCycles += delta;
+    baseStats.numCycles += delta;
 
     if (!tickEvent.scheduled()) {
         //Make sure ticks are still on multiples of cycles
     Tick latency = 0;
 
     for (int i = 0; i < width || locked; ++i) {
-        numCycles++;
+        baseStats.numCycles++;
         updateCycleCounters(BaseCPU::CPU_STATE_ON);
 
         if (!curStaticInst || !curStaticInst->isDelayedCommit()) {
 
             }
 
             idleFraction = Stats::constant(1.0) - notIdleFraction;
-            numIdleCycles = idleFraction * cpu->numCycles;
-            numBusyCycles = notIdleFraction * cpu->numCycles;
+            numIdleCycles = idleFraction * cpu->baseStats.numCycles;
+            numBusyCycles = notIdleFraction * cpu->baseStats.numCycles;
 
             numBranches
                 .prereq(numBranches);
 
 {
     const Cycles delta(curCycle() - previousCycle);
 
-    numCycles += delta;
+    baseStats.numCycles += delta;
 
     previousCycle = curCycle();
 }
 
     DPRINTF(TraceCPUData, "DcacheGen event.\n");
 
     // Update stat for numCycles
-    numCycles = clockEdge() / clockPeriod();
+    baseStats.numCycles = clockEdge() / clockPeriod();
 
     dcacheGen.execute();
     if (dcacheGen.isExecComplete()) {
      "Number of events scheduled to trigger instruction request generator"),
     ADD_STAT(numOps, "Number of micro-ops simulated by the Trace CPU"),
     ADD_STAT(cpi, "Cycles per micro-op used as a proxy for CPI",
-     trace->numCycles / numOps)
+     trace->baseStats.numCycles / numOps)
 {
         cpi.precision(6);
 }