From: Curtis Dunham <Curtis.Dunham@arm.com>
Date: Tue, 23 Dec 2014 14:31:18 +0000 (-0500)
Subject: arm: Add stats to table walker
X-Git-Tag: stable_2015_04_15~46
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4d88978913c57e0cd10751d31d7f5b95c1e00170;p=gem5.git

arm: Add stats to table walker

This patch adds table walker stats for:
- Walk events
- Instruction vs Data
- Page size histogram
- Wait time and service time histograms
- Pending requests histogram (per cycle) - measures dist. of L
  (p(1..) = how often busy, p(0) = how often idle)
- Squashes, before starting and after completion
---

diff --git a/src/arch/arm/table_walker.cc b/src/arch/arm/table_walker.cc
index 6a0f0144a..6a1ac5577 100644
--- a/src/arch/arm/table_walker.cc
+++ b/src/arch/arm/table_walker.cc
@@ -60,6 +60,8 @@ TableWalker::TableWalker(const Params *p)
       stage2Mmu(NULL), isStage2(p->is_stage2), tlb(NULL),
       currState(NULL), pending(false), masterId(p->sys->getMasterId(name())),
       numSquashable(p->num_squash_per_cycle),
+      pendingReqs(0),
+      pendingChangeTick(curTick()),
       doL1DescEvent(this), doL2DescEvent(this),
       doL0LongDescEvent(this), doL1LongDescEvent(this), doL2LongDescEvent(this),
       doL3LongDescEvent(this),
@@ -151,6 +153,7 @@ TableWalker::drainResume()
     if (params()->sys->isTimingMode() && currState) {
         delete currState;
         currState = NULL;
+        pendingChange();
     }
 }
 
@@ -170,6 +173,8 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid,
                   bool secure, TLB::ArmTranslationType tranType)
 {
     assert(!(_functional && _timing));
+    ++statWalks;
+
     WalkerState *savedCurrState = NULL;
 
     if (!currState && !_functional) {
@@ -196,10 +201,13 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid,
         // this fault to re-execute the faulting instruction which should clean
         // up everything.
         if (currState->vaddr_tainted == _req->getVaddr()) {
+            ++statSquashedBefore;
             return std::make_shared<ReExec>();
         }
     }
+    pendingChange();
 
+    currState->startTime = curTick();
     currState->tc = _tc;
     currState->aarch64 = opModeIs64(currOpMode(_tc));
     currState->el = currEL(_tc);
@@ -261,6 +269,8 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid,
     currState->isFetch = (currState->mode == TLB::Execute);
     currState->isWrite = (currState->mode == TLB::Write);
 
+    statRequestOrigin[REQUESTED][currState->isFetch]++;
+
     // We only do a second stage of translation if we're not secure, or in
     // hyp mode, the second stage MMU is enabled, and this table walker
     // instance is the first stage.
@@ -280,6 +290,10 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid,
         currState->userTable = true;
         currState->xnTable = false;
         currState->pxnTable = false;
+
+        ++statWalksLongDescriptor;
+    } else {
+        ++statWalksShortDescriptor;
     }
 
     if (!currState->timing) {
@@ -303,8 +317,10 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid,
     if (pending || pendingQueue.size()) {
         pendingQueue.push_back(currState);
         currState = NULL;
+        pendingChange();
     } else {
         pending = true;
+        pendingChange();
         if (currState->aarch64)
             return processWalkAArch64();
         else if (long_desc_format)
@@ -321,6 +337,7 @@ TableWalker::processWalkWrapper()
 {
     assert(!currState);
     assert(pendingQueue.size());
+    pendingChange();
     currState = pendingQueue.front();
 
     ExceptionLevel target_el = EL0;
@@ -372,6 +389,7 @@ TableWalker::processWalkWrapper()
            (currState->transState->squashed() || te)) {
         pendingQueue.pop_front();
         num_squashed++;
+        statSquashedBefore++;
 
         DPRINTF(TLB, "Squashing table walk for address %#x\n",
                       currState->vaddr_tainted);
@@ -383,6 +401,7 @@ TableWalker::processWalkWrapper()
                 currState->req, currState->tc, currState->mode);
         } else {
             // translate the request now that we know it will work
+            statWalkServiceTime.sample(curTick() - currState->startTime);
             tlb->translateTiming(currState->req, currState->tc,
                         currState->transState, currState->mode);
 
@@ -402,8 +421,9 @@ TableWalker::processWalkWrapper()
             currState = NULL;
         }
     }
+    pendingChange();
 
-    // if we've still got pending translations schedule more work
+    // if we still have pending translations, schedule more work
     nextWalk(tc);
     currState = NULL;
 }
@@ -420,6 +440,8 @@ TableWalker::processWalk()
             currState->vaddr_tainted, currState->ttbcr, mbits(currState->vaddr, 31,
                                                       32 - currState->ttbcr.n));
 
+    statWalkWaitTime.sample(curTick() - currState->startTime);
+
     if (currState->ttbcr.n == 0 || !mbits(currState->vaddr, 31,
                                           32 - currState->ttbcr.n)) {
         DPRINTF(TLB, " - Selecting TTBR0\n");
@@ -511,6 +533,8 @@ TableWalker::processWalkLPAE()
     DPRINTF(TLB, "Beginning table walk for address %#x, TTBCR: %#x\n",
             currState->vaddr_tainted, currState->ttbcr);
 
+    statWalkWaitTime.sample(curTick() - currState->startTime);
+
     Request::Flags flag = 0;
     if (currState->isSecure)
         flag.set(Request::SECURE);
@@ -700,6 +724,8 @@ TableWalker::processWalkAArch64()
     static const GrainSize GrainMap_EL1_tg1[] =
       { ReservedGrain, Grain16KB, Grain4KB, Grain64KB };
 
+    statWalkWaitTime.sample(curTick() - currState->startTime);
+
     // Determine TTBR, table size, granule size and phys. address range
     Addr ttbr = 0;
     int tsz = 0, ps = 0;
@@ -920,8 +946,8 @@ TableWalker::processWalkAArch64()
             panic("Invalid table lookup level");
             break;
         }
-        port.dmaAction(MemCmd::ReadReq, desc_addr, sizeof(uint64_t), event,
-                       (uint8_t*) &currState->longDesc.data,
+        port.dmaAction(MemCmd::ReadReq, desc_addr, sizeof(uint64_t),
+                       event, (uint8_t*) &currState->longDesc.data,
                        currState->tc->getCpuPtr()->clockPeriod(), flag);
         DPRINTF(TLBVerbose,
                 "Adding to walker fifo: queue size before adding: %d\n",
@@ -1698,6 +1724,7 @@ TableWalker::doL1DescriptorWrapper()
     if (currState->fault != NoFault) {
         currState->transState->finish(currState->fault, currState->req,
                                       currState->tc, currState->mode);
+        statWalksShortTerminatedAtLevel[0]++;
 
         pending = false;
         nextWalk(currState->tc);
@@ -1711,9 +1738,11 @@ TableWalker::doL1DescriptorWrapper()
         // delay is not set so there is no L2 to do
         // Don't finish the translation if a stage 2 look up is underway
         if (!currState->doingStage2) {
+            statWalkServiceTime.sample(curTick() - currState->startTime);
             DPRINTF(TLBVerbose, "calling translateTiming again\n");
             currState->fault = tlb->translateTiming(currState->req, currState->tc,
                 currState->transState, currState->mode);
+            statWalksShortTerminatedAtLevel[0]++;
         }
 
         pending = false;
@@ -1749,13 +1778,16 @@ TableWalker::doL2DescriptorWrapper()
     if (currState->fault != NoFault) {
         currState->transState->finish(currState->fault, currState->req,
                                       currState->tc, currState->mode);
+        statWalksShortTerminatedAtLevel[1]++;
     }
     else {
         // Don't finish the translation if a stage 2 look up is underway
         if (!currState->doingStage2) {
+            statWalkServiceTime.sample(curTick() - currState->startTime);
             DPRINTF(TLBVerbose, "calling translateTiming again\n");
             currState->fault = tlb->translateTiming(currState->req,
                 currState->tc, currState->transState, currState->mode);
+            statWalksShortTerminatedAtLevel[1]++;
         }
     }
 
@@ -1832,9 +1864,11 @@ TableWalker::doLongDescriptorWrapper(LookupLevel curr_lookup_level)
         // Don't finish the translation if a stage 2 look up is underway
         if (!currState->doingStage2) {
             DPRINTF(TLBVerbose, "calling translateTiming again\n");
+            statWalkServiceTime.sample(curTick() - currState->startTime);
             currState->fault = tlb->translateTiming(currState->req, currState->tc,
                                                     currState->transState,
                                                     currState->mode);
+            statWalksLongTerminatedAtLevel[(unsigned) curr_lookup_level]++;
         }
 
         pending = false;
@@ -1906,7 +1940,7 @@ TableWalker::fetchDescriptor(Addr descAddr, uint8_t *data, int numBytes,
     } else {
         if (isTiming) {
             port.dmaAction(MemCmd::ReadReq, descAddr, numBytes, event, data,
-                           currState->tc->getCpuPtr()->clockPeriod(), flags);
+                           currState->tc->getCpuPtr()->clockPeriod(),flags);
             if (queueIndex >= 0) {
                 DPRINTF(TLBVerbose, "Adding to walker fifo: queue size before adding: %d\n",
                         stateQueues[queueIndex].size());
@@ -1956,6 +1990,9 @@ TableWalker::insertTableEntry(DescriptorBase &descriptor, bool longDescriptor)
     else
         te.el         = 1;
 
+    statPageSizes[pageSizeNtoStatBin(te.N)]++;
+    statRequestOrigin[COMPLETED][currState->isFetch]++;
+
     // ASID has no meaning for stage 2 TLB entries, so mark all stage 2 entries
     // as global
     te.global         = descriptor.global(currState) || isStage2;
@@ -2024,3 +2061,144 @@ TableWalker::toLookupLevel(uint8_t lookup_level_as_int)
         panic("Invalid lookup level conversion");
     }
 }
+
+/* this method keeps track of the table walker queue's residency, so
+ * needs to be called whenever requests start and complete. */
+void
+TableWalker::pendingChange()
+{
+    unsigned n = pendingQueue.size();
+    if ((currState != NULL) && (currState != pendingQueue.front())) {
+        ++n;
+    }
+
+    if (n != pendingReqs) {
+        Tick now = curTick();
+        statPendingWalks.sample(pendingReqs, now - pendingChangeTick);
+        pendingReqs = n;
+        pendingChangeTick = now;
+    }
+}
+
+uint8_t
+TableWalker::pageSizeNtoStatBin(uint8_t N)
+{
+    /* for statPageSizes */
+    switch(N) {
+        case 12: return 0; // 4K
+        case 14: return 1; // 16K (using 16K granule in v8-64)
+        case 16: return 2; // 64K
+        case 20: return 3; // 1M
+        case 21: return 4; // 2M-LPAE
+        case 24: return 5; // 16M
+        case 25: return 6; // 32M (using 16K granule in v8-64)
+        case 29: return 7; // 512M (using 64K granule in v8-64)
+        case 30: return 8; // 1G-LPAE
+        default:
+            panic("unknown page size");
+            return 255;
+    }
+}
+
+void
+TableWalker::regStats()
+{
+    statWalks
+        .name(name() + ".walks")
+        .desc("Table walker walks requested")
+        ;
+
+    statWalksShortDescriptor
+        .name(name() + ".walksShort")
+        .desc("Table walker walks initiated with short descriptors")
+        .flags(Stats::nozero)
+        ;
+
+    statWalksLongDescriptor
+        .name(name() + ".walksLong")
+        .desc("Table walker walks initiated with long descriptors")
+        .flags(Stats::nozero)
+        ;
+
+    statWalksShortTerminatedAtLevel
+        .init(2)
+        .name(name() + ".walksShortTerminationLevel")
+        .desc("Level at which table walker walks "
+              "with short descriptors terminate")
+        .flags(Stats::nozero)
+        ;
+    statWalksShortTerminatedAtLevel.subname(0, "Level1");
+    statWalksShortTerminatedAtLevel.subname(1, "Level2");
+
+    statWalksLongTerminatedAtLevel
+        .init(4)
+        .name(name() + ".walksLongTerminationLevel")
+        .desc("Level at which table walker walks "
+              "with long descriptors terminate")
+        .flags(Stats::nozero)
+        ;
+    statWalksLongTerminatedAtLevel.subname(0, "Level0");
+    statWalksLongTerminatedAtLevel.subname(1, "Level1");
+    statWalksLongTerminatedAtLevel.subname(2, "Level2");
+    statWalksLongTerminatedAtLevel.subname(3, "Level3");
+
+    statSquashedBefore
+        .name(name() + ".walksSquashedBefore")
+        .desc("Table walks squashed before starting")
+        .flags(Stats::nozero)
+        ;
+
+    statSquashedAfter
+        .name(name() + ".walksSquashedAfter")
+        .desc("Table walks squashed after completion")
+        .flags(Stats::nozero)
+        ;
+
+    statWalkWaitTime
+        .init(16)
+        .name(name() + ".walkWaitTime")
+        .desc("Table walker wait (enqueue to first request) latency")
+        .flags(Stats::pdf | Stats::nozero | Stats::nonan)
+        ;
+
+    statWalkServiceTime
+        .init(16)
+        .name(name() + ".walkCompletionTime")
+        .desc("Table walker service (enqueue to completion) latency")
+        .flags(Stats::pdf | Stats::nozero | Stats::nonan)
+        ;
+
+    statPendingWalks
+        .init(16)
+        .name(name() + ".walksPending")
+        .desc("Table walker pending requests distribution")
+        .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan)
+        ;
+
+    statPageSizes // see DDI 0487A D4-1661
+        .init(9)
+        .name(name() + ".walkPageSizes")
+        .desc("Table walker page sizes translated")
+        .flags(Stats::total | Stats::pdf | Stats::dist | Stats::nozero)
+        ;
+    statPageSizes.subname(0, "4K");
+    statPageSizes.subname(1, "16K");
+    statPageSizes.subname(2, "64K");
+    statPageSizes.subname(3, "1M");
+    statPageSizes.subname(4, "2M");
+    statPageSizes.subname(5, "16M");
+    statPageSizes.subname(6, "32M");
+    statPageSizes.subname(7, "512M");
+    statPageSizes.subname(8, "1G");
+
+    statRequestOrigin
+        .init(2,2) // Instruction/Data, requests/completed
+        .name(name() + ".walkRequestOrigin")
+        .desc("Table walker requests started/completed, data/inst")
+        .flags(Stats::total)
+        ;
+    statRequestOrigin.subname(0,"Requested");
+    statRequestOrigin.subname(1,"Completed");
+    statRequestOrigin.ysubname(0,"Data");
+    statRequestOrigin.ysubname(1,"Inst");
+}
diff --git a/src/arch/arm/table_walker.hh b/src/arch/arm/table_walker.hh
index b5d67c9a1..b265f5165 100644
--- a/src/arch/arm/table_walker.hh
+++ b/src/arch/arm/table_walker.hh
@@ -794,6 +794,12 @@ class TableWalker : public MemObject
 
         TableWalker *tableWalker;
 
+        /** Timestamp for calculating elapsed time in service (for stats) */
+        Tick startTime;
+
+        /** Page entries walked during service (for stats) */
+        unsigned levels;
+
         void doL1Descriptor();
         void doL2Descriptor();
 
@@ -883,6 +889,26 @@ class TableWalker : public MemObject
     bool _haveLargeAsid64;
     ArmSystem *armSys;
 
+    /** Statistics */
+    Stats::Scalar statWalks;
+    Stats::Scalar statWalksShortDescriptor;
+    Stats::Scalar statWalksLongDescriptor;
+    Stats::Vector statWalksShortTerminatedAtLevel;
+    Stats::Vector statWalksLongTerminatedAtLevel;
+    Stats::Scalar statSquashedBefore;
+    Stats::Scalar statSquashedAfter;
+    Stats::Histogram statWalkWaitTime;
+    Stats::Histogram statWalkServiceTime;
+    Stats::Histogram statPendingWalks; // essentially "L" of queueing theory
+    Stats::Vector statPageSizes;
+    Stats::Vector2d statRequestOrigin;
+
+    mutable unsigned pendingReqs;
+    mutable Tick pendingChangeTick;
+
+    static const unsigned REQUESTED = 0;
+    static const unsigned COMPLETED = 1;
+
   public:
    typedef ArmTableWalkerParams Params;
     TableWalker(const Params *p);
@@ -903,6 +929,7 @@ class TableWalker : public MemObject
     virtual void drainResume();
     virtual BaseMasterPort& getMasterPort(const std::string &if_name,
                                           PortID idx = InvalidPortID);
+    void regStats();
 
     /**
      * Allow the MMU (overseeing both stage 1 and stage 2 TLBs) to
@@ -976,6 +1003,10 @@ class TableWalker : public MemObject
     EventWrapper<TableWalker, &TableWalker::processWalkWrapper> doProcessEvent;
 
     void nextWalk(ThreadContext *tc);
+
+    void pendingChange();
+
+    static uint8_t pageSizeNtoStatBin(uint8_t N);
 };
 
 } // namespace ArmISA
diff --git a/src/dev/dma_device.cc b/src/dev/dma_device.cc
index 5033c3617..d53ea2546 100644
--- a/src/dev/dma_device.cc
+++ b/src/dev/dma_device.cc
@@ -152,7 +152,7 @@ DmaPort::recvRetry()
     trySendTimingReq();
 }
 
-void
+RequestPtr
 DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event,
                    uint8_t *data, Tick delay, Request::Flags flag)
 {
@@ -161,11 +161,17 @@ DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event,
     // i.e. cache line size
     DmaReqState *reqState = new DmaReqState(event, size, delay);
 
+    // (functionality added for Table Walker statistics)
+    // We're only interested in this when there will only be one request.
+    // For simplicity, we return the last request, which would also be
+    // the only request in that case.
+    RequestPtr req = NULL;
+
     DPRINTF(DMA, "Starting DMA for addr: %#x size: %d sched: %d\n", addr, size,
             event ? event->scheduled() : -1);
     for (ChunkGenerator gen(addr, size, sys->cacheLineSize());
          !gen.done(); gen.next()) {
-        Request *req = new Request(gen.addr(), gen.size(), flag, masterId);
+        req = new Request(gen.addr(), gen.size(), flag, masterId);
         req->taskId(ContextSwitchTaskId::DMA);
         PacketPtr pkt = new Packet(req, cmd);
 
@@ -184,6 +190,8 @@ DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event,
     // just created, for atomic this involves actually completing all
     // the requests
     sendDma();
+
+    return req;
 }
 
 void
diff --git a/src/dev/dma_device.hh b/src/dev/dma_device.hh
index d1245b977..6df4a287d 100644
--- a/src/dev/dma_device.hh
+++ b/src/dev/dma_device.hh
@@ -142,8 +142,8 @@ class DmaPort : public MasterPort
 
     DmaPort(MemObject *dev, System *s);
 
-    void dmaAction(Packet::Command cmd, Addr addr, int size, Event *event,
-                   uint8_t *data, Tick delay, Request::Flags flag = 0);
+    RequestPtr dmaAction(Packet::Command cmd, Addr addr, int size, Event *event,
+                         uint8_t *data, Tick delay, Request::Flags flag = 0);
 
     bool dmaPending() const { return pendingCount > 0; }