From 4d88978913c57e0cd10751d31d7f5b95c1e00170 Mon Sep 17 00:00:00 2001 From: Curtis Dunham Date: Tue, 23 Dec 2014 09:31:18 -0500 Subject: [PATCH] arm: Add stats to table walker This patch adds table walker stats for: - Walk events - Instruction vs Data - Page size histogram - Wait time and service time histograms - Pending requests histogram (per cycle) - measures dist. of L (p(1..) = how often busy, p(0) = how often idle) - Squashes, before starting and after completion --- src/arch/arm/table_walker.cc | 186 ++++++++++++++++++++++++++++++++++- src/arch/arm/table_walker.hh | 31 ++++++ src/dev/dma_device.cc | 12 ++- src/dev/dma_device.hh | 4 +- 4 files changed, 225 insertions(+), 8 deletions(-) diff --git a/src/arch/arm/table_walker.cc b/src/arch/arm/table_walker.cc index 6a0f0144a..6a1ac5577 100644 --- a/src/arch/arm/table_walker.cc +++ b/src/arch/arm/table_walker.cc @@ -60,6 +60,8 @@ TableWalker::TableWalker(const Params *p) stage2Mmu(NULL), isStage2(p->is_stage2), tlb(NULL), currState(NULL), pending(false), masterId(p->sys->getMasterId(name())), numSquashable(p->num_squash_per_cycle), + pendingReqs(0), + pendingChangeTick(curTick()), doL1DescEvent(this), doL2DescEvent(this), doL0LongDescEvent(this), doL1LongDescEvent(this), doL2LongDescEvent(this), doL3LongDescEvent(this), @@ -151,6 +153,7 @@ TableWalker::drainResume() if (params()->sys->isTimingMode() && currState) { delete currState; currState = NULL; + pendingChange(); } } @@ -170,6 +173,8 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid, bool secure, TLB::ArmTranslationType tranType) { assert(!(_functional && _timing)); + ++statWalks; + WalkerState *savedCurrState = NULL; if (!currState && !_functional) { @@ -196,10 +201,13 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid, // this fault to re-execute the faulting instruction which should clean // up everything. if (currState->vaddr_tainted == _req->getVaddr()) { + ++statSquashedBefore; return std::make_shared(); } } + pendingChange(); + currState->startTime = curTick(); currState->tc = _tc; currState->aarch64 = opModeIs64(currOpMode(_tc)); currState->el = currEL(_tc); @@ -261,6 +269,8 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid, currState->isFetch = (currState->mode == TLB::Execute); currState->isWrite = (currState->mode == TLB::Write); + statRequestOrigin[REQUESTED][currState->isFetch]++; + // We only do a second stage of translation if we're not secure, or in // hyp mode, the second stage MMU is enabled, and this table walker // instance is the first stage. @@ -280,6 +290,10 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid, currState->userTable = true; currState->xnTable = false; currState->pxnTable = false; + + ++statWalksLongDescriptor; + } else { + ++statWalksShortDescriptor; } if (!currState->timing) { @@ -303,8 +317,10 @@ TableWalker::walk(RequestPtr _req, ThreadContext *_tc, uint16_t _asid, if (pending || pendingQueue.size()) { pendingQueue.push_back(currState); currState = NULL; + pendingChange(); } else { pending = true; + pendingChange(); if (currState->aarch64) return processWalkAArch64(); else if (long_desc_format) @@ -321,6 +337,7 @@ TableWalker::processWalkWrapper() { assert(!currState); assert(pendingQueue.size()); + pendingChange(); currState = pendingQueue.front(); ExceptionLevel target_el = EL0; @@ -372,6 +389,7 @@ TableWalker::processWalkWrapper() (currState->transState->squashed() || te)) { pendingQueue.pop_front(); num_squashed++; + statSquashedBefore++; DPRINTF(TLB, "Squashing table walk for address %#x\n", currState->vaddr_tainted); @@ -383,6 +401,7 @@ TableWalker::processWalkWrapper() currState->req, currState->tc, currState->mode); } else { // translate the request now that we know it will work + statWalkServiceTime.sample(curTick() - currState->startTime); tlb->translateTiming(currState->req, currState->tc, currState->transState, currState->mode); @@ -402,8 +421,9 @@ TableWalker::processWalkWrapper() currState = NULL; } } + pendingChange(); - // if we've still got pending translations schedule more work + // if we still have pending translations, schedule more work nextWalk(tc); currState = NULL; } @@ -420,6 +440,8 @@ TableWalker::processWalk() currState->vaddr_tainted, currState->ttbcr, mbits(currState->vaddr, 31, 32 - currState->ttbcr.n)); + statWalkWaitTime.sample(curTick() - currState->startTime); + if (currState->ttbcr.n == 0 || !mbits(currState->vaddr, 31, 32 - currState->ttbcr.n)) { DPRINTF(TLB, " - Selecting TTBR0\n"); @@ -511,6 +533,8 @@ TableWalker::processWalkLPAE() DPRINTF(TLB, "Beginning table walk for address %#x, TTBCR: %#x\n", currState->vaddr_tainted, currState->ttbcr); + statWalkWaitTime.sample(curTick() - currState->startTime); + Request::Flags flag = 0; if (currState->isSecure) flag.set(Request::SECURE); @@ -700,6 +724,8 @@ TableWalker::processWalkAArch64() static const GrainSize GrainMap_EL1_tg1[] = { ReservedGrain, Grain16KB, Grain4KB, Grain64KB }; + statWalkWaitTime.sample(curTick() - currState->startTime); + // Determine TTBR, table size, granule size and phys. address range Addr ttbr = 0; int tsz = 0, ps = 0; @@ -920,8 +946,8 @@ TableWalker::processWalkAArch64() panic("Invalid table lookup level"); break; } - port.dmaAction(MemCmd::ReadReq, desc_addr, sizeof(uint64_t), event, - (uint8_t*) &currState->longDesc.data, + port.dmaAction(MemCmd::ReadReq, desc_addr, sizeof(uint64_t), + event, (uint8_t*) &currState->longDesc.data, currState->tc->getCpuPtr()->clockPeriod(), flag); DPRINTF(TLBVerbose, "Adding to walker fifo: queue size before adding: %d\n", @@ -1698,6 +1724,7 @@ TableWalker::doL1DescriptorWrapper() if (currState->fault != NoFault) { currState->transState->finish(currState->fault, currState->req, currState->tc, currState->mode); + statWalksShortTerminatedAtLevel[0]++; pending = false; nextWalk(currState->tc); @@ -1711,9 +1738,11 @@ TableWalker::doL1DescriptorWrapper() // delay is not set so there is no L2 to do // Don't finish the translation if a stage 2 look up is underway if (!currState->doingStage2) { + statWalkServiceTime.sample(curTick() - currState->startTime); DPRINTF(TLBVerbose, "calling translateTiming again\n"); currState->fault = tlb->translateTiming(currState->req, currState->tc, currState->transState, currState->mode); + statWalksShortTerminatedAtLevel[0]++; } pending = false; @@ -1749,13 +1778,16 @@ TableWalker::doL2DescriptorWrapper() if (currState->fault != NoFault) { currState->transState->finish(currState->fault, currState->req, currState->tc, currState->mode); + statWalksShortTerminatedAtLevel[1]++; } else { // Don't finish the translation if a stage 2 look up is underway if (!currState->doingStage2) { + statWalkServiceTime.sample(curTick() - currState->startTime); DPRINTF(TLBVerbose, "calling translateTiming again\n"); currState->fault = tlb->translateTiming(currState->req, currState->tc, currState->transState, currState->mode); + statWalksShortTerminatedAtLevel[1]++; } } @@ -1832,9 +1864,11 @@ TableWalker::doLongDescriptorWrapper(LookupLevel curr_lookup_level) // Don't finish the translation if a stage 2 look up is underway if (!currState->doingStage2) { DPRINTF(TLBVerbose, "calling translateTiming again\n"); + statWalkServiceTime.sample(curTick() - currState->startTime); currState->fault = tlb->translateTiming(currState->req, currState->tc, currState->transState, currState->mode); + statWalksLongTerminatedAtLevel[(unsigned) curr_lookup_level]++; } pending = false; @@ -1906,7 +1940,7 @@ TableWalker::fetchDescriptor(Addr descAddr, uint8_t *data, int numBytes, } else { if (isTiming) { port.dmaAction(MemCmd::ReadReq, descAddr, numBytes, event, data, - currState->tc->getCpuPtr()->clockPeriod(), flags); + currState->tc->getCpuPtr()->clockPeriod(),flags); if (queueIndex >= 0) { DPRINTF(TLBVerbose, "Adding to walker fifo: queue size before adding: %d\n", stateQueues[queueIndex].size()); @@ -1956,6 +1990,9 @@ TableWalker::insertTableEntry(DescriptorBase &descriptor, bool longDescriptor) else te.el = 1; + statPageSizes[pageSizeNtoStatBin(te.N)]++; + statRequestOrigin[COMPLETED][currState->isFetch]++; + // ASID has no meaning for stage 2 TLB entries, so mark all stage 2 entries // as global te.global = descriptor.global(currState) || isStage2; @@ -2024,3 +2061,144 @@ TableWalker::toLookupLevel(uint8_t lookup_level_as_int) panic("Invalid lookup level conversion"); } } + +/* this method keeps track of the table walker queue's residency, so + * needs to be called whenever requests start and complete. */ +void +TableWalker::pendingChange() +{ + unsigned n = pendingQueue.size(); + if ((currState != NULL) && (currState != pendingQueue.front())) { + ++n; + } + + if (n != pendingReqs) { + Tick now = curTick(); + statPendingWalks.sample(pendingReqs, now - pendingChangeTick); + pendingReqs = n; + pendingChangeTick = now; + } +} + +uint8_t +TableWalker::pageSizeNtoStatBin(uint8_t N) +{ + /* for statPageSizes */ + switch(N) { + case 12: return 0; // 4K + case 14: return 1; // 16K (using 16K granule in v8-64) + case 16: return 2; // 64K + case 20: return 3; // 1M + case 21: return 4; // 2M-LPAE + case 24: return 5; // 16M + case 25: return 6; // 32M (using 16K granule in v8-64) + case 29: return 7; // 512M (using 64K granule in v8-64) + case 30: return 8; // 1G-LPAE + default: + panic("unknown page size"); + return 255; + } +} + +void +TableWalker::regStats() +{ + statWalks + .name(name() + ".walks") + .desc("Table walker walks requested") + ; + + statWalksShortDescriptor + .name(name() + ".walksShort") + .desc("Table walker walks initiated with short descriptors") + .flags(Stats::nozero) + ; + + statWalksLongDescriptor + .name(name() + ".walksLong") + .desc("Table walker walks initiated with long descriptors") + .flags(Stats::nozero) + ; + + statWalksShortTerminatedAtLevel + .init(2) + .name(name() + ".walksShortTerminationLevel") + .desc("Level at which table walker walks " + "with short descriptors terminate") + .flags(Stats::nozero) + ; + statWalksShortTerminatedAtLevel.subname(0, "Level1"); + statWalksShortTerminatedAtLevel.subname(1, "Level2"); + + statWalksLongTerminatedAtLevel + .init(4) + .name(name() + ".walksLongTerminationLevel") + .desc("Level at which table walker walks " + "with long descriptors terminate") + .flags(Stats::nozero) + ; + statWalksLongTerminatedAtLevel.subname(0, "Level0"); + statWalksLongTerminatedAtLevel.subname(1, "Level1"); + statWalksLongTerminatedAtLevel.subname(2, "Level2"); + statWalksLongTerminatedAtLevel.subname(3, "Level3"); + + statSquashedBefore + .name(name() + ".walksSquashedBefore") + .desc("Table walks squashed before starting") + .flags(Stats::nozero) + ; + + statSquashedAfter + .name(name() + ".walksSquashedAfter") + .desc("Table walks squashed after completion") + .flags(Stats::nozero) + ; + + statWalkWaitTime + .init(16) + .name(name() + ".walkWaitTime") + .desc("Table walker wait (enqueue to first request) latency") + .flags(Stats::pdf | Stats::nozero | Stats::nonan) + ; + + statWalkServiceTime + .init(16) + .name(name() + ".walkCompletionTime") + .desc("Table walker service (enqueue to completion) latency") + .flags(Stats::pdf | Stats::nozero | Stats::nonan) + ; + + statPendingWalks + .init(16) + .name(name() + ".walksPending") + .desc("Table walker pending requests distribution") + .flags(Stats::pdf | Stats::dist | Stats::nozero | Stats::nonan) + ; + + statPageSizes // see DDI 0487A D4-1661 + .init(9) + .name(name() + ".walkPageSizes") + .desc("Table walker page sizes translated") + .flags(Stats::total | Stats::pdf | Stats::dist | Stats::nozero) + ; + statPageSizes.subname(0, "4K"); + statPageSizes.subname(1, "16K"); + statPageSizes.subname(2, "64K"); + statPageSizes.subname(3, "1M"); + statPageSizes.subname(4, "2M"); + statPageSizes.subname(5, "16M"); + statPageSizes.subname(6, "32M"); + statPageSizes.subname(7, "512M"); + statPageSizes.subname(8, "1G"); + + statRequestOrigin + .init(2,2) // Instruction/Data, requests/completed + .name(name() + ".walkRequestOrigin") + .desc("Table walker requests started/completed, data/inst") + .flags(Stats::total) + ; + statRequestOrigin.subname(0,"Requested"); + statRequestOrigin.subname(1,"Completed"); + statRequestOrigin.ysubname(0,"Data"); + statRequestOrigin.ysubname(1,"Inst"); +} diff --git a/src/arch/arm/table_walker.hh b/src/arch/arm/table_walker.hh index b5d67c9a1..b265f5165 100644 --- a/src/arch/arm/table_walker.hh +++ b/src/arch/arm/table_walker.hh @@ -794,6 +794,12 @@ class TableWalker : public MemObject TableWalker *tableWalker; + /** Timestamp for calculating elapsed time in service (for stats) */ + Tick startTime; + + /** Page entries walked during service (for stats) */ + unsigned levels; + void doL1Descriptor(); void doL2Descriptor(); @@ -883,6 +889,26 @@ class TableWalker : public MemObject bool _haveLargeAsid64; ArmSystem *armSys; + /** Statistics */ + Stats::Scalar statWalks; + Stats::Scalar statWalksShortDescriptor; + Stats::Scalar statWalksLongDescriptor; + Stats::Vector statWalksShortTerminatedAtLevel; + Stats::Vector statWalksLongTerminatedAtLevel; + Stats::Scalar statSquashedBefore; + Stats::Scalar statSquashedAfter; + Stats::Histogram statWalkWaitTime; + Stats::Histogram statWalkServiceTime; + Stats::Histogram statPendingWalks; // essentially "L" of queueing theory + Stats::Vector statPageSizes; + Stats::Vector2d statRequestOrigin; + + mutable unsigned pendingReqs; + mutable Tick pendingChangeTick; + + static const unsigned REQUESTED = 0; + static const unsigned COMPLETED = 1; + public: typedef ArmTableWalkerParams Params; TableWalker(const Params *p); @@ -903,6 +929,7 @@ class TableWalker : public MemObject virtual void drainResume(); virtual BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx = InvalidPortID); + void regStats(); /** * Allow the MMU (overseeing both stage 1 and stage 2 TLBs) to @@ -976,6 +1003,10 @@ class TableWalker : public MemObject EventWrapper doProcessEvent; void nextWalk(ThreadContext *tc); + + void pendingChange(); + + static uint8_t pageSizeNtoStatBin(uint8_t N); }; } // namespace ArmISA diff --git a/src/dev/dma_device.cc b/src/dev/dma_device.cc index 5033c3617..d53ea2546 100644 --- a/src/dev/dma_device.cc +++ b/src/dev/dma_device.cc @@ -152,7 +152,7 @@ DmaPort::recvRetry() trySendTimingReq(); } -void +RequestPtr DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event, uint8_t *data, Tick delay, Request::Flags flag) { @@ -161,11 +161,17 @@ DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event, // i.e. cache line size DmaReqState *reqState = new DmaReqState(event, size, delay); + // (functionality added for Table Walker statistics) + // We're only interested in this when there will only be one request. + // For simplicity, we return the last request, which would also be + // the only request in that case. + RequestPtr req = NULL; + DPRINTF(DMA, "Starting DMA for addr: %#x size: %d sched: %d\n", addr, size, event ? event->scheduled() : -1); for (ChunkGenerator gen(addr, size, sys->cacheLineSize()); !gen.done(); gen.next()) { - Request *req = new Request(gen.addr(), gen.size(), flag, masterId); + req = new Request(gen.addr(), gen.size(), flag, masterId); req->taskId(ContextSwitchTaskId::DMA); PacketPtr pkt = new Packet(req, cmd); @@ -184,6 +190,8 @@ DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event, // just created, for atomic this involves actually completing all // the requests sendDma(); + + return req; } void diff --git a/src/dev/dma_device.hh b/src/dev/dma_device.hh index d1245b977..6df4a287d 100644 --- a/src/dev/dma_device.hh +++ b/src/dev/dma_device.hh @@ -142,8 +142,8 @@ class DmaPort : public MasterPort DmaPort(MemObject *dev, System *s); - void dmaAction(Packet::Command cmd, Addr addr, int size, Event *event, - uint8_t *data, Tick delay, Request::Flags flag = 0); + RequestPtr dmaAction(Packet::Command cmd, Addr addr, int size, Event *event, + uint8_t *data, Tick delay, Request::Flags flag = 0); bool dmaPending() const { return pendingCount > 0; } -- 2.30.2