From e2828587b3f28c4f37f0fe598209290bc3d41de0 Mon Sep 17 00:00:00 2001
From: Marco Balboni <Marco.Balboni@ARM.com>
Date: Wed, 11 Feb 2015 10:23:36 -0500
Subject: [PATCH] mem: Clarify usage of latency in the cache

This patch adds some much-needed clarity in the specification of the
cache timing. For now, hit_latency and response_latency are kept as
top-level parameters, but the cache itself has a number of local
variables to better map the individual timing variables to different
behaviours (and sub-components).

The introduced variables are:
- lookupLatency: latency of tag lookup, occurring on any access
- forwardLatency: latency that occurs in case of outbound miss
- fillLatency: latency to fill a cache block
We keep the existing responseLatency

The forwardLatency is used by allocateBufferInternal() for:
- MSHR allocateWriteBuffer (uncached write forwarded to WriteBuffer);
- MSHR allocateMissBuffer (cacheable miss in MSHR queue);
- MSHR allocateUncachedReadBuffer (uncached read allocated in MSHR
  queue)
It is our assumption that the time for the above three buffers is the
same. Similarly, for snoop responses passing through the cache we use
forwardLatency.
---
 src/mem/cache/base.cc                |   4 +-
 src/mem/cache/base.hh                |  35 +++++++--
 src/mem/cache/cache_impl.hh          | 103 +++++++++++++++++++--------
 src/mem/cache/tags/base.cc           |   2 +-
 src/mem/cache/tags/base.hh           |   5 +-
 src/mem/cache/tags/base_set_assoc.cc |   3 -
 src/mem/cache/tags/base_set_assoc.hh |  12 +---
 src/mem/cache/tags/fa_lru.cc         |   4 +-
 src/mem/cache/tags/fa_lru.hh         |   9 ---
 9 files changed, 111 insertions(+), 66 deletions(-)

diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index d89517b9c..78e2ca9ab 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -72,7 +72,9 @@ BaseCache::BaseCache(const Params *p)
       writeBuffer("write buffer", p->write_buffers, p->mshrs+1000, 0,
                   MSHRQueue_WriteBuffer),
       blkSize(p->system->cacheLineSize()),
-      hitLatency(p->hit_latency),
+      lookupLatency(p->hit_latency),
+      forwardLatency(p->hit_latency),
+      fillLatency(p->response_latency),
       responseLatency(p->response_latency),
       numTarget(p->tgts_per_mshr),
       forwardSnoops(p->forward_snoops),
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index 0be6b7944..beb818961 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013 ARM Limited
+ * Copyright (c) 2012-2013, 2015 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -202,6 +202,17 @@ class BaseCache : public MemObject
     /** Write/writeback buffer */
     MSHRQueue writeBuffer;
 
+    /**
+     * Allocate a buffer, passing the time indicating when to schedule
+     * an event to the queued port to go and ask the MSHR and write
+     * queue if they have packets to send.
+     *
+     * allocateBufferInternal() function is called in:
+     * - MSHR allocateWriteBuffer (uncached write forwarded to WriteBuffer);
+     * - MSHR allocateMissBuffer (cacheable miss in MSHR queue);
+     * - MSHR allocateUncachedReadBuffer (uncached read allocated in MSHR
+     *   queue)
+     */
     MSHR *allocateBufferInternal(MSHRQueue *mq, Addr addr, int size,
                                  PacketPtr pkt, Tick time, bool requestBus)
     {
@@ -251,15 +262,25 @@ class BaseCache : public MemObject
     const unsigned blkSize;
 
     /**
-     * The latency of a hit in this device.
+     * The latency of tag lookup of a cache. It occurs when there is
+     * an access to the cache.
      */
-    const Cycles hitLatency;
+    const Cycles lookupLatency;
+
+    /**
+     * This is the forward latency of the cache. It occurs when there
+     * is a cache miss and a request is forwarded downstream, in
+     * particular an outbound miss.
+     */
+    const Cycles forwardLatency;
+
+    /** The latency to fill a cache block */
+    const Cycles fillLatency;
 
     /**
-     * The latency of sending reponse to its upper level cache/core on a
-     * linefill. In most contemporary processors, the return path on a cache
-     * miss is much quicker that the hit latency. The responseLatency parameter
-     * tries to capture this latency.
+     * The latency of sending response to its upper level cache/core on
+     * a linefill. The responseLatency parameter captures this
+     * latency.
      */
     const Cycles responseLatency;
 
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
index c671deb68..2fb0baaa4 100644
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2014 ARM Limited
+ * Copyright (c) 2010-2015 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -314,11 +314,14 @@ Cache<TagStore>::access(PacketPtr pkt, BlkType *&blk,
     if (pkt->req->isUncacheable()) {
         uncacheableFlush(pkt);
         blk = NULL;
-        lat = hitLatency;
+        // lookupLatency is the latency in case the request is uncacheable.
+        lat = lookupLatency;
         return false;
     }
 
     int id = pkt->req->hasContextId() ? pkt->req->contextId() : -1;
+    // Here lat is the value passed as parameter to accessBlock() function
+    // that can modify its value.
     blk = tags->accessBlock(pkt->getAddr(), pkt->isSecure(), lat, id);
 
     DPRINTF(Cache, "%s%s %x (%s) %s %s\n", pkt->cmdString(),
@@ -392,7 +395,6 @@ Cache<TagStore>::recvTimingSnoopResp(PacketPtr pkt)
 {
     DPRINTF(Cache, "%s for %s address %x size %d\n", __func__,
             pkt->cmdString(), pkt->getAddr(), pkt->getSize());
-    Tick time = clockEdge(hitLatency);
 
     assert(pkt->isResponse());
 
@@ -418,7 +420,10 @@ Cache<TagStore>::recvTimingSnoopResp(PacketPtr pkt)
     delete rec;
     // @todo someone should pay for this
     pkt->firstWordDelay = pkt->lastWordDelay = 0;
-    memSidePort->schedTimingSnoopResp(pkt, time);
+    // forwardLatency is set here because there is a response from an
+    // upper level cache.
+    memSidePort->schedTimingSnoopResp(pkt, clockEdge(forwardLatency));
+
 }
 
 template<class TagStore>
@@ -449,9 +454,6 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
         delete pendingDelete[x];
     pendingDelete.clear();
 
-    // we charge hitLatency for doing just about anything here
-    Tick time = clockEdge(hitLatency);
-
     assert(pkt->isRequest());
 
     // Just forward the packet if caches are disabled.
@@ -527,21 +529,34 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
             // prefetching (cache loading) uncacheable data is nonsensical
             pkt->makeTimingResponse();
             std::memset(pkt->getPtr<uint8_t>(), 0xFF, pkt->getSize());
-            cpuSidePort->schedTimingResp(pkt, clockEdge(hitLatency));
+            // We use lookupLatency here because the request is uncacheable
+            cpuSidePort->schedTimingResp(pkt, clockEdge(lookupLatency));
             return true;
         } else if (pkt->isWrite() && !pkt->isRead()) {
-            allocateWriteBuffer(pkt, time, true);
+            // We use forwardLatency here because there is an uncached
+            // memory write, forwarded to WriteBuffer. It specifies the
+            // latency to allocate an internal buffer and to schedule an
+            // event to the queued port.
+            allocateWriteBuffer(pkt, clockEdge(forwardLatency), true);
         } else {
-            allocateUncachedReadBuffer(pkt, time, true);
+            // We use forwardLatency here because there is an uncached
+            // memory read, allocated to MSHR queue (it requires the same
+            // time of forwarding to WriteBuffer, in our assumption). It
+            // specifies the latency to allocate an internal buffer and to
+            // schedule an event to the queued port.
+            allocateUncachedReadBuffer(pkt, clockEdge(forwardLatency), true);
         }
         assert(pkt->needsResponse()); // else we should delete it here??
         return true;
     }
 
-    Cycles lat = hitLatency;
+    // We use lookupLatency here because it is used to specify the latency
+    // to access.
+    Cycles lat = lookupLatency;
     BlkType *blk = NULL;
     PacketList writebacks;
-
+    // Note that lat is passed by reference here. The function access() calls
+    // accessBlock() which can modify lat value.
     bool satisfied = access(pkt, blk, lat, writebacks);
 
     // track time of availability of next prefetch, if any
@@ -565,6 +580,13 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
             pkt->makeTimingResponse();
             // @todo: Make someone pay for this
             pkt->firstWordDelay = pkt->lastWordDelay = 0;
+
+            // In this case we are considering lat neglecting
+            // responseLatency, modelling hit latency just as
+            // lookupLatency. We pass lat by reference to access(),
+            // which calls accessBlock() function. If it is a hit,
+            // accessBlock() can modify lat to override the
+            // lookupLatency value.
             cpuSidePort->schedTimingResp(pkt, clockEdge(lat));
         } else {
             /// @todo nominally we should just delete the packet here,
@@ -638,7 +660,12 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
                 if (mshr->threadNum != 0/*pkt->req->threadId()*/) {
                     mshr->threadNum = -1;
                 }
-                mshr->allocateTarget(pkt, time, order++);
+                // We use forwardLatency here because it is the same
+                // considering new targets. We have multiple requests for the
+                // same address here. It specifies the latency to allocate an
+                // internal buffer and to schedule an event to the queued
+                // port.
+                mshr->allocateTarget(pkt, clockEdge(forwardLatency), order++);
                 if (mshr->getNumTargets() == numTarget) {
                     noTargetMSHR = mshr;
                     setBlocked(Blocked_NoTargets);
@@ -669,7 +696,11 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
             // no-write-allocate or bypass accesses this will have to
             // be changed.
             if (pkt->cmd == MemCmd::Writeback) {
-                allocateWriteBuffer(pkt, time, true);
+                // We use forwardLatency here because the packet is a
+                // Writeback, forwarded to WriteBuffer. It
+                // specifies the latency to allocate an internal buffer and to
+                // schedule an event to the queued port.
+                allocateWriteBuffer(pkt, clockEdge(forwardLatency), true);
             } else {
                 if (blk && blk->isValid()) {
                     // If we have a write miss to a valid block, we
@@ -691,8 +722,13 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
                     assert(!blk->isWritable());
                     blk->status &= ~BlkReadable;
                 }
-
-                allocateMissBuffer(pkt, time, true);
+                // Here we are using forwardLatency, modelling the latency of
+                // a miss (outbound) just as forwardLatency, neglecting the
+                // lookupLatency component. In this case this latency value
+                // specifies the latency to allocate an internal buffer and to
+                // schedule an event to the queued port, when a cacheable miss
+                // is forwarded to MSHR queue.
+                allocateMissBuffer(pkt, clockEdge(forwardLatency), true);
             }
 
             if (prefetcher) {
@@ -702,14 +738,17 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
             }
         }
     }
-
+    // Here we consider just forward latency.
     if (next_pf_time != MaxTick)
-        requestMemSideBus(Request_PF, std::max(time, next_pf_time));
-
+        requestMemSideBus(Request_PF, std::max(clockEdge(forwardLatency),
+                                                next_pf_time));
     // copy writebacks to write buffer
     while (!writebacks.empty()) {
         PacketPtr wbPkt = writebacks.front();
-        allocateWriteBuffer(wbPkt, time, true);
+        // We use forwardLatency here because we are copying writebacks
+        // to write buffer. It specifies the latency to allocate an internal
+        // buffer and to schedule an event to the queued port.
+        allocateWriteBuffer(wbPkt, clockEdge(forwardLatency), true);
         writebacks.pop_front();
     }
 
@@ -778,8 +817,8 @@ template<class TagStore>
 Tick
 Cache<TagStore>::recvAtomic(PacketPtr pkt)
 {
-    Cycles lat = hitLatency;
-
+    // We are in atomic mode so we pay just for lookupLatency here.
+    Cycles lat = lookupLatency;
     // @TODO: make this a parameter
     bool last_level_cache = false;
 
@@ -996,7 +1035,6 @@ Cache<TagStore>::recvTimingResp(PacketPtr pkt)
 {
     assert(pkt->isResponse());
 
-    Tick time = clockEdge(hitLatency);
     MSHR *mshr = dynamic_cast<MSHR*>(pkt->senderState);
     bool is_error = pkt->isError();
 
@@ -1221,13 +1259,18 @@ Cache<TagStore>::recvTimingResp(PacketPtr pkt)
     // copy writebacks to write buffer
     while (!writebacks.empty()) {
         PacketPtr wbPkt = writebacks.front();
-        allocateWriteBuffer(wbPkt, time, true);
+        allocateWriteBuffer(wbPkt, clockEdge(forwardLatency), true);
         writebacks.pop_front();
     }
     // if we used temp block, clear it out
     if (blk == tempBlock) {
         if (blk->isDirty()) {
-            allocateWriteBuffer(writebackBlk(blk), time, true);
+            // We use forwardLatency here because we are copying
+            // writebacks to write buffer. It specifies the latency to
+            // allocate an internal buffer and to schedule an event to the
+            // queued port.
+            allocateWriteBuffer(writebackBlk(blk), clockEdge(forwardLatency),
+                                 true);
         }
         blk->invalidate();
     }
@@ -1467,8 +1510,8 @@ Cache<TagStore>::handleFill(PacketPtr pkt, BlkType *blk,
         assert(pkt->hasData());
         std::memcpy(blk->data, pkt->getConstPtr<uint8_t>(), blkSize);
     }
-
-    blk->whenReady = clockEdge() + responseLatency * clockPeriod() +
+    // We pay for fillLatency here.
+    blk->whenReady = clockEdge() + fillLatency * clockPeriod() +
         pkt->lastWordDelay;
 
     return blk;
@@ -1521,7 +1564,8 @@ doTimingSupplyResponse(PacketPtr req_pkt, const uint8_t *blk_data,
     }
     DPRINTF(Cache, "%s created response: %s address %x size %d\n",
             __func__, pkt->cmdString(), pkt->getAddr(), pkt->getSize());
-    memSidePort->schedTimingSnoopResp(pkt, clockEdge(hitLatency));
+    // We model a snoop just considering forwardLatency
+    memSidePort->schedTimingSnoopResp(pkt, clockEdge(forwardLatency));
 }
 
 template<class TagStore>
@@ -1794,7 +1838,8 @@ Cache<TagStore>::recvAtomicSnoop(PacketPtr pkt)
 
     BlkType *blk = tags->findBlock(pkt->getAddr(), pkt->isSecure());
     handleSnoop(pkt, blk, false, false, false);
-    return hitLatency * clockPeriod();
+    // We consider forwardLatency here because a snoop occurs in atomic mode
+    return forwardLatency * clockPeriod();
 }
 
 
diff --git a/src/mem/cache/tags/base.cc b/src/mem/cache/tags/base.cc
index 47a43fb7e..8d2322e51 100644
--- a/src/mem/cache/tags/base.cc
+++ b/src/mem/cache/tags/base.cc
@@ -55,7 +55,7 @@ using namespace std;
 
 BaseTags::BaseTags(const Params *p)
     : ClockedObject(p), blkSize(p->block_size), size(p->size),
-      hitLatency(p->hit_latency), cache(nullptr), warmupBound(0),
+      accessLatency(p->hit_latency), cache(nullptr), warmupBound(0),
       warmedUp(false), numBlocks(0)
 {
 }
diff --git a/src/mem/cache/tags/base.hh b/src/mem/cache/tags/base.hh
index 9e1fb1972..03b6cfed8 100644
--- a/src/mem/cache/tags/base.hh
+++ b/src/mem/cache/tags/base.hh
@@ -68,9 +68,8 @@ class BaseTags : public ClockedObject
     const unsigned blkSize;
     /** The size of the cache. */
     const unsigned size;
-    /** The hit latency of the cache. */
-    const Cycles hitLatency;
-
+    /** The access latency of the cache. */
+    const Cycles accessLatency;
     /** Pointer to the parent cache. */
     BaseCache *cache;
 
diff --git a/src/mem/cache/tags/base_set_assoc.cc b/src/mem/cache/tags/base_set_assoc.cc
index bb0c20141..3c8371edb 100644
--- a/src/mem/cache/tags/base_set_assoc.cc
+++ b/src/mem/cache/tags/base_set_assoc.cc
@@ -68,9 +68,6 @@ BaseSetAssoc::BaseSetAssoc(const Params *p)
     if (assoc <= 0) {
         fatal("associativity must be greater than zero");
     }
-    if (hitLatency <= 0) {
-        fatal("access latency must be greater than zero");
-    }
 
     blkMask = blkSize - 1;
     setShift = floorLog2(blkSize);
diff --git a/src/mem/cache/tags/base_set_assoc.hh b/src/mem/cache/tags/base_set_assoc.hh
index ac575d2ff..0107aafaf 100644
--- a/src/mem/cache/tags/base_set_assoc.hh
+++ b/src/mem/cache/tags/base_set_assoc.hh
@@ -178,7 +178,7 @@ public:
         Addr tag = extractTag(addr);
         int set = extractSet(addr);
         BlkType *blk = sets[set].findBlk(tag, is_secure);
-        lat = hitLatency;
+        lat = accessLatency;;
 
         // Access all tags in parallel, hence one in each way.  The data side
         // either accesses all blocks in parallel, or one block sequentially on
@@ -195,7 +195,7 @@ public:
         if (blk != NULL) {
             if (blk->whenReady > curTick()
                 && cache->ticksToCycles(blk->whenReady - curTick())
-                > hitLatency) {
+                > accessLatency) {
                 lat = cache->ticksToCycles(blk->whenReady - curTick());
             }
             blk->refCount += 1;
@@ -342,14 +342,6 @@ public:
         return ((tag << tagShift) | ((Addr)set << setShift));
     }
 
-    /**
-     * Return the hit latency.
-     * @return the hit latency.
-     */
-    Cycles getHitLatency() const
-    {
-        return hitLatency;
-    }
     /**
      *iterated through all blocks and clear all locks
      *Needed to clear all lock tracking at once
diff --git a/src/mem/cache/tags/fa_lru.cc b/src/mem/cache/tags/fa_lru.cc
index 6a63da673..ffe2cbf25 100644
--- a/src/mem/cache/tags/fa_lru.cc
+++ b/src/mem/cache/tags/fa_lru.cc
@@ -60,8 +60,6 @@ FALRU::FALRU(const Params *p)
     if (!isPowerOf2(blkSize))
         fatal("cache block size (in bytes) `%d' must be a power of two",
               blkSize);
-    if (!(hitLatency > 0))
-        fatal("Access latency in cycles must be at least one cycle");
     if (!isPowerOf2(size))
         fatal("Cache Size must be power of 2 for now");
 
@@ -202,7 +200,7 @@ FALRU::accessBlock(Addr addr, bool is_secure, Cycles &lat, int context_src,
         *inCache = tmp_in_cache;
     }
 
-    lat = hitLatency;
+    lat = accessLatency;
     //assert(check());
     return blk;
 }
diff --git a/src/mem/cache/tags/fa_lru.hh b/src/mem/cache/tags/fa_lru.hh
index ef13b2c79..07a31c154 100644
--- a/src/mem/cache/tags/fa_lru.hh
+++ b/src/mem/cache/tags/fa_lru.hh
@@ -209,15 +209,6 @@ public:
 
     void insertBlock(PacketPtr pkt, BlkType *blk);
 
-    /**
-     * Return the hit latency of this cache.
-     * @return The hit latency.
-     */
-    Cycles getHitLatency() const
-    {
-        return hitLatency;
-    }
-
     /**
      * Return the block size of this cache.
      * @return The block size.
-- 
2.30.2