From 313c015bbc61da0b8acedc84e4d136835a9f9805 Mon Sep 17 00:00:00 2001
From: Nikos Nikoleris <nikos.nikoleris@arm.com>
Date: Mon, 10 Oct 2016 14:06:00 +0100
Subject: [PATCH] mem: Add write coalescing and write-no-allocate to the caches

Enable the cache to detect contiguous writes and hold on to the MSHR
long enough to allow the entire line to be written. If the whole line
is written, the MSHR will be sent out as an invalidation requests, as
it is part of a whole-line write, i.e. no-fetch-on-write.

The cache is also able to switch to a write-no-allocate policy on the
actual completion of the writes, and instead use the tempBlock and
turn the write operation into a writeback.

These policies are all well-known, and described in works such as
Jouppi, Cache Write Policies and Performance, vol 21, no 2, ACM, 1993.

Change-Id: I19792f2970b3c6798c9b2b493acdd156897284ae
Reviewed-on: https://gem5-review.googlesource.com/c/12907
Reviewed-by: Daniel Carvalho <odanrc@yahoo.com.br>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
---
 src/mem/cache/Cache.py |  24 +++++++
 src/mem/cache/base.cc  |  83 +++++++++++++++++++++--
 src/mem/cache/base.hh  | 149 +++++++++++++++++++++++++++++++++++++++++
 src/mem/cache/cache.cc |   5 +-
 4 files changed, 255 insertions(+), 6 deletions(-)

diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 230131bdc..8ffab911b 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -41,6 +41,7 @@
 
 from m5.params import *
 from m5.proxy import *
+from m5.SimObject import SimObject
 from MemObject import MemObject
 from Prefetcher import BasePrefetcher
 from ReplacementPolicies import *
@@ -51,6 +52,24 @@ from Tags import *
 # exclusive.
 class Clusivity(Enum): vals = ['mostly_incl', 'mostly_excl']
 
+class WriteAllocator(SimObject):
+    type = 'WriteAllocator'
+    cxx_header = "mem/cache/cache.hh"
+
+    # Control the limits for when the cache introduces extra delays to
+    # allow whole-line write coalescing, and eventually switches to a
+    # write-no-allocate policy.
+    coalesce_limit = Param.Unsigned(2, "Consecutive lines written before "
+                                    "delaying for coalescing")
+    no_allocate_limit = Param.Unsigned(12, "Consecutive lines written before"
+                                       " skipping allocation")
+
+    delay_threshold = Param.Unsigned(8, "Number of delay quanta imposed on an "
+                                     "MSHR with write requests to allow for "
+                                     "write coalescing")
+
+    block_size = Param.Int(Parent.cache_line_size, "block size in bytes")
+
 
 class BaseCache(MemObject):
     type = 'BaseCache'
@@ -116,6 +135,11 @@ class BaseCache(MemObject):
     clusivity = Param.Clusivity('mostly_incl',
                                 "Clusivity with upstream cache")
 
+    # The write allocator enables optimizations for streaming write
+    # accesses by first coalescing writes and then avoiding allocation
+    # in the current cache. Typically, this would be enabled in the
+    # data cache.
+    write_allocator = Param.WriteAllocator(NULL, "Write allocator")
 
 class Cache(BaseCache):
     type = 'Cache'
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index b292e5a25..183d93f14 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -58,6 +58,7 @@
 #include "mem/cache/prefetch/base.hh"
 #include "mem/cache/queue_entry.hh"
 #include "params/BaseCache.hh"
+#include "params/WriteAllocator.hh"
 #include "sim/core.hh"
 
 class BaseMasterPort;
@@ -83,6 +84,7 @@ BaseCache::BaseCache(const BaseCacheParams *p, unsigned blk_size)
       tags(p->tags),
       prefetcher(p->prefetcher),
       prefetchOnAccess(p->prefetch_on_access),
+      writeAllocator(p->write_allocator),
       writebackClean(p->writeback_clean),
       tempBlockWriteback(nullptr),
       writebackTempBlockAtomicEvent([this]{ writebackTempBlockAtomic(); },
@@ -243,6 +245,12 @@ void
 BaseCache::handleTimingReqMiss(PacketPtr pkt, MSHR *mshr, CacheBlk *blk,
                                Tick forward_time, Tick request_time)
 {
+    if (writeAllocator &&
+        pkt && pkt->isWrite() && !pkt->req->isUncacheable()) {
+        writeAllocator->updateMode(pkt->getAddr(), pkt->getSize(),
+                                   pkt->getBlockAddr(blkSize));
+    }
+
     if (mshr) {
         /// MSHR hit
         /// @note writebacks will be checked in getNextMSHR()
@@ -391,11 +399,13 @@ BaseCache::recvTimingReq(PacketPtr pkt)
         // already allocated for this, we need to let the prefetcher
         // know about the request
 
-        // Don't notify prefetcher on SWPrefetch or cache maintenance
-        // operations
+        // Don't notify prefetcher on SWPrefetch, cache maintenance
+        // operations or for writes that we are coaslescing.
         if (prefetcher && pkt &&
             !pkt->cmd.isSWPrefetch() &&
-            !pkt->req->isCacheMaintenance()) {
+            !pkt->req->isCacheMaintenance() &&
+            !(writeAllocator && writeAllocator->coalesce() &&
+              pkt->isWrite())) {
             next_pf_time = prefetcher->notify(pkt);
         }
     }
@@ -487,7 +497,9 @@ BaseCache::recvTimingResp(PacketPtr pkt)
         DPRINTF(Cache, "Block for addr %#llx being updated in Cache\n",
                 pkt->getAddr());
 
-        blk = handleFill(pkt, blk, writebacks, mshr->allocOnFill());
+        const bool allocate = (writeAllocator && mshr->wasWholeLineWrite) ?
+            writeAllocator->allocate() : mshr->allocOnFill();
+        blk = handleFill(pkt, blk, writebacks, allocate);
         assert(blk != nullptr);
     }
 
@@ -1461,6 +1473,29 @@ BaseCache::sendMSHRQueuePacket(MSHR* mshr)
 
     DPRINTF(Cache, "%s: MSHR %s\n", __func__, tgt_pkt->print());
 
+    // if the cache is in write coalescing mode or (additionally) in
+    // no allocation mode, and we have a write packet with an MSHR
+    // that is not a whole-line write (due to incompatible flags etc),
+    // then reset the write mode
+    if (writeAllocator && writeAllocator->coalesce() && tgt_pkt->isWrite()) {
+        if (!mshr->isWholeLineWrite()) {
+            // if we are currently write coalescing, hold on the
+            // MSHR as many cycles extra as we need to completely
+            // write a cache line
+            if (writeAllocator->delay(mshr->blkAddr)) {
+                Tick delay = blkSize / tgt_pkt->getSize() * clockPeriod();
+                DPRINTF(CacheVerbose, "Delaying pkt %s %llu ticks to allow "
+                        "for write coalescing\n", tgt_pkt->print(), delay);
+                mshrQueue.delay(mshr, delay);
+                return false;
+            } else {
+                writeAllocator->reset();
+            }
+        } else {
+            writeAllocator->resetDelay(mshr->blkAddr);
+        }
+    }
+
     CacheBlk *blk = tags->findBlock(mshr->blkAddr, mshr->isSecure);
 
     // either a prefetch that is not present upstream, or a normal
@@ -2357,3 +2392,43 @@ BaseCache::MemSidePort::MemSidePort(const std::string &_name,
       _snoopRespQueue(*_cache, *this, _label), cache(_cache)
 {
 }
+
+void
+WriteAllocator::updateMode(Addr write_addr, unsigned write_size,
+                           Addr blk_addr)
+{
+    // check if we are continuing where the last write ended
+    if (nextAddr == write_addr) {
+        delayCtr[blk_addr] = delayThreshold;
+        // stop if we have already saturated
+        if (mode != WriteMode::NO_ALLOCATE) {
+            byteCount += write_size;
+            // switch to streaming mode if we have passed the lower
+            // threshold
+            if (mode == WriteMode::ALLOCATE &&
+                byteCount > coalesceLimit) {
+                mode = WriteMode::COALESCE;
+                DPRINTF(Cache, "Switched to write coalescing\n");
+            } else if (mode == WriteMode::COALESCE &&
+                       byteCount > noAllocateLimit) {
+                // and continue and switch to non-allocating mode if we
+                // pass the upper threshold
+                mode = WriteMode::NO_ALLOCATE;
+                DPRINTF(Cache, "Switched to write-no-allocate\n");
+            }
+        }
+    } else {
+        // we did not see a write matching the previous one, start
+        // over again
+        byteCount = write_size;
+        mode = WriteMode::ALLOCATE;
+        resetDelay(blk_addr);
+    }
+    nextAddr = write_addr + write_size;
+}
+
+WriteAllocator*
+WriteAllocatorParams::create()
+{
+    return new WriteAllocator(this);
+}
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index 47218f828..b9fd7f943 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -73,6 +73,7 @@
 #include "mem/packet_queue.hh"
 #include "mem/qport.hh"
 #include "mem/request.hh"
+#include "params/WriteAllocator.hh"
 #include "sim/eventq.hh"
 #include "sim/serialize.hh"
 #include "sim/sim_exit.hh"
@@ -328,6 +329,22 @@ class BaseCache : public MemObject
      */
     const bool prefetchOnAccess;
 
+    /**
+     * The writeAllocator drive optimizations for streaming writes.
+     * It first determines whether a WriteReq MSHR should be delayed,
+     * thus ensuring that we wait longer in cases when we are write
+     * coalescing and allowing all the bytes of the line to be written
+     * before the MSHR packet is sent downstream. This works in unison
+     * with the tracking in the MSHR to check if the entire line is
+     * written. The write mode also affects the behaviour on filling
+     * any whole-line writes. Normally the cache allocates the line
+     * when receiving the InvalidateResp, but after seeing enough
+     * consecutive lines we switch to using the tempBlock, and thus
+     * end up not allocating the line, and instead turning the
+     * whole-line write into a writeback straight away.
+     */
+    WriteAllocator * const writeAllocator;
+
     /**
      * Temporary cache block for occasional transitory use.  We use
      * the tempBlock to fill when allocation fails (e.g., when there
@@ -1161,4 +1178,136 @@ class BaseCache : public MemObject
 
 };
 
+/**
+ * The write allocator inspects write packets and detects streaming
+ * patterns. The write allocator supports a single stream where writes
+ * are expected to access consecutive locations and keeps track of
+ * size of the area covered by the concecutive writes in byteCount.
+ *
+ * 1) When byteCount has surpassed the coallesceLimit the mode
+ * switches from ALLOCATE to COALESCE where writes should be delayed
+ * until the whole block is written at which point a single packet
+ * (whole line write) can service them.
+ *
+ * 2) When byteCount has also exceeded the noAllocateLimit (whole
+ * line) we switch to NO_ALLOCATE when writes should not allocate in
+ * the cache but rather send a whole line write to the memory below.
+ */
+class WriteAllocator : public SimObject {
+  public:
+    WriteAllocator(const WriteAllocatorParams *p) :
+        SimObject(p),
+        coalesceLimit(p->coalesce_limit * p->block_size),
+        noAllocateLimit(p->no_allocate_limit * p->block_size),
+        delayThreshold(p->delay_threshold)
+    {
+        reset();
+    }
+
+    /**
+     * Should writes be coalesced? This is true if the mode is set to
+     * NO_ALLOCATE.
+     *
+     * @return return true if the cache should coalesce writes.
+     */
+    bool coalesce() const {
+        return mode != WriteMode::ALLOCATE;
+    }
+
+    /**
+     * Should writes allocate?
+     *
+     * @return return true if the cache should not allocate for writes.
+     */
+    bool allocate() const {
+        return mode != WriteMode::NO_ALLOCATE;
+    }
+
+    /**
+     * Reset the write allocator state, meaning that it allocates for
+     * writes and has not recorded any information about qualifying
+     * writes that might trigger a switch to coalescing and later no
+     * allocation.
+     */
+    void reset() {
+        mode = WriteMode::ALLOCATE;
+        byteCount = 0;
+        nextAddr = 0;
+    }
+
+    /**
+     * Access whether we need to delay the current write.
+     *
+     * @param blk_addr The block address the packet writes to
+     * @return true if the current packet should be delayed
+     */
+    bool delay(Addr blk_addr) {
+        if (delayCtr[blk_addr] > 0) {
+            --delayCtr[blk_addr];
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * Clear delay counter for the input block
+     *
+     * @param blk_addr The accessed cache block
+     */
+    void resetDelay(Addr blk_addr) {
+        delayCtr.erase(blk_addr);
+    }
+
+    /**
+     * Update the write mode based on the current write
+     * packet. This method compares the packet's address with any
+     * current stream, and updates the tracking and the mode
+     * accordingly.
+     *
+     * @param write_addr Start address of the write request
+     * @param write_size Size of the write request
+     * @param blk_addr The block address that this packet writes to
+     */
+    void updateMode(Addr write_addr, unsigned write_size, Addr blk_addr);
+
+  private:
+    /**
+     * The current mode for write coalescing and allocation, either
+     * normal operation (ALLOCATE), write coalescing (COALESCE), or
+     * write coalescing without allocation (NO_ALLOCATE).
+     */
+    enum class WriteMode : char {
+        ALLOCATE,
+        COALESCE,
+        NO_ALLOCATE,
+    };
+    WriteMode mode;
+
+    /** Address to match writes against to detect streams. */
+    Addr nextAddr;
+
+    /**
+     * Bytes written contiguously. Saturating once we no longer
+     * allocate.
+     */
+    uint32_t byteCount;
+
+    /**
+     * Limits for when to switch between the different write modes.
+     */
+    const uint32_t coalesceLimit;
+    const uint32_t noAllocateLimit;
+    /**
+     * The number of times the allocator will delay an WriteReq MSHR.
+     */
+    const uint32_t delayThreshold;
+
+    /**
+     * Keep track of the number of times the allocator has delayed an
+     * WriteReq MSHR.
+     */
+    std::unordered_map<Addr, Counter> delayCtr;
+};
+
 #endif //__MEM_CACHE_BASE_HH__
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index d38680eb8..fb0eb9d05 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -623,8 +623,9 @@ Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
 
                 // write-line request to the cache that promoted
                 // the write to a whole line
-                blk = handleFill(bus_pkt, blk, writebacks,
-                                 allocOnFill(pkt->cmd));
+                const bool allocate = allocOnFill(pkt->cmd) &&
+                    (!writeAllocator || writeAllocator->allocate());
+                blk = handleFill(bus_pkt, blk, writebacks, allocate);
                 assert(blk != NULL);
                 is_invalidate = false;
                 satisfyRequest(pkt, blk);
-- 
2.30.2