cache: coherence protocol enhancements & bug fixes
[gem5.git] / src / mem / cache / cache_impl.hh
index c1b01d6762ed70a278561bd282a7e4035e1293e4..8d2806b8d2aeb96cec2da1ac9f8aabee02bdff79 100644 (file)
@@ -1,5 +1,18 @@
 /*
+ * Copyright (c) 2010 ARM Limited
+ * All rights reserved.
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 2002-2005 The Regents of The University of Michigan
+ * Copyright (c) 2010 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * Cache definitions.
  */
 
-#include "sim/host.hh"
+#include "base/fast_alloc.hh"
 #include "base/misc.hh"
-
+#include "base/range.hh"
+#include "base/types.hh"
+#include "mem/cache/blk.hh"
 #include "mem/cache/cache.hh"
-#include "mem/cache/cache_blk.hh"
-#include "mem/cache/miss/mshr.hh"
-#include "mem/cache/prefetch/base_prefetcher.hh"
-
-#include "sim/sim_exit.hh" // for SimExitEvent
-
+#include "mem/cache/mshr.hh"
+#include "mem/cache/prefetch/base.hh"
+#include "sim/sim_exit.hh"
 
 template<class TagStore>
-Cache<TagStore>::Cache(const std::string &_name,
-                       Cache<TagStore>::Params &params)
-    : BaseCache(_name, params.baseParams),
-      prefetchAccess(params.prefetchAccess),
-      tags(params.tags),
-      prefetcher(params.prefetcher),
-      doFastWrites(params.doFastWrites),
-      prefetchMiss(params.prefetchMiss)
+Cache<TagStore>::Cache(const Params *p, TagStore *tags, BasePrefetcher *pf)
+    : BaseCache(p),
+      tags(tags),
+      prefetcher(pf),
+      doFastWrites(true),
+      prefetchOnAccess(p->prefetch_on_access)
 {
     tempBlock = new BlkType();
     tempBlock->data = new uint8_t[blkSize];
 
-    cpuSidePort = new CpuSidePort(_name + "-cpu_side_port", this);
-    memSidePort = new MemSidePort(_name + "-mem_side_port", this);
+    cpuSidePort = new CpuSidePort(p->name + "-cpu_side_port", this,
+                                  "CpuSidePort");
+    memSidePort = new MemSidePort(p->name + "-mem_side_port", this,
+                                  "MemSidePort");
     cpuSidePort->setOtherPort(memSidePort);
     memSidePort->setOtherPort(cpuSidePort);
 
     tags->setCache(this);
-    prefetcher->setCache(this);
+    if (prefetcher)
+        prefetcher->setCache(this);
 }
 
 template<class TagStore>
@@ -76,7 +89,8 @@ Cache<TagStore>::regStats()
 {
     BaseCache::regStats();
     tags->regStats(name());
-    prefetcher->regStats(name());
+    if (prefetcher)
+        prefetcher->regStats(name());
 }
 
 template<class TagStore>
@@ -88,7 +102,11 @@ Cache<TagStore>::getPort(const std::string &if_name, int idx)
     } else if (if_name == "mem_side") {
         return memSidePort;
     } else if (if_name == "functional") {
-        return new CpuSidePort(name() + "-cpu_side_funcport", this);
+        CpuSidePort *funcPort =
+            new CpuSidePort(name() + "-cpu_side_funcport", this,
+                            "CpuSideFuncPort");
+        funcPort->setOtherPort(memSidePort);
+        return funcPort;
     } else {
         panic("Port name %s unrecognized\n", if_name);
     }
@@ -138,23 +156,27 @@ Cache<TagStore>::cmpAndSwap(BlkType *blk, PacketPtr pkt)
             panic("Invalid size for conditional read/write\n");
     }
 
-    if (overwrite_mem)
+    if (overwrite_mem) {
         std::memcpy(blk_data, &overwrite_val, pkt->getSize());
+        blk->status |= BlkDirty;
+    }
 }
 
 
 template<class TagStore>
 void
-Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk)
+Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk,
+                                       bool deferred_response,
+                                       bool pending_downgrade)
 {
-    assert(blk);
+    assert(blk && blk->isValid());
     // Occasionally this is not true... if we are a lower-level cache
     // satisfying a string of Read and ReadEx requests from
     // upper-level caches, a Read will mark the block as shared but we
     // can satisfy a following ReadEx anyway since we can rely on the
     // Read requester(s) to have buffered the ReadEx snoop and to
     // invalidate their blocks after receiving them.
-    // assert(pkt->needsExclusive() ? blk->isWritable() : blk->isValid());
+    // assert(!pkt->needsExclusive() || blk->isWritable());
     assert(pkt->getOffset(blkSize) + pkt->getSize() <= blkSize);
 
     // Check RMW operations first since both isRead() and
@@ -163,11 +185,11 @@ Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk)
         cmpAndSwap(blk, pkt);
     } else if (pkt->isWrite()) {
         if (blk->checkWrite(pkt)) {
-            blk->status |= BlkDirty;
             pkt->writeDataToBlock(blk->data, blkSize);
+            blk->status |= BlkDirty;
         }
     } else if (pkt->isRead()) {
-        if (pkt->isLocked()) {
+        if (pkt->isLLSC()) {
             blk->trackLoadLocked(pkt);
         }
         pkt->setDataFromBlock(blk->data, blkSize);
@@ -175,20 +197,50 @@ Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk)
             // special handling for coherent block requests from
             // upper-level caches
             if (pkt->needsExclusive()) {
-                // on ReadExReq we give up our copy
+                // if we have a dirty copy, make sure the recipient
+                // keeps it marked dirty
+                if (blk->isDirty()) {
+                    pkt->assertMemInhibit();
+                }
+                // on ReadExReq we give up our copy unconditionally
                 tags->invalidateBlk(blk);
+            } else if (blk->isWritable() && !pending_downgrade
+                       && !pkt->sharedAsserted()) {
+                // we can give the requester an exclusive copy (by not
+                // asserting shared line) on a read request if:
+                // - we have an exclusive copy at this level (& below)
+                // - we don't have a pending snoop from below
+                //   signaling another read request
+                // - no other cache above has a copy (otherwise it
+                //   would have asserted shared line on request)
+                
+                if (blk->isDirty()) {
+                    // special considerations if we're owner:
+                    if (!deferred_response) {
+                        // if we are responding immediately and can
+                        // signal that we're transferring ownership
+                        // along with exclusivity, do so
+                        pkt->assertMemInhibit();
+                        blk->status &= ~BlkDirty;
+                    } else {
+                        // if we're responding after our own miss,
+                        // there's a window where the recipient didn't
+                        // know it was getting ownership and may not
+                        // have responded to snoops correctly, so we
+                        // can't pass off ownership *or* exclusivity
+                        pkt->assertShared();
+                    }
+                }
             } else {
-                // on ReadReq we create shareable copies here and in
-                // the requester
+                // otherwise only respond with a shared copy
                 pkt->assertShared();
-                blk->status &= ~BlkWritable;
             }
         }
     } else {
         // Not a read or write... must be an upgrade.  it's OK
         // to just ack those as long as we have an exclusive
         // copy at this level.
-        assert(pkt->cmd == MemCmd::UpgradeReq);
+        assert(pkt->isUpgrade());
         tags->invalidateBlk(blk);
     }
 }
@@ -203,9 +255,9 @@ Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk)
 
 template<class TagStore>
 void
-Cache<TagStore>::markInService(MSHR *mshr)
+Cache<TagStore>::markInService(MSHR *mshr, PacketPtr pkt)
 {
-    markInServiceInternal(mshr);
+    markInServiceInternal(mshr, pkt);
 #if 0
         if (mshr->originalCmd == MemCmd::HardPFReq) {
             DPRINTF(HWPrefetch, "%s:Marking a HW_PF in service\n",
@@ -250,78 +302,83 @@ Cache<TagStore>::squash(int threadNum)
 
 template<class TagStore>
 bool
-Cache<TagStore>::access(PacketPtr pkt, BlkType *&blk, int &lat)
+Cache<TagStore>::access(PacketPtr pkt, BlkType *&blk,
+                        int &lat, PacketList &writebacks)
 {
-    if (pkt->req->isUncacheable())  {
+    if (pkt->req->isUncacheable()) {
+        if (pkt->req->isClrex()) {
+            tags->clearLocks();
+        } else {
+           blk = tags->findBlock(pkt->getAddr());
+           if (blk != NULL) {
+               tags->invalidateBlk(blk);
+           }
+        }
+
         blk = NULL;
         lat = hitLatency;
         return false;
     }
 
-    bool satisfied = false;  // assume the worst
-    blk = tags->findBlock(pkt->getAddr(), lat);
+    int id = pkt->req->hasContextId() ? pkt->req->contextId() : -1;
+    blk = tags->accessBlock(pkt->getAddr(), lat, id);
 
-    if (prefetchAccess) {
-        //We are determining prefetches on access stream, call prefetcher
-        prefetcher->handleMiss(pkt, curTick);
-    }
-
-    DPRINTF(Cache, "%s %x %s\n", pkt->cmdString(), pkt->getAddr(),
-            (blk) ? "hit" : "miss");
+    DPRINTF(Cache, "%s%s %x %s\n", pkt->cmdString(),
+            pkt->req->isInstFetch() ? " (ifetch)" : "",
+            pkt->getAddr(), (blk) ? "hit" : "miss");
 
     if (blk != NULL) {
-        // HIT
-        if (blk->isPrefetch()) {
-            //Signal that this was a hit under prefetch (no need for
-            //use prefetch (only can get here if true)
-            DPRINTF(HWPrefetch, "Hit a block that was prefetched\n");
-            blk->status &= ~BlkHWPrefetched;
-            if (prefetchMiss) {
-                //If we are using the miss stream, signal the
-                //prefetcher otherwise the access stream would have
-                //already signaled this hit
-                prefetcher->handleMiss(pkt, curTick);
-            }
-        }
 
-        if (pkt->needsExclusive() ? blk->isWritable() : blk->isValid()) {
+        if (pkt->needsExclusive() ? blk->isWritable() : blk->isReadable()) {
             // OK to satisfy access
-            hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
-            satisfied = true;
+            incHitCount(pkt, id);
             satisfyCpuSideRequest(pkt, blk);
-        } else if (pkt->cmd == MemCmd::Writeback) {
-            // special case: writeback to read-only block (e.g., from
-            // L1 into L2).  since we're really just passing ownership
-            // from one cache to another, we can update this cache to
-            // be the owner without making the block writeable
-            assert(!blk->isWritable() /* && !blk->isDirty() */);
-            assert(blkSize == pkt->getSize());
-            std::memcpy(blk->data, pkt->getPtr<uint8_t>(), blkSize);
-            blk->status |= BlkDirty;
-            satisfied = true;
-            // nothing else to do; writeback doesn't expect response
-            assert(!pkt->needsResponse());
-        } else {
-            // permission violation... nothing to do here, leave unsatisfied
-            // for statistics purposes this counts like a complete miss
-            incMissCount(pkt);
+            return true;
         }
-    } else {
-        // complete miss (no matching block)
-        incMissCount(pkt);
+    }
 
-        if (pkt->isLocked() && pkt->isWrite()) {
-            // miss on store conditional... just give up now
-            pkt->req->setExtraData(0);
-            satisfied = true;
+    // Can't satisfy access normally... either no block (blk == NULL)
+    // or have block but need exclusive & only have shared.
+
+    // Writeback handling is special case.  We can write the block
+    // into the cache without having a writeable copy (or any copy at
+    // all).
+    if (pkt->cmd == MemCmd::Writeback) {
+        assert(blkSize == pkt->getSize());
+        if (blk == NULL) {
+            // need to do a replacement
+            blk = allocateBlock(pkt->getAddr(), writebacks);
+            if (blk == NULL) {
+                // no replaceable block available, give up.
+                // writeback will be forwarded to next level.
+                incMissCount(pkt, id);
+                return false;
+            }
+            int id = pkt->req->hasContextId() ? pkt->req->contextId() : -1;
+            tags->insertBlock(pkt->getAddr(), blk, id);
+            blk->status = BlkValid | BlkReadable;
         }
+        std::memcpy(blk->data, pkt->getPtr<uint8_t>(), blkSize);
+        blk->status |= BlkDirty;
+        // nothing else to do; writeback doesn't expect response
+        assert(!pkt->needsResponse());
+        incHitCount(pkt, id);
+        return true;
+    }
+
+    incMissCount(pkt, id);
+
+    if (blk == NULL && pkt->isLLSC() && pkt->isWrite()) {
+        // complete miss on store conditional... just give up now
+        pkt->req->setExtraData(0);
+        return true;
     }
 
-    return satisfied;
+    return false;
 }
 
 
-class ForwardResponseRecord : public Packet::SenderState
+class ForwardResponseRecord : public Packet::SenderState, public FastAlloc
 {
     Packet::SenderState *prevSenderState;
     int prevSrc;
@@ -382,10 +439,22 @@ Cache<TagStore>::timingAccess(PacketPtr pkt)
             memSidePort->sendTiming(snoopPkt);
             // main memory will delete snoopPkt
         }
+        // since we're the official target but we aren't responding,
+        // delete the packet now.
+        delete pkt;
         return true;
     }
 
     if (pkt->req->isUncacheable()) {
+        if (pkt->req->isClrex()) {
+            tags->clearLocks();
+        } else {
+            BlkType *blk = tags->findBlock(pkt->getAddr());
+            if (blk != NULL) {
+                tags->invalidateBlk(blk);
+            }
+        }
+
         // writes go in write buffer, reads use MSHR
         if (pkt->isWrite() && !pkt->isRead()) {
             allocateWriteBuffer(pkt, time, true);
@@ -397,29 +466,16 @@ Cache<TagStore>::timingAccess(PacketPtr pkt)
     }
 
     int lat = hitLatency;
-    bool satisfied = false;
-
-    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
-    MSHR *mshr = mshrQueue.findMatch(blk_addr);
+    BlkType *blk = NULL;
+    PacketList writebacks;
 
-    if (!mshr) {
-        // no outstanding access to this block, look up in cache
-        // (otherwise if we allow reads while there's an outstanding
-        // write miss, the read could return stale data out of the
-        // cache block... a more aggressive system could detect the
-        // overlap (if any) and forward data out of the MSHRs, but we
-        // don't do that yet)
-        BlkType *blk = NULL;
-        satisfied = access(pkt, blk, lat);
-    }
+    bool satisfied = access(pkt, blk, lat, writebacks);
 
 #if 0
-    PacketList writebacks;
+    /** @todo make the fast write alloc (wh64) work with coherence. */
 
     // If this is a block size write/hint (WH64) allocate the block here
     // if the coherence protocol allows it.
-    /** @todo make the fast write alloc (wh64) work with coherence. */
-    /** @todo Do we want to do fast writes for writebacks as well? */
     if (!blk && pkt->getSize() >= blkSize && coherence->allowFastWrites() &&
         (pkt->cmd == MemCmd::WriteReq
          || pkt->cmd == MemCmd::WriteInvalidateReq) ) {
@@ -435,15 +491,11 @@ Cache<TagStore>::timingAccess(PacketPtr pkt)
             ++fastWrites;
         }
     }
-
-    // copy writebacks to write buffer
-    while (!writebacks.empty()) {
-        PacketPtr wbPkt = writebacks.front();
-        allocateWriteBuffer(wbPkt, time, true);
-        writebacks.pop_front();
-    }
 #endif
 
+    // track time of availability of next prefetch, if any
+    Tick next_pf_time = 0;
+
     bool needsResponse = pkt->needsResponse();
 
     if (satisfied) {
@@ -453,16 +505,23 @@ Cache<TagStore>::timingAccess(PacketPtr pkt)
         } else {
             delete pkt;
         }
+
+        if (prefetcher && (prefetchOnAccess || (blk && blk->wasPrefetched()))) {
+            if (blk)
+                blk->status &= ~BlkHWPrefetched;
+            next_pf_time = prefetcher->notify(pkt, time);
+        }
     } else {
         // miss
-        if (prefetchMiss)
-            prefetcher->handleMiss(pkt, time);
+
+        Addr blk_addr = blockAlign(pkt->getAddr());
+        MSHR *mshr = mshrQueue.findMatch(blk_addr);
 
         if (mshr) {
             // MSHR hit
             //@todo remove hw_pf here
-            mshr_hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
-            if (mshr->threadNum != 0/*pkt->req->getThreadNum()*/) {
+            mshr_hits[pkt->cmdToIndex()][0/*pkt->req->threadId()*/]++;
+            if (mshr->threadNum != 0/*pkt->req->threadId()*/) {
                 mshr->threadNum = -1;
             }
             mshr->allocateTarget(pkt, time, order++);
@@ -476,22 +535,57 @@ Cache<TagStore>::timingAccess(PacketPtr pkt)
             }
         } else {
             // no MSHR
-            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->threadId()*/]++;
             // always mark as cache fill for now... if we implement
             // no-write-allocate or bypass accesses this will have to
             // be changed.
             if (pkt->cmd == MemCmd::Writeback) {
                 allocateWriteBuffer(pkt, time, true);
             } else {
+                if (blk && blk->isValid()) {
+                    // If we have a write miss to a valid block, we
+                    // need to mark the block non-readable.  Otherwise
+                    // if we allow reads while there's an outstanding
+                    // write miss, the read could return stale data
+                    // out of the cache block... a more aggressive
+                    // system could detect the overlap (if any) and
+                    // forward data out of the MSHRs, but we don't do
+                    // that yet.  Note that we do need to leave the
+                    // block valid so that it stays in the cache, in
+                    // case we get an upgrade response (and hence no
+                    // new data) when the write miss completes.
+                    // As long as CPUs do proper store/load forwarding
+                    // internally, and have a sufficiently weak memory
+                    // model, this is probably unnecessary, but at some
+                    // point it must have seemed like we needed it...
+                    assert(pkt->needsExclusive() && !blk->isWritable());
+                    blk->status &= ~BlkReadable;
+                }
+
                 allocateMissBuffer(pkt, time, true);
             }
+
+            if (prefetcher) {
+                next_pf_time = prefetcher->notify(pkt, time);
+            }
         }
     }
 
+    if (next_pf_time != 0)
+        requestMemSideBus(Request_PF, std::max(time, next_pf_time));
+
+    // copy writebacks to write buffer
+    while (!writebacks.empty()) {
+        PacketPtr wbPkt = writebacks.front();
+        allocateWriteBuffer(wbPkt, time, true);
+        writebacks.pop_front();
+    }
+
     return true;
 }
 
 
+// See comment in cache.hh.
 template<class TagStore>
 PacketPtr
 Cache<TagStore>::getBusPacket(PacketPtr cpu_pkt, BlkType *blk,
@@ -500,18 +594,15 @@ Cache<TagStore>::getBusPacket(PacketPtr cpu_pkt, BlkType *blk,
     bool blkValid = blk && blk->isValid();
 
     if (cpu_pkt->req->isUncacheable()) {
-        assert(blk == NULL);
+        //assert(blk == NULL);
         return NULL;
     }
 
     if (!blkValid &&
-        (cpu_pkt->cmd == MemCmd::Writeback ||
-         cpu_pkt->cmd == MemCmd::UpgradeReq)) {
-            // For now, writebacks from upper-level caches that
-            // completely miss in the cache just go through. If we had
-            // "fast write" support (where we could write the whole
-            // block w/o fetching new data) we might want to allocate
-            // on writeback misses instead.
+        (cpu_pkt->cmd == MemCmd::Writeback || cpu_pkt->isUpgrade())) {
+        // Writebacks that weren't allocated in access() and upgrades
+        // from upper-level caches that missed completely just go
+        // through.
         return NULL;
     }
 
@@ -527,7 +618,7 @@ Cache<TagStore>::getBusPacket(PacketPtr cpu_pkt, BlkType *blk,
         // only reason to be here is that blk is shared
         // (read-only) and we need exclusive
         assert(needsExclusive && !blk->isWritable());
-        cmd = MemCmd::UpgradeReq;
+        cmd = cpu_pkt->isLLSC() ? MemCmd::SCUpgradeReq : MemCmd::UpgradeReq;
     } else {
         // block is invalid
         cmd = needsExclusive ? MemCmd::ReadExReq : MemCmd::ReadReq;
@@ -577,47 +668,79 @@ Cache<TagStore>::atomicAccess(PacketPtr pkt)
     // access in timing mode
 
     BlkType *blk = NULL;
+    PacketList writebacks;
 
-    if (!access(pkt, blk, lat)) {
+    if (!access(pkt, blk, lat, writebacks)) {
         // MISS
-        PacketPtr busPkt = getBusPacket(pkt, blk, pkt->needsExclusive());
+        PacketPtr bus_pkt = getBusPacket(pkt, blk, pkt->needsExclusive());
 
-        bool isCacheFill = (busPkt != NULL);
+        bool is_forward = (bus_pkt == NULL);
 
-        if (busPkt == NULL) {
+        if (is_forward) {
             // just forwarding the same request to the next level
             // no local cache operation involved
-            busPkt = pkt;
+            bus_pkt = pkt;
         }
 
         DPRINTF(Cache, "Sending an atomic %s for %x\n",
-                busPkt->cmdString(), busPkt->getAddr());
+                bus_pkt->cmdString(), bus_pkt->getAddr());
 
 #if TRACING_ON
         CacheBlk::State old_state = blk ? blk->status : 0;
 #endif
 
-        lat += memSidePort->sendAtomic(busPkt);
+        lat += memSidePort->sendAtomic(bus_pkt);
 
         DPRINTF(Cache, "Receive response: %s for addr %x in state %i\n",
-                busPkt->cmdString(), busPkt->getAddr(), old_state);
-
-        if (isCacheFill) {
-            PacketList writebacks;
-            blk = handleFill(busPkt, blk, writebacks);
-            satisfyCpuSideRequest(pkt, blk);
-            delete busPkt;
-
-            // Handle writebacks if needed
-            while (!writebacks.empty()){
-                PacketPtr wbPkt = writebacks.front();
-                memSidePort->sendAtomic(wbPkt);
-                writebacks.pop_front();
-                delete wbPkt;
+                bus_pkt->cmdString(), bus_pkt->getAddr(), old_state);
+
+        assert(!bus_pkt->wasNacked());
+
+        // If packet was a forward, the response (if any) is already
+        // in place in the bus_pkt == pkt structure, so we don't need
+        // to do anything.  Otherwise, use the separate bus_pkt to
+        // generate response to pkt and then delete it.
+        if (!is_forward) {
+            if (pkt->needsResponse()) {
+                assert(bus_pkt->isResponse());
+                if (bus_pkt->isError()) {
+                    pkt->makeAtomicResponse();
+                    pkt->copyError(bus_pkt);
+                } else if (bus_pkt->isRead() ||
+                           bus_pkt->cmd == MemCmd::UpgradeResp) {
+                    // we're updating cache state to allow us to
+                    // satisfy the upstream request from the cache
+                    blk = handleFill(bus_pkt, blk, writebacks);
+                    satisfyCpuSideRequest(pkt, blk);
+                } else {
+                    // we're satisfying the upstream request without
+                    // modifying cache state, e.g., a write-through
+                    pkt->makeAtomicResponse();
+                }
             }
+            delete bus_pkt;
         }
     }
 
+    // Note that we don't invoke the prefetcher at all in atomic mode.
+    // It's not clear how to do it properly, particularly for
+    // prefetchers that aggressively generate prefetch candidates and
+    // rely on bandwidth contention to throttle them; these will tend
+    // to pollute the cache in atomic mode since there is no bandwidth
+    // contention.  If we ever do want to enable prefetching in atomic
+    // mode, though, this is the place to do it... see timingAccess()
+    // for an example (though we'd want to issue the prefetch(es)
+    // immediately rather than calling requestMemSideBus() as we do
+    // there).
+
+    // Handle writebacks if needed
+    while (!writebacks.empty()){
+        PacketPtr wbPkt = writebacks.front();
+        memSidePort->sendAtomic(wbPkt);
+        writebacks.pop_front();
+        delete wbPkt;
+    }
+
     // We now have the block one way or another (hit or completed miss)
 
     if (pkt->needsResponse()) {
@@ -631,21 +754,27 @@ Cache<TagStore>::atomicAccess(PacketPtr pkt)
 template<class TagStore>
 void
 Cache<TagStore>::functionalAccess(PacketPtr pkt,
+                                  CachePort *incomingPort,
                                   CachePort *otherSidePort)
 {
-    Addr blk_addr = pkt->getAddr() & ~(blkSize - 1);
+    Addr blk_addr = blockAlign(pkt->getAddr());
     BlkType *blk = tags->findBlock(pkt->getAddr());
 
-    if (blk && pkt->checkFunctional(blk_addr, blkSize, blk->data)) {
-        // request satisfied from block
-        return;
-    }
+    pkt->pushLabel(name());
 
-    // Need to check for outstanding misses and writes; if neither one
-    // satisfies, then forward to other side of cache.
-    if (!(mshrQueue.checkFunctional(pkt, blk_addr) ||
-          writeBuffer.checkFunctional(pkt, blk_addr))) {
-        otherSidePort->checkAndSendFunctional(pkt);
+    CacheBlkPrintWrapper cbpw(blk);
+    bool done =
+        (blk && pkt->checkFunctional(&cbpw, blk_addr, blkSize, blk->data))
+        || incomingPort->checkFunctional(pkt)
+        || mshrQueue.checkFunctional(pkt, blk_addr)
+        || writeBuffer.checkFunctional(pkt, blk_addr)
+        || otherSidePort->checkFunctional(pkt);
+
+    // We're leaving the cache, so pop cache->name() label
+    pkt->popLabel();
+
+    if (!done) {
+        otherSidePort->sendFunctional(pkt);
     }
 }
 
@@ -663,6 +792,8 @@ Cache<TagStore>::handleResponse(PacketPtr pkt)
 {
     Tick time = curTick + hitLatency;
     MSHR *mshr = dynamic_cast<MSHR*>(pkt->senderState);
+    bool is_error = pkt->isError();
+
     assert(mshr);
 
     if (pkt->wasNacked()) {
@@ -671,7 +802,11 @@ Cache<TagStore>::handleResponse(PacketPtr pkt)
              "not implemented\n");
         return;
     }
-    assert(!pkt->isError());
+    if (is_error) {
+        DPRINTF(Cache, "Cache received packet with error for address %x, "
+                "cmd: %s\n", pkt->getAddr(), pkt->cmdString());
+    }
+
     DPRINTF(Cache, "Handling response to %x\n", pkt->getAddr());
 
     MSHRQueue *mq = mshr->queue;
@@ -691,14 +826,17 @@ Cache<TagStore>::handleResponse(PacketPtr pkt)
     PacketList writebacks;
 
     if (pkt->req->isUncacheable()) {
-        mshr_uncacheable_lat[stats_cmd_idx][0/*pkt->req->getThreadNum()*/] +=
+        mshr_uncacheable_lat[stats_cmd_idx][0/*pkt->req->threadId()*/] +=
             miss_latency;
     } else {
-        mshr_miss_latency[stats_cmd_idx][0/*pkt->req->getThreadNum()*/] +=
+        mshr_miss_latency[stats_cmd_idx][0/*pkt->req->threadId()*/] +=
             miss_latency;
     }
 
-    if (mshr->isCacheFill) {
+    bool is_fill = !mshr->isForward &&
+        (pkt->isRead() || pkt->cmd == MemCmd::UpgradeResp);
+
+    if (is_fill && !is_error) {
         DPRINTF(Cache, "Block for addr %x being updated in Cache\n",
                 pkt->getAddr());
 
@@ -719,10 +857,12 @@ Cache<TagStore>::handleResponse(PacketPtr pkt)
     while (mshr->hasTargets()) {
         MSHR::Target *target = mshr->getTarget();
 
-        if (target->isCpuSide()) {
+        switch (target->source) {
+          case MSHR::Target::FromCPU:
             Tick completion_time;
-            if (blk != NULL) {
-                satisfyCpuSideRequest(target->pkt, blk);
+            if (is_fill) {
+                satisfyCpuSideRequest(target->pkt, blk,
+                                      true, mshr->hasPostDowngrade());
                 // How many bytes past the first request is this one
                 int transfer_offset =
                     target->pkt->getOffset(blkSize) - initial_offset;
@@ -732,30 +872,76 @@ Cache<TagStore>::handleResponse(PacketPtr pkt)
 
                 // If critical word (no offset) return first word time
                 completion_time = tags->getHitLatency() +
-                    transfer_offset ? pkt->finishTime : pkt->firstWordTime;
+                    (transfer_offset ? pkt->finishTime : pkt->firstWordTime);
 
                 assert(!target->pkt->req->isUncacheable());
-                missLatency[target->pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] +=
+                missLatency[target->pkt->cmdToIndex()][0/*pkt->req->threadId()*/] +=
                     completion_time - target->recvTime;
+            } else if (target->pkt->cmd == MemCmd::StoreCondReq &&
+                       pkt->cmd == MemCmd::UpgradeFailResp) {
+                // failed StoreCond upgrade
+                completion_time = tags->getHitLatency() + pkt->finishTime;
+                target->pkt->req->setExtraData(0);
             } else {
                 // not a cache fill, just forwarding response
                 completion_time = tags->getHitLatency() + pkt->finishTime;
-                if (pkt->isRead()) {
+                if (pkt->isRead() && !is_error) {
                     target->pkt->setData(pkt->getPtr<uint8_t>());
                 }
             }
             target->pkt->makeTimingResponse();
+            // if this packet is an error copy that to the new packet
+            if (is_error)
+                target->pkt->copyError(pkt);
+            if (target->pkt->cmd == MemCmd::ReadResp &&
+                (pkt->isInvalidate() || mshr->hasPostInvalidate())) {
+                // If intermediate cache got ReadRespWithInvalidate,
+                // propagate that.  Response should not have
+                // isInvalidate() set otherwise.
+                target->pkt->cmd = MemCmd::ReadRespWithInvalidate;
+            }
             cpuSidePort->respond(target->pkt, completion_time);
-        } else {
+            break;
+
+          case MSHR::Target::FromPrefetcher:
+            assert(target->pkt->cmd == MemCmd::HardPFReq);
+            if (blk)
+                blk->status |= BlkHWPrefetched;
+            delete target->pkt->req;
+            delete target->pkt;
+            break;
+
+          case MSHR::Target::FromSnoop:
+            // I don't believe that a snoop can be in an error state
+            assert(!is_error);
             // response to snoop request
             DPRINTF(Cache, "processing deferred snoop...\n");
-            handleSnoop(target->pkt, blk, true, true);
+            assert(!(pkt->isInvalidate() && !mshr->hasPostInvalidate()));
+            handleSnoop(target->pkt, blk, true, true,
+                        mshr->hasPostInvalidate());
+            break;
+
+          default:
+            panic("Illegal target->source enum %d\n", target->source);
         }
 
         mshr->popTarget();
     }
 
+    if (blk) {
+        if (pkt->isInvalidate() || mshr->hasPostInvalidate()) {
+            tags->invalidateBlk(blk);
+        } else if (mshr->hasPostDowngrade()) {
+            blk->status &= ~BlkWritable;
+        }
+    }
+
     if (mshr->promoteDeferredTargets()) {
+        // avoid a later read getting stale data while a write miss
+        // is outstanding... see comment in timingAccess()
+        if (blk) {
+            blk->status &= ~BlkReadable;
+        }
         MSHRQueue *mq = mshr->queue;
         mq->markPending(mshr);
         requestMemSideBus((RequestCause)mq->index, pkt->finishTime);
@@ -792,7 +978,7 @@ Cache<TagStore>::writebackBlk(BlkType *blk)
 {
     assert(blk && blk->isValid() && blk->isDirty());
 
-    writebacks[0/*pkt->req->getThreadNum()*/]++;
+    writebacks[0/*pkt->req->threadId()*/]++;
 
     Request *writebackReq =
         new Request(tags->regenerateBlkAddr(blk->tag, blk->set), blkSize, 0);
@@ -805,6 +991,39 @@ Cache<TagStore>::writebackBlk(BlkType *blk)
 }
 
 
+template<class TagStore>
+typename Cache<TagStore>::BlkType*
+Cache<TagStore>::allocateBlock(Addr addr, PacketList &writebacks)
+{
+    BlkType *blk = tags->findVictim(addr, writebacks);
+
+    if (blk->isValid()) {
+        Addr repl_addr = tags->regenerateBlkAddr(blk->tag, blk->set);
+        MSHR *repl_mshr = mshrQueue.findMatch(repl_addr);
+        if (repl_mshr) {
+            // must be an outstanding upgrade request on block
+            // we're about to replace...
+            assert(!blk->isWritable());
+            assert(repl_mshr->needsExclusive());
+            // too hard to replace block with transient state;
+            // allocation failed, block not inserted
+            return NULL;
+        } else {
+            DPRINTF(Cache, "replacement: replacing %x with %x: %s\n",
+                    repl_addr, addr,
+                    blk->isDirty() ? "writeback" : "clean");
+
+            if (blk->isDirty()) {
+                // Save writeback packet for handling by caller
+                writebacks.push_back(writebackBlk(blk));
+            }
+        }
+    }
+
+    return blk;
+}
+
+
 // Note that the reason we return a list of writebacks rather than
 // inserting them directly in the write buffer is that this function
 // is called by both atomic and timing-mode accesses, and in atomic
@@ -822,49 +1041,46 @@ Cache<TagStore>::handleFill(PacketPtr pkt, BlkType *blk,
 
     if (blk == NULL) {
         // better have read new data...
-        assert(pkt->isRead());
-
+        assert(pkt->hasData());
         // need to do a replacement
-        blk = tags->findReplacement(addr, writebacks);
-        if (blk->isValid()) {
-            Addr repl_addr = tags->regenerateBlkAddr(blk->tag, blk->set);
-            MSHR *repl_mshr = mshrQueue.findMatch(repl_addr);
-            if (repl_mshr) {
-                // must be an outstanding upgrade request on block
-                // we're about to replace...
-                assert(!blk->isWritable());
-                assert(repl_mshr->needsExclusive());
-                // too hard to replace block with transient state;
-                // just use temporary storage to complete the current
-                // request and then get rid of it
-                assert(!tempBlock->isValid());
-                blk = tempBlock;
-                tempBlock->set = tags->extractSet(addr);
-                DPRINTF(Cache, "using temp block for %x\n", addr);
-            } else {
-                DPRINTF(Cache, "replacement: replacing %x with %x: %s\n",
-                        repl_addr, addr,
-                        blk->isDirty() ? "writeback" : "clean");
-
-                if (blk->isDirty()) {
-                    // Save writeback packet for handling by caller
-                    writebacks.push_back(writebackBlk(blk));
-                }
-            }
+        blk = allocateBlock(addr, writebacks);
+        if (blk == NULL) {
+            // No replaceable block... just use temporary storage to
+            // complete the current request and then get rid of it
+            assert(!tempBlock->isValid());
+            blk = tempBlock;
+            tempBlock->set = tags->extractSet(addr);
+            tempBlock->tag = tags->extractTag(addr);
+            DPRINTF(Cache, "using temp block for %x\n", addr);
+        } else {
+            int id = pkt->req->hasContextId() ? pkt->req->contextId() : -1;
+            tags->insertBlock(pkt->getAddr(), blk, id);
         }
 
-        blk->tag = tags->extractTag(addr);
+        // starting from scratch with a new block
+        blk->status = 0;
     } else {
         // existing block... probably an upgrade
         assert(blk->tag == tags->extractTag(addr));
         // either we're getting new data or the block should already be valid
-        assert(pkt->isRead() || blk->isValid());
+        assert(pkt->hasData() || blk->isValid());
+        // don't clear block status... if block is already dirty we
+        // don't want to lose that
     }
 
-    if (pkt->needsExclusive() || !pkt->sharedAsserted()) {
-        blk->status = BlkValid | BlkWritable;
-    } else {
-        blk->status = BlkValid;
+    blk->status |= BlkValid | BlkReadable;
+
+    if (!pkt->sharedAsserted()) {
+        blk->status |= BlkWritable;
+        // If we got this via cache-to-cache transfer (i.e., from a
+        // cache that was an owner) and took away that owner's copy,
+        // then we need to write it back.  Normally this happens
+        // anyway as a side effect of getting a copy to write it, but
+        // there are cases (such as failed store conditionals or
+        // compare-and-swaps) where we'll demand an exclusive copy but
+        // end up not writing it.
+        if (pkt->memInhibitAsserted())
+            blk->status |= BlkDirty;
     }
 
     DPRINTF(Cache, "Block addr %x moving from state %i to %i\n",
@@ -889,64 +1105,81 @@ Cache<TagStore>::handleFill(PacketPtr pkt, BlkType *blk,
 
 template<class TagStore>
 void
-Cache<TagStore>::doTimingSupplyResponse(PacketPtr req_pkt,
-                                        uint8_t *blk_data,
-                                        bool already_copied)
+Cache<TagStore>::
+doTimingSupplyResponse(PacketPtr req_pkt, uint8_t *blk_data,
+                       bool already_copied, bool pending_inval)
 {
     // timing-mode snoop responses require a new packet, unless we
     // already made a copy...
-    PacketPtr pkt = already_copied ? req_pkt : new Packet(req_pkt, true);
-    if (!req_pkt->isInvalidate()) {
-        // note that we're ignoring the shared flag on req_pkt... it's
-        // basically irrelveant, as we'll always assert shared unless
-        // it's an exclusive request, in which case the shared line
-        // should never be asserted1
-        pkt->assertShared();
-    }
+    PacketPtr pkt = already_copied ? req_pkt : new Packet(req_pkt);
+    assert(req_pkt->isInvalidate() || pkt->sharedAsserted());
     pkt->allocate();
     pkt->makeTimingResponse();
     if (pkt->isRead()) {
         pkt->setDataFromBlock(blk_data, blkSize);
     }
+    if (pkt->cmd == MemCmd::ReadResp && pending_inval) {
+        // Assume we defer a response to a read from a far-away cache
+        // A, then later defer a ReadExcl from a cache B on the same
+        // bus as us.  We'll assert MemInhibit in both cases, but in
+        // the latter case MemInhibit will keep the invalidation from
+        // reaching cache A.  This special response tells cache A that
+        // it gets the block to satisfy its read, but must immediately
+        // invalidate it.
+        pkt->cmd = MemCmd::ReadRespWithInvalidate;
+    }
     memSidePort->respond(pkt, curTick + hitLatency);
 }
 
 template<class TagStore>
 void
 Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk,
-                             bool is_timing, bool is_deferred)
+                             bool is_timing, bool is_deferred,
+                             bool pending_inval)
 {
+    // deferred snoops can only happen in timing mode
+    assert(!(is_deferred && !is_timing));
+    // pending_inval only makes sense on deferred snoops
+    assert(!(pending_inval && !is_deferred));
     assert(pkt->isRequest());
 
-    // first propagate snoop upward to see if anyone above us wants to
-    // handle it.  save & restore packet src since it will get
-    // rewritten to be relative to cpu-side bus (if any)
-    bool alreadyResponded = pkt->memInhibitAsserted();
-    if (is_timing) {
-        Packet *snoopPkt = new Packet(pkt, true);  // clear flags
-        snoopPkt->setExpressSnoop();
-        snoopPkt->senderState = new ForwardResponseRecord(pkt, this);
-        cpuSidePort->sendTiming(snoopPkt);
-        if (snoopPkt->memInhibitAsserted()) {
-            // cache-to-cache response from some upper cache
-            assert(!alreadyResponded);
-            pkt->assertMemInhibit();
+    // the packet may get modified if we or a forwarded snooper
+    // responds in atomic mode, so remember a few things about the
+    // original packet up front
+    bool invalidate = pkt->isInvalidate();
+    bool M5_VAR_USED needs_exclusive = pkt->needsExclusive();
+
+    if (forwardSnoops) {
+        // first propagate snoop upward to see if anyone above us wants to
+        // handle it.  save & restore packet src since it will get
+        // rewritten to be relative to cpu-side bus (if any)
+        bool alreadyResponded = pkt->memInhibitAsserted();
+        if (is_timing) {
+            Packet *snoopPkt = new Packet(pkt, true);  // clear flags
+            snoopPkt->setExpressSnoop();
+            snoopPkt->senderState = new ForwardResponseRecord(pkt, this);
+            cpuSidePort->sendTiming(snoopPkt);
+            if (snoopPkt->memInhibitAsserted()) {
+                // cache-to-cache response from some upper cache
+                assert(!alreadyResponded);
+                pkt->assertMemInhibit();
+            } else {
+                delete snoopPkt->senderState;
+            }
+            if (snoopPkt->sharedAsserted()) {
+                pkt->assertShared();
+            }
+            delete snoopPkt;
         } else {
-            delete snoopPkt->senderState;
-        }
-        if (snoopPkt->sharedAsserted()) {
-            pkt->assertShared();
-        }
-        delete snoopPkt;
-    } else {
-        int origSrc = pkt->getSrc();
-        cpuSidePort->sendAtomic(pkt);
-        if (!alreadyResponded && pkt->memInhibitAsserted()) {
-            // cache-to-cache response from some upper cache:
-            // forward response to original requester
-            assert(pkt->isResponse());
+            int origSrc = pkt->getSrc();
+            cpuSidePort->sendAtomic(pkt);
+            if (!alreadyResponded && pkt->memInhibitAsserted()) {
+                // cache-to-cache response from some upper cache:
+                // forward response to original requester
+                assert(pkt->isResponse());
+            }
+            pkt->setSrc(origSrc);
         }
-        pkt->setSrc(origSrc);
     }
 
     if (!blk || !blk->isValid()) {
@@ -958,10 +1191,9 @@ Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk,
     // and then do it later
     bool respond = blk->isDirty() && pkt->needsResponse();
     bool have_exclusive = blk->isWritable();
-    bool invalidate = pkt->isInvalidate();
 
-    if (pkt->isRead() && !pkt->isInvalidate()) {
-        assert(!pkt->needsExclusive());
+    if (pkt->isRead() && !invalidate) {
+        assert(!needs_exclusive);
         pkt->assertShared();
         int bits_to_clear = BlkWritable;
         const bool haveOwnershipState = true; // for now
@@ -974,6 +1206,10 @@ Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk,
         blk->status &= ~bits_to_clear;
     }
 
+    DPRINTF(Cache, "snooped a %s request for addr %x, %snew state is %i\n",
+            pkt->cmdString(), blockAlign(pkt->getAddr()),
+            respond ? "responding, " : "", invalidate ? 0 : blk->status);
+
     if (respond) {
         assert(!pkt->memInhibitAsserted());
         pkt->assertMemInhibit();
@@ -981,11 +1217,16 @@ Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk,
             pkt->setSupplyExclusive();
         }
         if (is_timing) {
-            doTimingSupplyResponse(pkt, blk->data, is_deferred);
+            doTimingSupplyResponse(pkt, blk->data, is_deferred, pending_inval);
         } else {
             pkt->makeAtomicResponse();
             pkt->setDataFromBlock(blk->data, blkSize);
         }
+    } else if (is_timing && is_deferred) {
+        // if it's a deferred timing snoop then we've made a copy of
+        // the packet, and so if we're not using that copy to respond
+        // then we need to delete it here.
+        delete pkt;
     }
 
     // Do this last in case it deallocates block data or something
@@ -993,10 +1234,6 @@ Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk,
     if (invalidate) {
         tags->invalidateBlk(blk);
     }
-
-    DPRINTF(Cache, "snooped a %s request for addr %x, %snew state is %i\n",
-            pkt->cmdString(), blockAlign(pkt->getAddr()),
-            respond ? "responding, " : "", blk->status);
 }
 
 
@@ -1015,7 +1252,7 @@ Cache<TagStore>::snoopTiming(PacketPtr pkt)
 
     BlkType *blk = tags->findBlock(pkt->getAddr());
 
-    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
+    Addr blk_addr = blockAlign(pkt->getAddr());
     MSHR *mshr = mshrQueue.findMatch(blk_addr);
 
     // Let the MSHR itself track the snoop and decide whether we want
@@ -1035,7 +1272,7 @@ Cache<TagStore>::snoopTiming(PacketPtr pkt)
                 pkt->getAddr());
 
         //Look through writebacks for any non-uncachable writes, use that
-        for (int i=0; i<writebacks.size(); i++) {
+        for (int i = 0; i < writebacks.size(); i++) {
             mshr = writebacks[i];
             assert(!mshr->isUncacheable());
             assert(mshr->getNumTargets() == 1);
@@ -1052,11 +1289,13 @@ Cache<TagStore>::snoopTiming(PacketPtr pkt)
                 // the packet's invalidate flag is set...
                 assert(pkt->isInvalidate());
             }
-            doTimingSupplyResponse(pkt, wb_pkt->getPtr<uint8_t>(), false);
+            doTimingSupplyResponse(pkt, wb_pkt->getPtr<uint8_t>(),
+                                   false, false);
 
             if (pkt->isInvalidate()) {
                 // Invalidation trumps our writeback... discard here
                 markInService(mshr);
+                delete wb_pkt;
             }
 
             // If this was a shared writeback, there may still be
@@ -1068,7 +1307,7 @@ Cache<TagStore>::snoopTiming(PacketPtr pkt)
         }
     }
 
-    handleSnoop(pkt, blk, true, false);
+    handleSnoop(pkt, blk, true, false, false);
 }
 
 
@@ -1083,7 +1322,7 @@ Cache<TagStore>::snoopAtomic(PacketPtr pkt)
     }
 
     BlkType *blk = tags->findBlock(pkt->getAddr());
-    handleSnoop(pkt, blk, false, false);
+    handleSnoop(pkt, blk, false, false, false);
     return hitLatency;
 }
 
@@ -1148,15 +1387,18 @@ Cache<TagStore>::getNextMSHR()
 
     // fall through... no pending requests.  Try a prefetch.
     assert(!miss_mshr && !write_mshr);
-    if (!mshrQueue.isFull()) {
+    if (prefetcher && !mshrQueue.isFull()) {
         // If we have a miss queue slot, we can try a prefetch
         PacketPtr pkt = prefetcher->getPacket();
         if (pkt) {
-            // Update statistic on number of prefetches issued
-            // (hwpf_mshr_misses)
-            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
-            // Don't request bus, since we already have it
-            return allocateMissBuffer(pkt, curTick, false);
+            Addr pf_addr = blockAlign(pkt->getAddr());
+            if (!tags->findBlock(pf_addr) && !mshrQueue.findMatch(pf_addr)) {
+                // Update statistic on number of prefetches issued
+                // (hwpf_mshr_misses)
+                mshr_misses[pkt->cmdToIndex()][0/*pkt->req->threadId()*/]++;
+                // Don't request bus, since we already have it
+                return allocateMissBuffer(pkt, curTick, false);
+            }
         }
     }
 
@@ -1178,7 +1420,17 @@ Cache<TagStore>::getTimingPacket()
     PacketPtr tgt_pkt = mshr->getTarget()->pkt;
     PacketPtr pkt = NULL;
 
-    if (mshr->isSimpleForward()) {
+    if (tgt_pkt->cmd == MemCmd::SCUpgradeFailReq) {
+        // SCUpgradeReq saw invalidation while queued in MSHR, so now
+        // that we are getting around to processing it, just treat it
+        // as if we got a failure response
+        pkt = new Packet(tgt_pkt);
+        pkt->cmd = MemCmd::UpgradeFailResp;
+        pkt->senderState = mshr;
+        pkt->firstWordTime = pkt->finishTime = curTick;
+        handleResponse(pkt);
+        return NULL;
+    } else if (mshr->isForwardNoResponse()) {
         // no response expected, just forward packet as it is
         assert(tags->findBlock(mshr->addr) == NULL);
         pkt = tgt_pkt;
@@ -1186,11 +1438,10 @@ Cache<TagStore>::getTimingPacket()
         BlkType *blk = tags->findBlock(mshr->addr);
         pkt = getBusPacket(tgt_pkt, blk, mshr->needsExclusive());
 
-        mshr->isCacheFill = (pkt != NULL);
+        mshr->isForward = (pkt == NULL);
 
-        if (pkt == NULL) {
+        if (mshr->isForward) {
             // not a cache block request, but a response is expected
-            assert(!mshr->isSimpleForward());
             // make copy of current packet to forward, keep current
             // copy for response handling
             pkt = new Packet(tgt_pkt);
@@ -1207,6 +1458,22 @@ Cache<TagStore>::getTimingPacket()
 }
 
 
+template<class TagStore>
+Tick
+Cache<TagStore>::nextMSHRReadyTime()
+{
+    Tick nextReady = std::min(mshrQueue.nextMSHRReadyTime(),
+                              writeBuffer.nextMSHRReadyTime());
+
+    if (prefetcher) {
+        nextReady = std::min(nextReady,
+                             prefetcher->nextPrefetchReadyTime());
+    }
+
+    return nextReady;
+}
+
+
 ///////////////
 //
 // CpuSidePort
@@ -1218,10 +1485,10 @@ void
 Cache<TagStore>::CpuSidePort::
 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
 {
-    // CPU side port doesn't snoop; it's a target only.
-    bool dummy;
-    otherPort->getPeerAddressRanges(resp, dummy);
+    // CPU side port doesn't snoop; it's a target only.  It can
+    // potentially respond to any address.
     snoop = false;
+    resp.push_back(myCache()->getAddrRange());
 }
 
 
@@ -1253,17 +1520,15 @@ template<class TagStore>
 void
 Cache<TagStore>::CpuSidePort::recvFunctional(PacketPtr pkt)
 {
-    checkFunctional(pkt);
-    if (!pkt->isResponse())
-        myCache()->functionalAccess(pkt, cache->memSidePort);
+    myCache()->functionalAccess(pkt, this, otherPort);
 }
 
 
 template<class TagStore>
 Cache<TagStore>::
-CpuSidePort::CpuSidePort(const std::string &_name,
-                         Cache<TagStore> *_cache)
-    : BaseCache::CachePort(_name, _cache)
+CpuSidePort::CpuSidePort(const std::string &_name, Cache<TagStore> *_cache,
+                         const std::string &_label)
+    : BaseCache::CachePort(_name, _cache, _label)
 {
 }
 
@@ -1278,9 +1543,9 @@ void
 Cache<TagStore>::MemSidePort::
 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
 {
-    otherPort->getPeerAddressRanges(resp, snoop);
-    // Memory-side port always snoops, so unconditionally set flag for
-    // caller.
+    // Memory-side port always snoops, but never passes requests
+    // through to targets on the cpu side (so we don't add anything to
+    // the address range list).
     snoop = true;
 }
 
@@ -1327,9 +1592,7 @@ template<class TagStore>
 void
 Cache<TagStore>::MemSidePort::recvFunctional(PacketPtr pkt)
 {
-    checkFunctional(pkt);
-    if (!pkt->isResponse())
-        myCache()->functionalAccess(pkt, cache->cpuSidePort);
+    myCache()->functionalAccess(pkt, this, otherPort);
 }
 
 
@@ -1360,18 +1623,15 @@ Cache<TagStore>::MemSidePort::sendPacket()
             MSHR *mshr = dynamic_cast<MSHR*>(pkt->senderState);
 
             bool success = sendTiming(pkt);
-            DPRINTF(CachePort,
-                    "Address %x was %s in sending the timing request\n",
-                    pkt->getAddr(), success ? "successful" : "unsuccessful");
 
             waitingOnRetry = !success;
             if (waitingOnRetry) {
                 DPRINTF(CachePort, "now waiting on a retry\n");
-                if (!mshr->isSimpleForward()) {
+                if (!mshr->isForwardNoResponse()) {
                     delete pkt;
                 }
             } else {
-                myCache()->markInService(mshr);
+                myCache()->markInService(mshr, pkt);
             }
         }
     }
@@ -1385,10 +1645,10 @@ Cache<TagStore>::MemSidePort::sendPacket()
         // @TODO: need to factor in prefetch requests here somehow
         if (nextReady != MaxTick) {
             DPRINTF(CachePort, "more packets to send @ %d\n", nextReady);
-            sendEvent->schedule(std::max(nextReady, curTick + 1));
+            schedule(sendEvent, std::max(nextReady, curTick + 1));
         } else {
             // no more to send right now: if we're draining, we may be done
-            if (drainEvent) {
+            if (drainEvent && !sendEvent->scheduled()) {
                 drainEvent->process();
                 drainEvent = NULL;
             }
@@ -1416,8 +1676,9 @@ Cache<TagStore>::MemSidePort::processSendEvent()
 
 template<class TagStore>
 Cache<TagStore>::
-MemSidePort::MemSidePort(const std::string &_name, Cache<TagStore> *_cache)
-    : BaseCache::CachePort(_name, _cache)
+MemSidePort::MemSidePort(const std::string &_name, Cache<TagStore> *_cache,
+                         const std::string &_label)
+    : BaseCache::CachePort(_name, _cache, _label)
 {
     // override default send event from SimpleTimingPort
     delete sendEvent;