ruby: message buffers: significant changes
[gem5.git] / src / mem / protocol / MOESI_hammer-cache.sm
index ab2a6acf4c9f8dfc0ade8ae7482f0835c9244e8c..de502e118599f57a0919be2757e8f8619099e293 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
+ * Copyright (c) 1999-2013 Mark D. Hill and David A. Wood
  * Copyright (c) 2009 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * AMD's contributions to the MOESI hammer protocol do not constitute an 
+ * AMD's contributions to the MOESI hammer protocol do not constitute an
  * endorsement of its similarity to any AMD products.
  *
  * Authors: Milo Martin
  *          Brad Beckmann
  */
 
-machine(L1Cache, "AMD Hammer-like protocol") 
-: Sequencer * sequencer,
-  CacheMemory * L1IcacheMemory,
-  CacheMemory * L1DcacheMemory,
-  CacheMemory * L2cacheMemory,
-  int cache_response_latency = 10,
-  int issue_latency = 2,
-  int l2_cache_hit_latency = 10,
-  bool no_mig_atomic = true
+machine({L1Cache, L2Cache}, "AMD Hammer-like protocol")
+    : Sequencer * sequencer;
+      CacheMemory * L1Icache;
+      CacheMemory * L1Dcache;
+      CacheMemory * L2cache;
+      Cycles cache_response_latency := 10;
+      Cycles issue_latency := 2;
+      Cycles l2_cache_hit_latency := 10;
+      bool no_mig_atomic := "True";
+      bool send_evictions;
+
+      // NETWORK BUFFERS
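+      // Message buffers are now machine parameters rather than declarations
+      // inside the machine body.  The virtual-network assignment is
+      // unchanged: requests on vnet 2, forwards on vnet 3, responses on
+      // vnet 4, and unblocks on vnet 5.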
+      MessageBuffer * requestFromCache, network="To", virtual_network="2",
+            ordered="false", vnet_type="request";
+      MessageBuffer * responseFromCache, network="To", virtual_network="4",
+            ordered="false", vnet_type="response";
+      MessageBuffer * unblockFromCache, network="To", virtual_network="5",
+            ordered="false", vnet_type="unblock";
+
+      MessageBuffer * forwardToCache, network="From", virtual_network="3",
+            ordered="false", vnet_type="forward";
+      MessageBuffer * responseToCache, network="From", virtual_network="4",
+            ordered="false", vnet_type="response";
 {
-
-  // NETWORK BUFFERS
-  MessageBuffer requestFromCache, network="To", virtual_network="2", ordered="false";
-  MessageBuffer responseFromCache, network="To", virtual_network="4", ordered="false";
-  MessageBuffer unblockFromCache, network="To", virtual_network="5", ordered="false";
-
-  MessageBuffer forwardToCache, network="From", virtual_network="3", ordered="false";
-  MessageBuffer responseToCache, network="From", virtual_network="4", ordered="false";
-
-
   // STATES
-  enumeration(State, desc="Cache states", default="L1Cache_State_I") {
+  state_declaration(State, desc="Cache states", default="L1Cache_State_I") {
     // Base states
-    I, desc="Idle";
-    S, desc="Shared";
-    O, desc="Owned";
-    M, desc="Modified (dirty)";
-    MM, desc="Modified (dirty and locally modified)";
+    I, AccessPermission:Invalid, desc="Idle";
+    S, AccessPermission:Read_Only, desc="Shared";
+    O, AccessPermission:Read_Only, desc="Owned";
+    M, AccessPermission:Read_Only, desc="Modified (dirty)";
+    MM, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)";
+
+    // Base states, locked and ready to service the mandatory queue
+    IR, AccessPermission:Invalid, desc="Idle";
+    SR, AccessPermission:Read_Only, desc="Shared";
+    OR, AccessPermission:Read_Only, desc="Owned";
+    MR, AccessPermission:Read_Only, desc="Modified (dirty)";
+    MMR, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)";
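+
+    // The *R states are entered when an L2-to-L1 transfer completes (see the
+    // IT/ST/OT/MT/MMT transitions below); they hold the block locked until
+    // the stalled mandatory-queue request is replayed.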
 
     // Transient States
-    IM, "IM", desc="Issued GetX";
-    SM, "SM", desc="Issued GetX, we still have an old copy of the line";
-    OM, "OM", desc="Issued GetX, received data";
-    ISM, "ISM", desc="Issued GetX, received data, waiting for all acks";
-    M_W, "M^W", desc="Issued GetS, received exclusive data";
-    MM_W, "MM^W", desc="Issued GetX, received exclusive data";
-    IS, "IS", desc="Issued GetS";
-    SS, "SS", desc="Issued GetS, received data, waiting for all acks";
-    OI, "OI", desc="Issued PutO, waiting for ack";
-    MI, "MI", desc="Issued PutX, waiting for ack";
-    II, "II", desc="Issued PutX/O, saw Other_GETS or Other_GETX, waiting for ack";
-    IT, "IT", desc="Invalid block transferring to L1";
-    ST, "ST", desc="S block transferring to L1";
-    OT, "OT", desc="O block transferring to L1";
-    MT, "MT", desc="M block transferring to L1";
-    MMT, "MMT", desc="MM block transferring to L1";
+    IM, AccessPermission:Busy, "IM", desc="Issued GetX";
+    SM, AccessPermission:Read_Only, "SM", desc="Issued GetX, we still have a valid copy of the line";
+    OM, AccessPermission:Read_Only, "OM", desc="Issued GetX, received data";
+    ISM, AccessPermission:Read_Only, "ISM", desc="Issued GetX, received valid data, waiting for all acks";
+    M_W, AccessPermission:Read_Only, "M^W", desc="Issued GetS, received exclusive data";
+    MM_W, AccessPermission:Read_Write, "MM^W", desc="Issued GetX, received exclusive data";
+    IS, AccessPermission:Busy, "IS", desc="Issued GetS";
+    SS, AccessPermission:Read_Only, "SS", desc="Issued GetS, received data, waiting for all acks";
+    OI, AccessPermission:Busy, "OI", desc="Issued PutO, waiting for ack";
+    MI, AccessPermission:Busy, "MI", desc="Issued PutX, waiting for ack";
+    II, AccessPermission:Busy, "II", desc="Issued PutX/O, saw Other_GETS or Other_GETX, waiting for ack";
+    IT, AccessPermission:Busy, "IT", desc="Invalid block transferring to L1";
+    ST, AccessPermission:Busy, "ST", desc="S block transferring to L1";
+    OT, AccessPermission:Busy, "OT", desc="O block transferring to L1";
+    MT, AccessPermission:Busy, "MT", desc="M block transferring to L1";
+    MMT, AccessPermission:Busy, "MMT", desc="MM block transferring to L1";
+
+    // Transient States Related to Flushing
+    MI_F, AccessPermission:Busy, "MI_F", desc="Issued PutX due to a Flush, waiting for ack";
+    MM_F, AccessPermission:Busy, "MM_F", desc="Issued GETF due to a Flush, waiting for ack";
+    IM_F, AccessPermission:Busy, "IM_F", desc="Issued GetX due to a Flush";
+    ISM_F, AccessPermission:Read_Only, "ISM_F", desc="Issued GetX, received data, waiting for all acks";
+    SM_F, AccessPermission:Read_Only, "SM_F", desc="Issued GetX, we still have an old copy of the line";
+    OM_F, AccessPermission:Read_Only, "OM_F", desc="Issued GetX, received data";
+    MM_WF, AccessPermission:Busy, "MM_WF", desc="Issued GetX, received exclusive data";
   }
 
   // EVENTS
@@ -113,6 +133,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
     // Triggers
     All_acks,                  desc="Received all required data and message acks";
     All_acks_no_sharers,        desc="Received all acks and no other processor has a shared copy";
+
+    // For Flush
+    Flush_line,                  desc="flush the cache line from all caches";
+    Block_Ack,                   desc="the directory is blocked and ready for the flush";
   }
 
   // TYPES
@@ -140,52 +164,73 @@ machine(L1Cache, "AMD Hammer-like protocol")
     bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks";
     MachineID LastResponder, desc="last machine to send a response for this request";
     MachineID CurOwner,      desc="current owner of the block, used for UnblockS responses";
-    Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
-    Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
-    Time FirstResponseTime, default="0", desc="the time the first response was received";
+
+    Cycles InitialRequestTime, default="Cycles(0)",
+            desc="time the initial requests was sent from the L1Cache";
+    Cycles ForwardRequestTime, default="Cycles(0)",
+            desc="time the dir forwarded the request";
+    Cycles FirstResponseTime, default="Cycles(0)",
+            desc="the time the first response was received";
   }
 
-  external_type(TBETable) {
+  structure(TBETable, external="yes") {
     TBE lookup(Address);
     void allocate(Address);
     void deallocate(Address);
     bool isPresent(Address);
   }
 
-  TBETable TBEs, template_hack="<L1Cache_TBE>";
+  TBETable TBEs, template="<L1Cache_TBE>", constructor="m_number_of_TBEs";
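+  // "template" replaces the old template_hack, and the constructor argument
+  // wires in the machine's m_number_of_TBEs parameter.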
 
   void set_cache_entry(AbstractCacheEntry b);
   void unset_cache_entry();
   void set_tbe(TBE b);
   void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Address a);
+  Cycles curCycle();
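+
+  // curCycle() supersedes the old get_time() interface; the TBE timestamp
+  // fields above are correspondingly typed as Cycles rather than Time.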
 
   Entry getCacheEntry(Address address), return_by_pointer="yes" {
-    Entry L2cache_entry := static_cast(Entry, "pointer", L2cacheMemory.lookup(address));
+    Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
     if(is_valid(L2cache_entry)) {
       return L2cache_entry;
     }
 
-    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1DcacheMemory.lookup(address));
+    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1Dcache.lookup(address));
     if(is_valid(L1Dcache_entry)) {
       return L1Dcache_entry;
     }
 
-    Entry L1Icache_entry := static_cast(Entry, "pointer", L1IcacheMemory.lookup(address));
+    Entry L1Icache_entry := static_cast(Entry, "pointer", L1Icache.lookup(address));
     return L1Icache_entry;
   }
 
+  DataBlock getDataBlock(Address addr), return_by_ref="yes" {
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+        return cache_entry.DataBlk;
+    }
+
+    TBE tbe := TBEs[addr];
+    if(is_valid(tbe)) {
+      return tbe.DataBlk;
+    }
+
+    error("Missing data block");
+  }
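+
+  // getDataBlock() returns whichever copy of the line is live, preferring a
+  // cache entry over an in-flight TBE; it backs Ruby's functional accesses.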
+
   Entry getL2CacheEntry(Address address), return_by_pointer="yes" {
-    Entry L2cache_entry := static_cast(Entry, "pointer", L2cacheMemory.lookup(address));
+    Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
     return L2cache_entry;
   }
 
   Entry getL1DCacheEntry(Address address), return_by_pointer="yes" {
-    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1DcacheMemory.lookup(address));
+    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1Dcache.lookup(address));
     return L1Dcache_entry;
   }
 
   Entry getL1ICacheEntry(Address address), return_by_pointer="yes" {
-    Entry L1Icache_entry := static_cast(Entry, "pointer", L1IcacheMemory.lookup(address));
+    Entry L1Icache_entry := static_cast(Entry, "pointer", L1Icache.lookup(address));
     return L1Icache_entry;
   }
 
@@ -199,9 +244,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   void setState(TBE tbe, Entry cache_entry, Address addr, State state) {
-    assert((L1DcacheMemory.isTagPresent(addr) && L1IcacheMemory.isTagPresent(addr)) == false);
-    assert((L1IcacheMemory.isTagPresent(addr) && L2cacheMemory.isTagPresent(addr)) == false);
-    assert((L1DcacheMemory.isTagPresent(addr) && L2cacheMemory.isTagPresent(addr)) == false);
+    assert((L1Dcache.isTagPresent(addr) && L1Icache.isTagPresent(addr)) == false);
+    assert((L1Icache.isTagPresent(addr) && L2cache.isTagPresent(addr)) == false);
+    assert((L1Dcache.isTagPresent(addr) && L2cache.isTagPresent(addr)) == false);
 
     if (is_valid(tbe)) {
       tbe.TBEState := state;
@@ -209,56 +254,49 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
     if (is_valid(cache_entry)) {
       cache_entry.CacheState := state;
-    
-      // Set permission
-      if ((state == State:MM) || 
-          (state == State:MM_W)) {
-        cache_entry.changePermission(AccessPermission:Read_Write);
-      } else if (state == State:S || 
-                 state == State:O || 
-                 state == State:M || 
-                 state == State:M_W || 
-                 state == State:SM || 
-                 state == State:ISM || 
-                 state == State:OM || 
-                 state == State:SS) {
-        cache_entry.changePermission(AccessPermission:Read_Only);
-      } else {
-        cache_entry.changePermission(AccessPermission:Invalid);
-      }
     }
   }
 
-  Event mandatory_request_type_to_event(CacheRequestType type) {
-    if (type == CacheRequestType:LD) {
-      return Event:Load;
-    } else if (type == CacheRequestType:IFETCH) {
-      return Event:Ifetch;
-    } else if ((type == CacheRequestType:ST) || (type == CacheRequestType:ATOMIC)) {
-      return Event:Store;
-    } else {
-      error("Invalid CacheRequestType");
+  AccessPermission getAccessPermission(Address addr) {
+    TBE tbe := TBEs[addr];
+    if(is_valid(tbe)) {
+      return L1Cache_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return L1Cache_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Address addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(L1Cache_State_to_permission(state));
     }
   }
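+
+  // Both helpers rely on the generated L1Cache_State_to_permission() map
+  // derived from the AccessPermission annotations in the state_declaration
+  // above, replacing the hand-written permission switch removed from
+  // setState().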
 
-  GenericMachineType getNondirectHitMachType(Address addr, MachineID sender) {
-    if (machineIDToMachineType(sender) == MachineType:L1Cache) {
-      //
-      // NOTE direct local hits should not call this
-      //
-      return GenericMachineType:L1Cache_wCC; 
+  Event mandatory_request_type_to_event(RubyRequestType type) {
+    if (type == RubyRequestType:LD) {
+      return Event:Load;
+    } else if (type == RubyRequestType:IFETCH) {
+      return Event:Ifetch;
+    } else if ((type == RubyRequestType:ST) || (type == RubyRequestType:ATOMIC)) {
+      return Event:Store;
+    } else if ((type == RubyRequestType:FLUSH)) {
+      return Event:Flush_line;
     } else {
-      return ConvertMachToGenericMach(machineIDToMachineType(sender));
+      error("Invalid RubyRequestType");
     }
   }
 
-  GenericMachineType testAndClearLocalHit(Entry cache_entry) {
+  MachineType testAndClearLocalHit(Entry cache_entry) {
     if (is_valid(cache_entry) && cache_entry.FromL2) {
       cache_entry.FromL2 := false;
-      return GenericMachineType:L2Cache;
-    } else {
-      return GenericMachineType:L1Cache; 
+      return MachineType:L2Cache;
     }
+    return MachineType:L1Cache;
   }
 
   bool IsAtomicAccessed(Entry cache_entry) {
@@ -282,15 +320,15 @@ machine(L1Cache, "AMD Hammer-like protocol")
     if (triggerQueue_in.isReady()) {
       peek(triggerQueue_in, TriggerMsg) {
 
-        Entry cache_entry := getCacheEntry(in_msg.Address);
-        TBE tbe := TBEs[in_msg.Address];
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
 
         if (in_msg.Type == TriggerType:L2_to_L1) {
-          trigger(Event:Complete_L2_to_L1, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Complete_L2_to_L1, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == TriggerType:ALL_ACKS) {
-          trigger(Event:All_acks, in_msg.Address, cache_entry, tbe);
+          trigger(Event:All_acks, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) {
-          trigger(Event:All_acks_no_sharers, in_msg.Address, cache_entry, tbe);
+          trigger(Event:All_acks_no_sharers, in_msg.Addr, cache_entry, tbe);
         } else {
           error("Unexpected message");
         }
@@ -303,21 +341,21 @@ machine(L1Cache, "AMD Hammer-like protocol")
   // Response Network
   in_port(responseToCache_in, ResponseMsg, responseToCache, rank=2) {
     if (responseToCache_in.isReady()) {
-      peek(responseToCache_in, ResponseMsg, block_on="Address") {
+      peek(responseToCache_in, ResponseMsg, block_on="Addr") {
 
-        Entry cache_entry := getCacheEntry(in_msg.Address);
-        TBE tbe := TBEs[in_msg.Address];
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
 
         if (in_msg.Type == CoherenceResponseType:ACK) {
-          trigger(Event:Ack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Ack, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceResponseType:ACK_SHARED) {
-          trigger(Event:Shared_Ack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Shared_Ack, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceResponseType:DATA) {
-          trigger(Event:Data, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Data, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) {
-          trigger(Event:Shared_Data, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Shared_Data, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) {
-          trigger(Event:Exclusive_Data, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Exclusive_Data, in_msg.Addr, cache_entry, tbe);
         } else {
           error("Unexpected message");
         }
@@ -328,35 +366,38 @@ machine(L1Cache, "AMD Hammer-like protocol")
   // Forward Network
   in_port(forwardToCache_in, RequestMsg, forwardToCache, rank=1) {
     if (forwardToCache_in.isReady()) {
-      peek(forwardToCache_in, RequestMsg, block_on="Address") {
+      peek(forwardToCache_in, RequestMsg, block_on="Addr") {
 
-        Entry cache_entry := getCacheEntry(in_msg.Address);
-        TBE tbe := TBEs[in_msg.Address];
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
 
-        if (in_msg.Type == CoherenceRequestType:GETX) {
-          trigger(Event:Other_GETX, in_msg.Address, cache_entry, tbe);
+        if ((in_msg.Type == CoherenceRequestType:GETX) ||
+            (in_msg.Type == CoherenceRequestType:GETF)) {
+          trigger(Event:Other_GETX, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceRequestType:MERGED_GETS) {
-          trigger(Event:Merged_GETS, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Merged_GETS, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceRequestType:GETS) {
           if (machineCount(MachineType:L1Cache) > 1) {
             if (is_valid(cache_entry)) {
               if (IsAtomicAccessed(cache_entry) && no_mig_atomic) {
-                trigger(Event:Other_GETS_No_Mig, in_msg.Address, cache_entry, tbe);
+                trigger(Event:Other_GETS_No_Mig, in_msg.Addr, cache_entry, tbe);
               } else {
-                trigger(Event:Other_GETS, in_msg.Address, cache_entry, tbe);
+                trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
               }
             } else {
-              trigger(Event:Other_GETS, in_msg.Address, cache_entry, tbe);
+              trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
             }
           } else {
-            trigger(Event:NC_DMA_GETS, in_msg.Address, cache_entry, tbe);
+            trigger(Event:NC_DMA_GETS, in_msg.Addr, cache_entry, tbe);
           }
         } else if (in_msg.Type == CoherenceRequestType:INV) {
-          trigger(Event:Invalidate, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Invalidate, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceRequestType:WB_ACK) {
-          trigger(Event:Writeback_Ack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Writeback_Ack, in_msg.Addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceRequestType:WB_NACK) {
-          trigger(Event:Writeback_Nack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Writeback_Nack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:BLOCK_ACK) {
+          trigger(Event:Block_Ack, in_msg.Addr, cache_entry, tbe);
         } else {
           error("Unexpected message");
         }
@@ -367,19 +408,20 @@ machine(L1Cache, "AMD Hammer-like protocol")
   // Nothing from the request network
 
   // Mandatory Queue
-  in_port(mandatoryQueue_in, CacheMsg, mandatoryQueue, desc="...", rank=0) {
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...", rank=0) {
     if (mandatoryQueue_in.isReady()) {
-      peek(mandatoryQueue_in, CacheMsg, block_on="LineAddress") {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
 
         // Check for data access to blocks in I-cache and ifetchs to blocks in D-cache
         TBE tbe := TBEs[in_msg.LineAddress];
 
-        if (in_msg.Type == CacheRequestType:IFETCH) {
+        if (in_msg.Type == RubyRequestType:IFETCH) {
           // ** INSTRUCTION ACCESS ***
 
           Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
           if (is_valid(L1Icache_entry)) {
-            // The tag matches for the L1, so the L1 fetches the line.  We know it can't be in the L2 due to exclusion
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion
             trigger(mandatory_request_type_to_event(in_msg.Type),
                     in_msg.LineAddress, L1Icache_entry, tbe);
           } else {
@@ -387,17 +429,18 @@ machine(L1Cache, "AMD Hammer-like protocol")
             Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
             if (is_valid(L1Dcache_entry)) {
               // The block is in the wrong L1, try to write it to the L2
-              if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) {
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
                 trigger(Event:L1_to_L2, in_msg.LineAddress, L1Dcache_entry, tbe);
               } else {
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
                 trigger(Event:L2_Replacement,
-                        L2cacheMemory.cacheProbe(in_msg.LineAddress),
-                        getL2CacheEntry(L2cacheMemory.cacheProbe(in_msg.LineAddress)),
-                        TBEs[L2cacheMemory.cacheProbe(in_msg.LineAddress)]);
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
               }
             }
 
-            if (L1IcacheMemory.cacheAvail(in_msg.LineAddress)) {
+            if (L1Icache.cacheAvail(in_msg.LineAddress)) {
             // L1 doesn't have the line, but we have space for it in the L1
 
               Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
@@ -412,18 +455,20 @@ machine(L1Cache, "AMD Hammer-like protocol")
               }
             } else {
               // No room in the L1, so we need to make room
-              if (L2cacheMemory.cacheAvail(L1IcacheMemory.cacheProbe(in_msg.LineAddress))) {
+              Address l1i_victim_addr := L1Icache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1i_victim_addr)) {
                 // The L2 has room, so we move the line from the L1 to the L2
                 trigger(Event:L1_to_L2,
-                        L1IcacheMemory.cacheProbe(in_msg.LineAddress),
-                        getL1ICacheEntry(L1IcacheMemory.cacheProbe(in_msg.LineAddress)),
-                        TBEs[L1IcacheMemory.cacheProbe(in_msg.LineAddress)]);
+                        l1i_victim_addr,
+                        getL1ICacheEntry(l1i_victim_addr),
+                        TBEs[l1i_victim_addr]);
               } else {
+                Address l2_victim_addr := L2cache.cacheProbe(l1i_victim_addr);
                 // The L2 does not have room, so we replace a line from the L2
                 trigger(Event:L2_Replacement,
-                        L2cacheMemory.cacheProbe(L1IcacheMemory.cacheProbe(in_msg.LineAddress)),
-                        getL2CacheEntry(L2cacheMemory.cacheProbe(L1IcacheMemory.cacheProbe(in_msg.LineAddress))),
-                        TBEs[L2cacheMemory.cacheProbe(L1IcacheMemory.cacheProbe(in_msg.LineAddress))]);
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
               }
             }
           }
@@ -432,7 +477,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
           Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
           if (is_valid(L1Dcache_entry)) {
-            // The tag matches for the L1, so the L1 fetches the line.  We know it can't be in the L2 due to exclusion
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion
             trigger(mandatory_request_type_to_event(in_msg.Type),
                     in_msg.LineAddress, L1Dcache_entry, tbe);
           } else {
@@ -441,17 +487,18 @@ machine(L1Cache, "AMD Hammer-like protocol")
             Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
             if (is_valid(L1Icache_entry)) {
               // The block is in the wrong L1, try to write it to the L2
-              if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) {
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
                 trigger(Event:L1_to_L2, in_msg.LineAddress, L1Icache_entry, tbe);
               } else {
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
                 trigger(Event:L2_Replacement,
-                        L2cacheMemory.cacheProbe(in_msg.LineAddress),
-                        getL2CacheEntry(L2cacheMemory.cacheProbe(in_msg.LineAddress)),
-                        TBEs[L2cacheMemory.cacheProbe(in_msg.LineAddress)]);
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
               }
             }
 
-            if (L1DcacheMemory.cacheAvail(in_msg.LineAddress)) {
+            if (L1Dcache.cacheAvail(in_msg.LineAddress)) {
             // L1 doesn't have the line, but we have space for it in the L1
               Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
               if (is_valid(L2cache_entry)) {
@@ -465,18 +512,20 @@ machine(L1Cache, "AMD Hammer-like protocol")
               }
             } else {
               // No room in the L1, so we need to make room
-              if (L2cacheMemory.cacheAvail(L1DcacheMemory.cacheProbe(in_msg.LineAddress))) {
+              Address l1d_victim_addr := L1Dcache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1d_victim_addr)) {
                 // The L2 has room, so we move the line from the L1 to the L2
                 trigger(Event:L1_to_L2,
-                        L1DcacheMemory.cacheProbe(in_msg.LineAddress),
-                        getL1DCacheEntry(L1DcacheMemory.cacheProbe(in_msg.LineAddress)),
-                        TBEs[L1DcacheMemory.cacheProbe(in_msg.LineAddress)]);
+                        l1d_victim_addr,
+                        getL1DCacheEntry(l1d_victim_addr),
+                        TBEs[l1d_victim_addr]);
               } else {
+                Address l2_victim_addr := L2cache.cacheProbe(l1d_victim_addr);
                 // The L2 does not have room, so we replace a line from the L2
                 trigger(Event:L2_Replacement,
-                        L2cacheMemory.cacheProbe(L1DcacheMemory.cacheProbe(in_msg.LineAddress)),
-                        getL2CacheEntry(L2cacheMemory.cacheProbe(L1DcacheMemory.cacheProbe(in_msg.LineAddress))),
-                        TBEs[L2cacheMemory.cacheProbe(L1DcacheMemory.cacheProbe(in_msg.LineAddress))]);
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
               }
             }
           }
@@ -484,40 +533,76 @@ machine(L1Cache, "AMD Hammer-like protocol")
       }
     }
   }
-  
+
   // ACTIONS
 
   action(a_issueGETS, "a", desc="Issue GETS") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
       assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
       out_msg.Type := CoherenceRequestType:GETS;
       out_msg.Requestor := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
       out_msg.MessageSize := MessageSizeType:Request_Control;
-      out_msg.InitialRequestTime := get_time();
-      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
     }
   }
 
   action(b_issueGETX, "b", desc="Issue GETX") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
       assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
       out_msg.Type := CoherenceRequestType:GETX;
       out_msg.Requestor := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
       out_msg.MessageSize := MessageSizeType:Request_Control;
-      out_msg.InitialRequestTime := get_time();
-      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+    }
+  }
+
+  action(b_issueGETXIfMoreThanOne, "bo", desc="Issue GETX") {
+    if (machineCount(MachineType:L1Cache) > 1) {
+      enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceRequestType:GETX;
+        out_msg.Requestor := machineID;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.InitialRequestTime := curCycle();
+      }
+    }
+
+    // One from each other cache (n-1) plus the memory (+1)
+    tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+  }
+
+  action(bf_issueGETF, "bf", desc="Issue GETF") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:GETF;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
     }
   }
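+
+  // GETF is the flush analogue of GETX: the requester expects the same
+  // ack/data responses, but the directory blocks the address (see Block_Ack)
+  // until the flushed line has been written back.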
 
   action(c_sendExclusiveData, "c", desc="Send exclusive data from cache to requestor") {
     peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
         assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
         out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
@@ -536,9 +621,32 @@ machine(L1Cache, "AMD Hammer-like protocol")
     }
   }
 
+  action(ct_sendExclusiveDataFromTBE, "ct", desc="Send exclusive data from tbe to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
   action(d_issuePUT, "d", desc="Issue PUT") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
-      out_msg.Address := address;
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
       out_msg.Type := CoherenceRequestType:PUT;
       out_msg.Requestor := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
@@ -546,11 +654,21 @@ machine(L1Cache, "AMD Hammer-like protocol")
     }
   }
 
+  action(df_issuePUTF, "df", desc="Issue PUTF") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:PUTF;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Writeback_Control;
+    }
+  }
+
   action(e_sendData, "e", desc="Send data from cache to requestor") {
     peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
         assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
         out_msg.Type := CoherenceResponseType:DATA;
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
@@ -569,11 +687,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
     }
   }
 
-  action(ee_sendDataShared, "\e", desc="Send data from cache to requestor, keep a shared copy") {
+  action(ee_sendDataShared, "\e", desc="Send data from cache to requestor while remaining the owner") {
     peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
         assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
         out_msg.Type := CoherenceResponseType:DATA_SHARED;
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
@@ -592,12 +710,36 @@ machine(L1Cache, "AMD Hammer-like protocol")
       }
     }
   }
-  
-  action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors") {
+
+  action(et_sendDataSharedFromTBE, "\et", desc="Send data from TBE to requestor, keep a shared copy") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors, still the owner") {
     peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
         assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
         out_msg.Type := CoherenceResponseType:DATA_SHARED;
         out_msg.Sender := machineID;
         out_msg.Destination := in_msg.MergedRequestors;
@@ -612,11 +754,31 @@ machine(L1Cache, "AMD Hammer-like protocol")
       }
     }
   }
-  
+
+  action(emt_sendDataSharedMultipleFromTBE, "emt", desc="Send data from tbe to all requestors") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination := in_msg.MergedRequestors;
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        out_msg.Acks := machineCount(MachineType:L1Cache);
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
   action(f_sendAck, "f", desc="Send ack from cache to requestor") {
     peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
-        out_msg.Address := address;
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
         out_msg.Type := CoherenceResponseType:ACK;
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
@@ -632,8 +794,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   action(ff_sendAckShared, "\f", desc="Send shared ack from cache to requestor") {
     peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
-        out_msg.Address := address;
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
         out_msg.Type := CoherenceResponseType:ACK_SHARED;
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
@@ -648,8 +810,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   action(g_sendUnblock, "g", desc="Send unblock to memory") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
-      out_msg.Address := address;
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
       out_msg.Type := CoherenceResponseType:UNBLOCK;
       out_msg.Sender := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
@@ -658,8 +820,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
-      out_msg.Address := address;
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
       out_msg.Type := CoherenceResponseType:UNBLOCKM;
       out_msg.Sender := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
@@ -668,9 +830,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
       assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
       out_msg.Type := CoherenceResponseType:UNBLOCKS;
       out_msg.Sender := machineID;
       out_msg.CurOwner := tbe.CurOwner;
@@ -682,8 +844,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
   action(h_load_hit, "h", desc="Notify sequencer the load completed.") {
     assert(is_valid(cache_entry));
     DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
-    sequencer.readCallback(address, testAndClearLocalHit(cache_entry),
-                           cache_entry.DataBlk);
+    sequencer.readCallback(address, cache_entry.DataBlk, false,
+                           testAndClearLocalHit(cache_entry));
   }
 
   action(hx_external_load_hit, "hx", desc="load required external msgs") {
@@ -692,42 +854,43 @@ machine(L1Cache, "AMD Hammer-like protocol")
     DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
     peek(responseToCache_in, ResponseMsg) {
 
-      sequencer.readCallback(address, 
-                             getNondirectHitMachType(in_msg.Address, in_msg.Sender),
-                             cache_entry.DataBlk,
-                             tbe.InitialRequestTime,
-                             tbe.ForwardRequestTime,
-                             tbe.FirstResponseTime);
+      sequencer.readCallback(address, cache_entry.DataBlk, true,
+                 machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+                 tbe.ForwardRequestTime, tbe.FirstResponseTime);
     }
   }
 
   action(hh_store_hit, "\h", desc="Notify sequencer that store completed.") {
     assert(is_valid(cache_entry));
     DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
-    peek(mandatoryQueue_in, CacheMsg) {
-      sequencer.writeCallback(address, testAndClearLocalHit(cache_entry),
-                              cache_entry.DataBlk);
+    peek(mandatoryQueue_in, RubyRequest) {
+      sequencer.writeCallback(address, cache_entry.DataBlk, false,
+                              testAndClearLocalHit(cache_entry));
 
       cache_entry.Dirty := true;
-      if (in_msg.Type == CacheRequestType:ATOMIC) {
+      if (in_msg.Type == RubyRequestType:ATOMIC) {
         cache_entry.AtomicAccessed := true;
       }
     }
   }
 
+  action(hh_flush_hit, "\hf", desc="Notify sequencer that flush completed.") {
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", tbe.DataBlk);
+    sequencer.writeCallback(address, tbe.DataBlk, false, MachineType:L1Cache);
+  }
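+
+  // After a flush the block no longer lives in the cache, so completion is
+  // reported to the sequencer from the TBE's copy of the data.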
+
   action(sx_external_store_hit, "sx", desc="store required external msgs.") {
     assert(is_valid(cache_entry));
     assert(is_valid(tbe));
     DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
     peek(responseToCache_in, ResponseMsg) {
 
-      sequencer.writeCallback(address, 
-                              getNondirectHitMachType(address, in_msg.Sender),
-                              cache_entry.DataBlk,
-                              tbe.InitialRequestTime,
-                              tbe.ForwardRequestTime,
-                              tbe.FirstResponseTime);
+      sequencer.writeCallback(address, cache_entry.DataBlk, true,
+              machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+              tbe.ForwardRequestTime, tbe.FirstResponseTime);
     }
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
     cache_entry.Dirty := true;
   }
 
@@ -736,12 +899,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
     assert(is_valid(tbe));
     DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
 
-    sequencer.writeCallback(address, 
-                            getNondirectHitMachType(address, tbe.LastResponder),
-                            cache_entry.DataBlk,
-                            tbe.InitialRequestTime,
-                            tbe.ForwardRequestTime,
-                            tbe.FirstResponseTime);
+    sequencer.writeCallback(address, cache_entry.DataBlk, true,
+            machineIDToMachineType(tbe.LastResponder), tbe.InitialRequestTime,
+            tbe.ForwardRequestTime, tbe.FirstResponseTime);
 
     cache_entry.Dirty := true;
   }
@@ -756,6 +916,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
     tbe.Sharers := false;
   }
 
+  action(it_allocateTBE, "it", desc="Allocate TBE") {
+    check_allocate(TBEs);
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.Dirty := false;
+    tbe.Sharers := false;
+  }
+
   action(j_popTriggerQueue, "j", desc="Pop trigger queue.") {
     triggerQueue_in.dequeue();
   }
@@ -785,7 +953,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") {
     peek(responseToCache_in, ResponseMsg) {
-      assert(in_msg.Acks > 0);
+      assert(in_msg.Acks >= 0);
       assert(is_valid(tbe));
       DPRINTF(RubySlicc, "Sender = %s\n", in_msg.Sender);
       DPRINTF(RubySlicc, "SilentAcks = %d\n", in_msg.SilentAcks);
@@ -812,7 +980,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
         tbe.ForwardRequestTime := in_msg.ForwardRequestTime;
       }
       if (tbe.FirstResponseTime == zero_time()) {
-        tbe.FirstResponseTime := get_time();
+        tbe.FirstResponseTime := curCycle();
       }
     }
   }
@@ -828,8 +996,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   action(ll_L2toL1Transfer, "ll", desc="") {
-    enqueue(triggerQueue_out, TriggerMsg, latency=l2_cache_hit_latency) {
-      out_msg.Address := address;
+    enqueue(triggerQueue_out, TriggerMsg, l2_cache_hit_latency) {
+      out_msg.Addr := address;
       out_msg.Type := TriggerType:L2_to_L1;
     }
   }
@@ -838,7 +1006,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     assert(is_valid(tbe));
     if (tbe.NumPendingMsgs == 0) {
       enqueue(triggerQueue_out, TriggerMsg) {
-        out_msg.Address := address;
+        out_msg.Addr := address;
         if (tbe.Sharers) {
           out_msg.Type := TriggerType:ALL_ACKS;
         } else {
@@ -861,9 +1029,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
   action(q_sendDataFromTBEToCache, "q", desc="Send data from TBE to cache") {
     peek(forwardToCache_in, RequestMsg) {
         assert(in_msg.Requestor != machineID);
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
         assert(is_valid(tbe));
-        out_msg.Address := address;
+        out_msg.Addr := address;
         out_msg.Type := CoherenceResponseType:DATA;
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
@@ -883,12 +1051,37 @@ machine(L1Cache, "AMD Hammer-like protocol")
     }
   }
 
-  action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers") {
+  action(sq_sendSharedDataFromTBEToCache, "sq", desc="Send shared data from TBE to cache, still the owner") {
     peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+        assert(in_msg.Requestor != machineID);
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
         assert(is_valid(tbe));
-        out_msg.Address := address;
-        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers, still the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
         out_msg.Sender := machineID;
         out_msg.Destination := in_msg.MergedRequestors;
         DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
@@ -904,9 +1097,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
       assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
       out_msg.Sender := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
       out_msg.Dirty := tbe.Dirty;
@@ -918,7 +1111,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Type := CoherenceResponseType:WB_CLEAN;
         // NOTE: in a real system this would not send data.  We send
         // data here only so we can check it at the memory
-        out_msg.DataBlk := tbe.DataBlk; 
+        out_msg.DataBlk := tbe.DataBlk;
         out_msg.MessageSize := MessageSizeType:Writeback_Control;
       }
     }
@@ -935,12 +1128,12 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   action(t_sendExclusiveDataFromTBEToMemory, "t", desc="Send exclusive data from TBE to memory") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
       assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
       out_msg.Sender := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
-      out_msg.DataBlk := tbe.DataBlk; 
+      out_msg.DataBlk := tbe.DataBlk;
       out_msg.Dirty := tbe.Dirty;
       if (tbe.Dirty) {
         out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY;
@@ -964,6 +1157,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
     }
   }
 
+  action(uf_writeDataToCacheTBE, "uf", desc="Write data to TBE") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty;
+    }
+  }
+
   action(v_writeDataToCacheVerify, "v", desc="Write data to cache, assert it was same as before") {
     peek(responseToCache_in, ResponseMsg) {
       assert(is_valid(cache_entry));
@@ -974,60 +1175,94 @@ machine(L1Cache, "AMD Hammer-like protocol")
       cache_entry.Dirty := in_msg.Dirty || cache_entry.Dirty;
     }
   }
-  
+
+  action(vt_writeDataToTBEVerify, "vt", desc="Write data to TBE, assert it was same as before") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      DPRINTF(RubySlicc, "Cached Data Block: %s, Msg Data Block: %s\n",
+              tbe.DataBlk, in_msg.DataBlk);
+      assert(tbe.DataBlk == in_msg.DataBlk);
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty || tbe.Dirty;
+    }
+  }
+
   action(gg_deallocateL1CacheBlock, "\g", desc="Deallocate cache block.  Sets the cache to invalid, allowing a replacement in parallel with a fetch.") {
-    if (L1DcacheMemory.isTagPresent(address)) {
-      L1DcacheMemory.deallocate(address);
+    if (L1Dcache.isTagPresent(address)) {
+      L1Dcache.deallocate(address);
     } else {
-      L1IcacheMemory.deallocate(address);
+      L1Icache.deallocate(address);
     }
     unset_cache_entry();
   }
-  
+
   action(ii_allocateL1DCacheBlock, "\i", desc="Set L1 D-cache tag equal to tag of block B.") {
     if (is_invalid(cache_entry)) {
-      set_cache_entry(L1DcacheMemory.allocate(address, new Entry));
+      set_cache_entry(L1Dcache.allocate(address, new Entry));
     }
   }
 
   action(jj_allocateL1ICacheBlock, "\j", desc="Set L1 I-cache tag equal to tag of block B.") {
     if (is_invalid(cache_entry)) {
-      set_cache_entry(L1IcacheMemory.allocate(address, new Entry));
+      set_cache_entry(L1Icache.allocate(address, new Entry));
     }
   }
 
   action(vv_allocateL2CacheBlock, "\v", desc="Set L2 cache tag equal to tag of block B.") {
-    set_cache_entry(L2cacheMemory.allocate(address, new Entry));
+    set_cache_entry(L2cache.allocate(address, new Entry));
   }
 
   action(rr_deallocateL2CacheBlock, "\r", desc="Deallocate L2 cache block.  Sets the cache to not present, allowing a replacement in parallel with a fetch.") {
-    L2cacheMemory.deallocate(address);
+    L2cache.deallocate(address);
     unset_cache_entry();
   }
 
-  action(uu_profileMiss, "\u", desc="Profile the demand miss") {
-    peek(mandatoryQueue_in, CacheMsg) {
-      if (L1IcacheMemory.isTagPresent(address)) {
-        L1IcacheMemory.profileMiss(in_msg);
-      } else if (L1DcacheMemory.isTagPresent(address)) {
-        L1DcacheMemory.profileMiss(in_msg);
-      }
-      if (L2cacheMemory.isTagPresent(address) == false) {
-        L2cacheMemory.profileMiss(in_msg);
-      }
+  action(forward_eviction_to_cpu, "\cc", desc="sends eviction information to the processor") {
+    if (send_evictions) {
+      DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+      sequencer.evictionCallback(address);
     }
   }
 
+  action(uu_profileL1DataMiss, "\udm", desc="Profile the demand miss") {
+      ++L1Dcache.demand_misses;
+  }
+
+  action(uu_profileL1DataHit, "\udh", desc="Profile the demand hits") {
+      ++L1Dcache.demand_hits;
+  }
+
+  action(uu_profileL1InstMiss, "\uim", desc="Profile the demand miss") {
+      ++L1Icache.demand_misses;
+  }
+
+  action(uu_profileL1InstHit, "\uih", desc="Profile the demand hits") {
+      ++L1Icache.demand_hits;
+  }
+
+  action(uu_profileL2Miss, "\um", desc="Profile the demand miss") {
+      ++L2cache.demand_misses;
+  }
+
+  action(uu_profileL2Hit, "\uh", desc="Profile the demand hits") {
+      ++L2cache.demand_hits;
+  }
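+
+  // Demand hits and misses are now recorded as statistics owned by the
+  // CacheMemory objects themselves, replacing the old profileMiss() calls
+  // (hence uu_profileMiss disappearing from the L2-to-L1 transitions below).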
+
   action(zz_stallAndWaitMandatoryQueue, "\z", desc="Send the head of the mandatory queue to the back of the queue.") {
     stall_and_wait(mandatoryQueue_in, address);    
   }
 
+  action(z_stall, "z", desc="stall") {
+    // do nothing and the special z_stall action will return a protocol stall
+    // so that the next port is checked
+  }
+
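+  // The two wake-up actions below use the wakeUpBuffers()/wakeUpAllBuffers()
+  // helpers declared at the top of the machine; they replace the old
+  // wake_up_dependents()/wake_up_all_dependents() interface and replay
+  // messages parked by stall_and_wait().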
   action(kd_wakeUpDependents, "kd", desc="wake-up dependents") {
-    wake_up_dependents(address);
+    wakeUpBuffers(address);
   }
 
   action(ka_wakeUpAllDependents, "ka", desc="wake-up all dependents") {
-    wake_up_all_dependents();
+    wakeUpAllBuffers();
   }
 
   //*****************************************************
@@ -1035,24 +1270,40 @@ machine(L1Cache, "AMD Hammer-like protocol")
   //*****************************************************
 
   // Transitions for Load/Store/L2_Replacement from transient states
-  transition({IM, SM, ISM, OM, IS, SS, OI, MI, II, IT, ST, OT, MT, MMT}, {Store, L2_Replacement}) {
+  transition({IM, IM_F, MM_WF, SM, SM_F, ISM, ISM_F, OM, OM_F, IS, SS, OI, MI, II, IT, ST, OT, MT, MMT}, {Store, L2_Replacement}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IM, IM_F, MM_WF, SM, SM_F, ISM, ISM_F, OM, OM_F, IS, SS, OI, MI, II}, {Flush_line}) {
     zz_stallAndWaitMandatoryQueue;
   }
 
-  transition({M_W, MM_W}, {L2_Replacement}) {
+  transition({M_W, MM_W}, {L2_Replacement, Flush_line}) {
     zz_stallAndWaitMandatoryQueue;
   }
 
-  transition({IM, IS, OI, MI, II, IT, ST, OT, MT, MMT}, {Load, Ifetch}) {
+  transition({IM, IS, OI, MI, II, IT, ST, OT, MT, MMT, MI_F, MM_F, OM_F, IM_F, ISM_F, SM_F, MM_WF}, {Load, Ifetch}) {
     zz_stallAndWaitMandatoryQueue;
   }
 
-  transition({IM, SM, ISM, OM, IS, SS, MM_W, M_W, OI, MI, II, IT, ST, OT, MT, MMT}, L1_to_L2) {
+  transition({IM, SM, ISM, OM, IS, SS, MM_W, M_W, OI, MI, II, IT, ST, OT, MT, MMT, IM_F, SM_F, ISM_F, OM_F, MM_WF, MI_F, MM_F, IR, SR, OR, MR, MMR}, L1_to_L2) {
     zz_stallAndWaitMandatoryQueue;
   }
 
-  transition({IT, ST, OT, MT, MMT}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate}) {
-    // stall
+  transition({MI_F, MM_F}, {Store}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({MM_F, MI_F}, {Flush_line}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IT, ST, OT, MT, MMT}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate, Flush_line}) {
+    z_stall;
+  }
+
+  transition({IR, SR, OR, MR, MMR}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate}) {
+    z_stall;
   }
 
   // Transitions moving data between the L1 and L2 caches
@@ -1062,7 +1313,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     vv_allocateL2CacheBlock;
     hp_copyFromTBEToL2;
     s_deallocateTBE;
-    ka_wakeUpAllDependents;
   }
 
   transition(I, Trigger_L2_to_L1D, IT) {
@@ -1071,7 +1321,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     ii_allocateL1DCacheBlock;
     nb_copyFromTBEToL1; // Not really needed for state I
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1082,7 +1331,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     ii_allocateL1DCacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1093,7 +1341,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     ii_allocateL1DCacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1104,7 +1351,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     ii_allocateL1DCacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1115,7 +1361,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     ii_allocateL1DCacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1126,7 +1371,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     jj_allocateL1ICacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1137,7 +1381,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     jj_allocateL1ICacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1148,7 +1391,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     jj_allocateL1ICacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1159,7 +1401,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
     jj_allocateL1ICacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
@@ -1170,58 +1411,66 @@ machine(L1Cache, "AMD Hammer-like protocol")
     jj_allocateL1ICacheBlock;
     nb_copyFromTBEToL1;
     s_deallocateTBE;
-    uu_profileMiss;
     zz_stallAndWaitMandatoryQueue;
     ll_L2toL1Transfer;
   }
 
-  transition(IT, Complete_L2_to_L1, I) {
+  transition(IT, Complete_L2_to_L1, IR) {
     j_popTriggerQueue;
     kd_wakeUpDependents;
   }
 
-  transition(ST, Complete_L2_to_L1, S) {
+  transition(ST, Complete_L2_to_L1, SR) {
     j_popTriggerQueue;
     kd_wakeUpDependents;
   }
 
-  transition(OT, Complete_L2_to_L1, O) {
+  transition(OT, Complete_L2_to_L1, OR) {
     j_popTriggerQueue;
     kd_wakeUpDependents;
   }
 
-  transition(MT, Complete_L2_to_L1, M) {
+  transition(MT, Complete_L2_to_L1, MR) {
     j_popTriggerQueue;
     kd_wakeUpDependents;
   }
 
-  transition(MMT, Complete_L2_to_L1, MM) {
+  transition(MMT, Complete_L2_to_L1, MMR) {
     j_popTriggerQueue;
     kd_wakeUpDependents;
   }
 
   // Transitions from Idle
-  transition(I, Load, IS) {
+  transition({I, IR}, Load, IS) {
     ii_allocateL1DCacheBlock;
     i_allocateTBE;
     a_issueGETS;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
     k_popMandatoryQueue;
   }
 
-  transition(I, Ifetch, IS) {
+  transition({I, IR}, Ifetch, IS) {
     jj_allocateL1ICacheBlock;
     i_allocateTBE;
     a_issueGETS;
-    uu_profileMiss;
+    uu_profileL1InstMiss;
+    uu_profileL2Miss;
     k_popMandatoryQueue;
   }
 
-  transition(I, Store, IM) {
+  transition({I, IR}, Store, IM) {
     ii_allocateL1DCacheBlock;
     i_allocateTBE;
     b_issueGETX;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
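+  // A flush runs out of the TBE: GETF collects the line and acks much like a
+  // GETX, and once every ack has arrived, PUTF writes the data back to
+  // memory (see the transitions into MI_F below).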
+  transition({I, IR}, Flush_line, IM_F) {
+    it_allocateTBE;
+    bf_issueGETF;
     k_popMandatoryQueue;
   }
 
@@ -1236,25 +1485,59 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   // Transitions from Shared
-  transition({S, SM, ISM}, {Load, Ifetch}) {
+  transition({S, SM, ISM}, Load) {
+    h_load_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({S, SM, ISM}, Ifetch) {
     h_load_hit;
+    uu_profileL1InstHit;
     k_popMandatoryQueue;
   }
 
-  transition(S, Store, SM) {
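+  // A retry out of a locked R state counts as an L1 miss that hit in the
+  // L2; ka_wakeUpAllDependents then releases any requests queued behind the
+  // L2-to-L1 transfer.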
+  transition(SR, Load, S) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(SR, Ifetch, S) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({S, SR}, Store, SM) {
     i_allocateTBE;
     b_issueGETX;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
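+  // forward_eviction_to_cpu notifies the attached core that it has lost the
+  // block (only when send_evictions is set); it appears below wherever read
+  // permission is given up.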
+  transition({S, SR}, Flush_line, SM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
     k_popMandatoryQueue;
   }
 
   transition(S, L2_Replacement, I) {
+    forward_eviction_to_cpu;
     rr_deallocateL2CacheBlock;
     ka_wakeUpAllDependents;
   }
 
   transition(S, {Other_GETX, Invalidate}, I) {
     f_sendAck;
+    forward_eviction_to_cpu;
     l_popForwardQueue;
   }
 
@@ -1264,28 +1547,63 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   // Transitions from Owned
-  transition({O, OM, SS, MM_W, M_W}, {Load, Ifetch}) {
+  transition({O, OM, SS, MM_W, M_W}, Load) {
     h_load_hit;
+    uu_profileL1DataHit;
     k_popMandatoryQueue;
   }
 
-  transition(O, Store, OM) {
+  transition({O, OM, SS, MM_W, M_W}, Ifetch) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(OR, Load, O) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(OR, Ifetch, O) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({O, OR}, Store, OM) {
     i_allocateTBE;
     b_issueGETX;
     p_decrementNumberOfMessagesByOne;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({O, OR}, Flush_line, OM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    p_decrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
     k_popMandatoryQueue;
   }
 
   transition(O, L2_Replacement, OI) {
     i_allocateTBE;
     d_issuePUT;
+    forward_eviction_to_cpu;
     rr_deallocateL2CacheBlock;
     ka_wakeUpAllDependents;
   }
 
   transition(O, {Other_GETX, Invalidate}, I) {
     e_sendData;
+    forward_eviction_to_cpu;
     l_popForwardQueue;
   }
 
@@ -1300,68 +1618,140 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   // Transitions from Modified
-  transition(MM, {Load, Ifetch}) {
+  transition({MM, M}, Ifetch) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({MM, M}, Load) {
     h_load_hit;
+    uu_profileL1DataHit;
     k_popMandatoryQueue;
   }
 
   transition(MM, Store) {
     hh_store_hit;
+    uu_profileL1DataHit;
     k_popMandatoryQueue;
   }
 
+  transition(MMR, Load, MM) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Ifetch, MM) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({MM, M, MMR, MR}, Flush_line, MM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    p_decrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
+    k_popMandatoryQueue;
+  }
+
+  transition(MM_F, Block_Ack, MI_F) {
+    df_issuePUTF;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
   transition(MM, L2_Replacement, MI) {
     i_allocateTBE;
     d_issuePUT;
+    forward_eviction_to_cpu;
     rr_deallocateL2CacheBlock;
     ka_wakeUpAllDependents;
   }
 
   transition(MM, {Other_GETX, Invalidate}, I) {
     c_sendExclusiveData;
+    forward_eviction_to_cpu;
     l_popForwardQueue;
   }
 
   transition(MM, Other_GETS, I) {
     c_sendExclusiveData;
+    forward_eviction_to_cpu;
     l_popForwardQueue;
   }
-  
-  transition(MM, NC_DMA_GETS) {
-    c_sendExclusiveData;
+
+  transition(MM, NC_DMA_GETS, O) {
+    ee_sendDataShared;
     l_popForwardQueue;
   }
-  
+
   transition(MM, Other_GETS_No_Mig, O) {
     ee_sendDataShared;
     l_popForwardQueue;
   }
-  
+
   transition(MM, Merged_GETS, O) {
     em_sendDataSharedMultiple;
     l_popForwardQueue;
   }
+
   // Transitions from Dirty Exclusive
-  transition(M, {Load, Ifetch}) {
+  transition(M, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MR, Load, M) {
     h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
     k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
   }
 
-  transition(M, Store, MM) {
+  transition(MR, Ifetch, M) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MR, Store, MM) {
     hh_store_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
     k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
   }
 
   transition(M, L2_Replacement, MI) {
     i_allocateTBE;
     d_issuePUT;
+    forward_eviction_to_cpu;
     rr_deallocateL2CacheBlock;
     ka_wakeUpAllDependents;
   }
 
   transition(M, {Other_GETX, Invalidate}, I) {
     c_sendExclusiveData;
+    forward_eviction_to_cpu;
     l_popForwardQueue;
   }
 
@@ -1370,7 +1760,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     l_popForwardQueue;
   }
 
-  transition(M, NC_DMA_GETS) {
+  transition(M, NC_DMA_GETS, O) {
     ee_sendDataShared;
     l_popForwardQueue;
   }
@@ -1382,12 +1772,12 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   // Transitions from IM
 
-  transition(IM, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
+  transition({IM, IM_F}, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
     f_sendAck;
     l_popForwardQueue;
   }
 
-  transition(IM, Ack) {
+  transition({IM, IM_F, MM_F}, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
@@ -1395,32 +1785,53 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   transition(IM, Data, ISM) {
     u_writeDataToCache;
-    m_decrementNumberOfMessages; 
+    m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
   }
 
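+  // The flush variants collect response data into the TBE rather than the
+  // cache (the block is deallocated, or was never present, once a flush
+  // starts), hence the *TBE data actions.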
+  transition(IM_F, Data, ISM_F) {
+    uf_writeDataToCacheTBE;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
   transition(IM, Exclusive_Data, MM_W) {
     u_writeDataToCache;
-    m_decrementNumberOfMessages; 
+    m_decrementNumberOfMessages;
     o_checkForCompletion;
     sx_external_store_hit;
     n_popResponseQueue;
     kd_wakeUpDependents;
   }
 
+  transition(IM_F, Exclusive_Data, MM_WF) {
+    uf_writeDataToCacheTBE;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
   // Transitions from SM
-  transition(SM, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+  transition({SM, SM_F}, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
     ff_sendAckShared;
     l_popForwardQueue;
   }
 
   transition(SM, {Other_GETX, Invalidate}, IM) {
     f_sendAck;
+    forward_eviction_to_cpu;
     l_popForwardQueue;
   }
 
-  transition(SM, Ack) {
+  transition(SM_F, {Other_GETX, Invalidate}, IM_F) {
+    f_sendAck;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition({SM, SM_F}, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
@@ -1428,13 +1839,20 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   transition(SM, {Data, Exclusive_Data}, ISM) {
     v_writeDataToCacheVerify;
-    m_decrementNumberOfMessages; 
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SM_F, {Data, Exclusive_Data}, ISM_F) {
+    vt_writeDataToTBEVerify;
+    m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
   }
 
   // Transitions from ISM
-  transition(ISM, Ack) {
+  transition({ISM, ISM_F}, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
@@ -1448,11 +1866,25 @@ machine(L1Cache, "AMD Hammer-like protocol")
     kd_wakeUpDependents;
   }
 
+  transition(ISM_F, All_acks_no_sharers, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
   // Transitions from OM
 
   transition(OM, {Other_GETX, Invalidate}, IM) {
     e_sendData;
     pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, {Other_GETX, Invalidate}, IM_F) {
+    q_sendDataFromTBEToCache;
+    pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
     l_popForwardQueue;
   }
 
@@ -1466,7 +1898,17 @@ machine(L1Cache, "AMD Hammer-like protocol")
     l_popForwardQueue;
   }
 
-  transition(OM, Ack) {
+  transition(OM_F, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    et_sendDataSharedFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, Merged_GETS) {
+    emt_sendDataSharedMultipleFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition({OM, OM_F}, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
@@ -1480,6 +1922,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
     kd_wakeUpDependents;
   }
 
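+  // With every ack collected, the flush issues its PUTF writeback and waits
+  // in MI_F for the directory's Writeback_Ack.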
+  transition({MM_F, OM_F}, {All_acks, All_acks_no_sharers}, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
   // Transitions from IS
 
   transition(IS, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
@@ -1487,13 +1934,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
     l_popForwardQueue;
   }
 
-  transition(IS, Ack) {  
+  transition(IS, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
   }
 
-  transition(IS, Shared_Ack) {  
+  transition(IS, Shared_Ack) {
     m_decrementNumberOfMessages;
     r_setSharerBit;
     o_checkForCompletion;
@@ -1532,13 +1979,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   // Transitions from SS
 
-  transition(SS, Ack) {  
+  transition(SS, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
   }
 
-  transition(SS, Shared_Ack) {  
+  transition(SS, Shared_Ack) {
     m_decrementNumberOfMessages;
     r_setSharerBit;
     o_checkForCompletion;
@@ -1549,6 +1996,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     gs_sendUnblockS;
     s_deallocateTBE;
     j_popTriggerQueue;
+    kd_wakeUpDependents;
   }
 
   transition(SS, All_acks_no_sharers, S) {
@@ -1556,16 +2004,18 @@ machine(L1Cache, "AMD Hammer-like protocol")
     gs_sendUnblockS;
     s_deallocateTBE;
     j_popTriggerQueue;
+    kd_wakeUpDependents;
   }
 
   // Transitions from MM_W
 
   transition(MM_W, Store) {
     hh_store_hit;
+    uu_profileL1DataHit;
     k_popMandatoryQueue;
   }
 
-  transition(MM_W, Ack) {  
+  transition({MM_W, MM_WF}, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
@@ -1578,14 +2028,20 @@ machine(L1Cache, "AMD Hammer-like protocol")
     kd_wakeUpDependents;
   }
 
+  transition(MM_WF, All_acks_no_sharers, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
   // Transitions from M_W
 
   transition(M_W, Store, MM_W) {
     hh_store_hit;
+    uu_profileL1DataHit;
     k_popMandatoryQueue;
   }
 
-  transition(M_W, Ack) {  
+  transition(M_W, Ack) {
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
@@ -1606,7 +2062,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   transition({OI, MI}, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}, OI) {
-    q_sendDataFromTBEToCache;
+    sq_sendSharedDataFromTBEToCache;
     l_popForwardQueue;
   }
 
@@ -1622,6 +2078,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
     kd_wakeUpDependents;
   }
 
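+  // Writeback_Ack for a flush: reply to the waiting flush request and push
+  // the line from the TBE out to memory.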
+  transition(MI_F, Writeback_Ack, I) {
+    hh_flush_hit;
+    t_sendExclusiveDataFromTBEToMemory;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
   transition(OI, Writeback_Ack, I) {
     qq_sendDataFromTBEToMemory;
     s_deallocateTBE;
@@ -1647,4 +2111,31 @@ machine(L1Cache, "AMD Hammer-like protocol")
     l_popForwardQueue;
     kd_wakeUpDependents;
   }
+
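+  // While its acks are outstanding, a flush in MM_F still owns the line and
+  // services forwarded requests out of the TBE.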
+  transition(MM_F, {Other_GETX, Invalidate}, IM_F) {
+    ct_sendExclusiveDataFromTBE;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Other_GETS, IM_F) {
+    ct_sendExclusiveDataFromTBE;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, NC_DMA_GETS, OM_F) {
+    sq_sendSharedDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Other_GETS_No_Mig, OM_F) {
+    et_sendDataSharedFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Merged_GETS, OM_F) {
+    emt_sendDataSharedMultipleFromTBE;
+    l_popForwardQueue;
+  }
 }