ruby: message buffers: significant changes

[gem5.git] / src / mem / protocol / MOESI_hammer-cache.sm
diff --git a/src/mem/protocol/MOESI_hammer-cache.sm b/src/mem/protocol/MOESI_hammer-cache.sm

index 24f3ab318c26dea6c75f260a5289810c9d6be508..de502e118599f57a0919be2757e8f8619099e293 100644 (file)
--- a/src/mem/protocol/MOESI_hammer-cache.sm
+++ b/src/mem/protocol/MOESI_hammer-cache.sm
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
+ * Copyright (c) 1999-2013 Mark D. Hill and David A. Wood
   * Copyright (c) 2009 Advanced Micro Devices, Inc.
   * All rights reserved.
   *
@@ -26,33 +26,37 @@
   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   *
- * AMD's contributions to the MOESI hammer protocol do not constitute an 
+ * AMD's contributions to the MOESI hammer protocol do not constitute an
   * endorsement of its similarity to any AMD products.
   *
   * Authors: Milo Martin
   *          Brad Beckmann
   */
  
-machine(L1Cache, "AMD Hammer-like protocol") 
-: Sequencer * sequencer,
-  CacheMemory * L1IcacheMemory,
-  CacheMemory * L1DcacheMemory,
-  CacheMemory * L2cacheMemory,
-  int cache_response_latency = 10,
-  int issue_latency = 2,
-  int l2_cache_hit_latency = 10,
-  bool no_mig_atomic = true
+machine({L1Cache, L2Cache}, "AMD Hammer-like protocol")
+    : Sequencer * sequencer;
+      CacheMemory * L1Icache;
+      CacheMemory * L1Dcache;
+      CacheMemory * L2cache;
+      Cycles cache_response_latency := 10;
+      Cycles issue_latency := 2;
+      Cycles l2_cache_hit_latency := 10;
+      bool no_mig_atomic := "True";
+      bool send_evictions;
+
+      // NETWORK BUFFERS
+      MessageBuffer * requestFromCache, network="To", virtual_network="2",
+            ordered="false", vnet_type="request";
+      MessageBuffer * responseFromCache, network="To", virtual_network="4",
+            ordered="false", vnet_type="response";
+      MessageBuffer * unblockFromCache, network="To", virtual_network="5",
+            ordered="false", vnet_type="unblock";
+
+      MessageBuffer * forwardToCache, network="From", virtual_network="3",
+            ordered="false", vnet_type="forward";
+      MessageBuffer * responseToCache, network="From", virtual_network="4",
+            ordered="false", vnet_type="response";
  {
-
-  // NETWORK BUFFERS
-  MessageBuffer requestFromCache, network="To", virtual_network="2", ordered="false", vnet_type="request";
-  MessageBuffer responseFromCache, network="To", virtual_network="4", ordered="false", vnet_type="response";
-  MessageBuffer unblockFromCache, network="To", virtual_network="5", ordered="false", vnet_type="unblock";
-
-  MessageBuffer forwardToCache, network="From", virtual_network="3", ordered="false", vnet_type="forward";
-  MessageBuffer responseToCache, network="From", virtual_network="4", ordered="false", vnet_type="response";
-
-
    // STATES
    state_declaration(State, desc="Cache states", default="L1Cache_State_I") {
      // Base states
@@ -62,6 +66,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
      M, AccessPermission:Read_Only, desc="Modified (dirty)";
      MM, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)";
  
+    // Base states, locked and ready to service the mandatory queue
+    IR, AccessPermission:Invalid, desc="Idle";
+    SR, AccessPermission:Read_Only, desc="Shared";
+    OR, AccessPermission:Read_Only, desc="Owned";
+    MR, AccessPermission:Read_Only, desc="Modified (dirty)";
+    MMR, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)";
+
      // Transient States
      IM, AccessPermission:Busy, "IM", desc="Issued GetX";
      SM, AccessPermission:Read_Only, "SM", desc="Issued GetX, we still have a valid copy of the line";
@@ -153,9 +164,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
      bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks";
      MachineID LastResponder, desc="last machine to send a response for this request";
      MachineID CurOwner,      desc="current owner of the block, used for UnblockS responses";
-    Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
-    Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
-    Time FirstResponseTime, default="0", desc="the time the first response was received";
+
+    Cycles InitialRequestTime, default="Cycles(0)",
+            desc="time the initial requests was sent from the L1Cache";
+    Cycles ForwardRequestTime, default="Cycles(0)",
+            desc="time the dir forwarded the request";
+    Cycles FirstResponseTime, default="Cycles(0)",
+            desc="the time the first response was received";
    }
  
    structure(TBETable, external="yes") {
@@ -165,7 +180,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
      bool isPresent(Address);
    }
  
-  TBETable TBEs, template_hack="<L1Cache_TBE>";
+  TBETable TBEs, template="<L1Cache_TBE>", constructor="m_number_of_TBEs";
  
    void set_cache_entry(AbstractCacheEntry b);
    void unset_cache_entry();
@@ -173,34 +188,49 @@ machine(L1Cache, "AMD Hammer-like protocol")
    void unset_tbe();
    void wakeUpAllBuffers();
    void wakeUpBuffers(Address a);
+  Cycles curCycle();
  
    Entry getCacheEntry(Address address), return_by_pointer="yes" {
-    Entry L2cache_entry := static_cast(Entry, "pointer", L2cacheMemory.lookup(address));
+    Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
      if(is_valid(L2cache_entry)) {
        return L2cache_entry;
      }
  
-    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1DcacheMemory.lookup(address));
+    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1Dcache.lookup(address));
      if(is_valid(L1Dcache_entry)) {
        return L1Dcache_entry;
      }
  
-    Entry L1Icache_entry := static_cast(Entry, "pointer", L1IcacheMemory.lookup(address));
+    Entry L1Icache_entry := static_cast(Entry, "pointer", L1Icache.lookup(address));
      return L1Icache_entry;
    }
  
+  DataBlock getDataBlock(Address addr), return_by_ref="yes" {
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+        return cache_entry.DataBlk;
+    }
+    // No valid cache entry: fall back to the copy held in the TBE, if any.
+    TBE tbe := TBEs[addr];
+    if(is_valid(tbe)) {
+      return tbe.DataBlk;
+    }
+    // Neither a cache entry nor a TBE is valid for addr; raise an error.
+    error("Missing data block");
+  }
+
    Entry getL2CacheEntry(Address address), return_by_pointer="yes" {
-    Entry L2cache_entry := static_cast(Entry, "pointer", L2cacheMemory.lookup(address));
+    Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
      return L2cache_entry;
    }
  
    Entry getL1DCacheEntry(Address address), return_by_pointer="yes" {
-    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1DcacheMemory.lookup(address));
+    Entry L1Dcache_entry := static_cast(Entry, "pointer", L1Dcache.lookup(address));
      return L1Dcache_entry;
    }
  
    Entry getL1ICacheEntry(Address address), return_by_pointer="yes" {
-    Entry L1Icache_entry := static_cast(Entry, "pointer", L1IcacheMemory.lookup(address));
+    Entry L1Icache_entry := static_cast(Entry, "pointer", L1Icache.lookup(address));
      return L1Icache_entry;
    }
  
@@ -214,9 +244,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    void setState(TBE tbe, Entry cache_entry, Address addr, State state) {
-    assert((L1DcacheMemory.isTagPresent(addr) && L1IcacheMemory.isTagPresent(addr)) == false);
-    assert((L1IcacheMemory.isTagPresent(addr) && L2cacheMemory.isTagPresent(addr)) == false);
-    assert((L1DcacheMemory.isTagPresent(addr) && L2cacheMemory.isTagPresent(addr)) == false);
+    assert((L1Dcache.isTagPresent(addr) && L1Icache.isTagPresent(addr)) == false);
+    assert((L1Icache.isTagPresent(addr) && L2cache.isTagPresent(addr)) == false);
+    assert((L1Dcache.isTagPresent(addr) && L2cache.isTagPresent(addr)) == false);
  
      if (is_valid(tbe)) {
        tbe.TBEState := state;
@@ -227,6 +257,26 @@ machine(L1Cache, "AMD Hammer-like protocol")
      }
    }
  
+  AccessPermission getAccessPermission(Address addr) {
+    TBE tbe := TBEs[addr];
+    if(is_valid(tbe)) {
+      return L1Cache_State_to_permission(tbe.TBEState);
+    }
+    // No valid TBE for addr: use the state stored in the cache entry instead.
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return L1Cache_State_to_permission(cache_entry.CacheState);
+    }
+    // Neither a TBE nor a cache entry is valid for addr.
+    return AccessPermission:NotPresent;
+  }
+  // Set cache_entry's permission from `state`; does nothing if it is invalid.
+  void setAccessPermission(Entry cache_entry, Address addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(L1Cache_State_to_permission(state));
+    }
+  }
+
    Event mandatory_request_type_to_event(RubyRequestType type) {
      if (type == RubyRequestType:LD) {
        return Event:Load;
@@ -241,24 +291,12 @@ machine(L1Cache, "AMD Hammer-like protocol")
      }
    }
  
-  GenericMachineType getNondirectHitMachType(Address addr, MachineID sender) {
-    if (machineIDToMachineType(sender) == MachineType:L1Cache) {
-      //
-      // NOTE direct local hits should not call this
-      //
-      return GenericMachineType:L1Cache_wCC; 
-    } else {
-      return ConvertMachToGenericMach(machineIDToMachineType(sender));
-    }
-  }
-
-  GenericMachineType testAndClearLocalHit(Entry cache_entry) {
+  MachineType testAndClearLocalHit(Entry cache_entry) {
      if (is_valid(cache_entry) && cache_entry.FromL2) {
        cache_entry.FromL2 := false;
-      return GenericMachineType:L2Cache;
-    } else {
-      return GenericMachineType:L1Cache; 
+      return MachineType:L2Cache;
      }
+    return MachineType:L1Cache;
    }
  
    bool IsAtomicAccessed(Entry cache_entry) {
@@ -282,15 +320,15 @@ machine(L1Cache, "AMD Hammer-like protocol")
      if (triggerQueue_in.isReady()) {
        peek(triggerQueue_in, TriggerMsg) {
  
-        Entry cache_entry := getCacheEntry(in_msg.Address);
-        TBE tbe := TBEs[in_msg.Address];
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
  
          if (in_msg.Type == TriggerType:L2_to_L1) {
-          trigger(Event:Complete_L2_to_L1, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Complete_L2_to_L1, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == TriggerType:ALL_ACKS) {
-          trigger(Event:All_acks, in_msg.Address, cache_entry, tbe);
+          trigger(Event:All_acks, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) {
-          trigger(Event:All_acks_no_sharers, in_msg.Address, cache_entry, tbe);
+          trigger(Event:All_acks_no_sharers, in_msg.Addr, cache_entry, tbe);
          } else {
            error("Unexpected message");
          }
@@ -303,21 +341,21 @@ machine(L1Cache, "AMD Hammer-like protocol")
    // Response Network
    in_port(responseToCache_in, ResponseMsg, responseToCache, rank=2) {
      if (responseToCache_in.isReady()) {
-      peek(responseToCache_in, ResponseMsg, block_on="Address") {
+      peek(responseToCache_in, ResponseMsg, block_on="Addr") {
  
-        Entry cache_entry := getCacheEntry(in_msg.Address);
-        TBE tbe := TBEs[in_msg.Address];
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
  
          if (in_msg.Type == CoherenceResponseType:ACK) {
-          trigger(Event:Ack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Ack, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceResponseType:ACK_SHARED) {
-          trigger(Event:Shared_Ack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Shared_Ack, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceResponseType:DATA) {
-          trigger(Event:Data, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Data, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) {
-          trigger(Event:Shared_Data, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Shared_Data, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) {
-          trigger(Event:Exclusive_Data, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Exclusive_Data, in_msg.Addr, cache_entry, tbe);
          } else {
            error("Unexpected message");
          }
@@ -328,37 +366,38 @@ machine(L1Cache, "AMD Hammer-like protocol")
    // Forward Network
    in_port(forwardToCache_in, RequestMsg, forwardToCache, rank=1) {
      if (forwardToCache_in.isReady()) {
-      peek(forwardToCache_in, RequestMsg, block_on="Address") {
+      peek(forwardToCache_in, RequestMsg, block_on="Addr") {
  
-        Entry cache_entry := getCacheEntry(in_msg.Address);
-        TBE tbe := TBEs[in_msg.Address];
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
  
-        if ((in_msg.Type == CoherenceRequestType:GETX) || (in_msg.Type == CoherenceRequestType:GETF)) {
-          trigger(Event:Other_GETX, in_msg.Address, cache_entry, tbe);
+        if ((in_msg.Type == CoherenceRequestType:GETX) ||
+            (in_msg.Type == CoherenceRequestType:GETF)) {
+          trigger(Event:Other_GETX, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:MERGED_GETS) {
-          trigger(Event:Merged_GETS, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Merged_GETS, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:GETS) {
            if (machineCount(MachineType:L1Cache) > 1) {
              if (is_valid(cache_entry)) {
                if (IsAtomicAccessed(cache_entry) && no_mig_atomic) {
-                trigger(Event:Other_GETS_No_Mig, in_msg.Address, cache_entry, tbe);
+                trigger(Event:Other_GETS_No_Mig, in_msg.Addr, cache_entry, tbe);
                } else {
-                trigger(Event:Other_GETS, in_msg.Address, cache_entry, tbe);
+                trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
                }
              } else {
-              trigger(Event:Other_GETS, in_msg.Address, cache_entry, tbe);
+              trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
              }
            } else {
-            trigger(Event:NC_DMA_GETS, in_msg.Address, cache_entry, tbe);
+            trigger(Event:NC_DMA_GETS, in_msg.Addr, cache_entry, tbe);
            }
          } else if (in_msg.Type == CoherenceRequestType:INV) {
-          trigger(Event:Invalidate, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Invalidate, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:WB_ACK) {
-          trigger(Event:Writeback_Ack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Writeback_Ack, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:WB_NACK) {
-          trigger(Event:Writeback_Nack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Writeback_Nack, in_msg.Addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:BLOCK_ACK) {
-          trigger(Event:Block_Ack, in_msg.Address, cache_entry, tbe);
+          trigger(Event:Block_Ack, in_msg.Addr, cache_entry, tbe);
          } else {
            error("Unexpected message");
          }
@@ -381,7 +420,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
            Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
            if (is_valid(L1Icache_entry)) {
-            // The tag matches for the L1, so the L1 fetches the line.  We know it can't be in the L2 due to exclusion
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion
              trigger(mandatory_request_type_to_event(in_msg.Type),
                      in_msg.LineAddress, L1Icache_entry, tbe);
            } else {
@@ -389,18 +429,18 @@ machine(L1Cache, "AMD Hammer-like protocol")
              Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
              if (is_valid(L1Dcache_entry)) {
                // The block is in the wrong L1, try to write it to the L2
-              if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) {
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
                  trigger(Event:L1_to_L2, in_msg.LineAddress, L1Dcache_entry, tbe);
                } else {
-                Address l2_victim_addr := L2cacheMemory.cacheProbe(in_msg.LineAddress);
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
                  trigger(Event:L2_Replacement,
-                        l2_victim_addr, 
+                        l2_victim_addr,
                          getL2CacheEntry(l2_victim_addr),
                          TBEs[l2_victim_addr]);
                }
              }
  
-            if (L1IcacheMemory.cacheAvail(in_msg.LineAddress)) {
+            if (L1Icache.cacheAvail(in_msg.LineAddress)) {
                // L1 does't have the line, but we have space for it in the L1
  
                Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
@@ -415,15 +455,15 @@ machine(L1Cache, "AMD Hammer-like protocol")
                }
              } else {
                // No room in the L1, so we need to make room
-              Address l1i_victim_addr := L1IcacheMemory.cacheProbe(in_msg.LineAddress);
-              if (L2cacheMemory.cacheAvail(l1i_victim_addr)) {
+              Address l1i_victim_addr := L1Icache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1i_victim_addr)) {
                  // The L2 has room, so we move the line from the L1 to the L2
                  trigger(Event:L1_to_L2,
                          l1i_victim_addr,
                          getL1ICacheEntry(l1i_victim_addr),
                          TBEs[l1i_victim_addr]);
                } else {
-                Address l2_victim_addr := L2cacheMemory.cacheProbe(l1i_victim_addr);
+                Address l2_victim_addr := L2cache.cacheProbe(l1i_victim_addr);
                  // The L2 does not have room, so we replace a line from the L2
                  trigger(Event:L2_Replacement,
                          l2_victim_addr,
@@ -437,7 +477,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
            Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
            if (is_valid(L1Dcache_entry)) {
-            // The tag matches for the L1, so the L1 fetches the line.  We know it can't be in the L2 due to exclusion
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion
              trigger(mandatory_request_type_to_event(in_msg.Type),
                      in_msg.LineAddress, L1Dcache_entry, tbe);
            } else {
@@ -446,10 +487,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
              Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
              if (is_valid(L1Icache_entry)) {
                // The block is in the wrong L1, try to write it to the L2
-              if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) {
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
                  trigger(Event:L1_to_L2, in_msg.LineAddress, L1Icache_entry, tbe);
                } else {
-                Address l2_victim_addr := L2cacheMemory.cacheProbe(in_msg.LineAddress);
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
                  trigger(Event:L2_Replacement,
                          l2_victim_addr,
                          getL2CacheEntry(l2_victim_addr),
@@ -457,7 +498,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
                }
              }
  
-            if (L1DcacheMemory.cacheAvail(in_msg.LineAddress)) {
+            if (L1Dcache.cacheAvail(in_msg.LineAddress)) {
                // L1 does't have the line, but we have space for it in the L1
                Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
                if (is_valid(L2cache_entry)) {
@@ -471,15 +512,15 @@ machine(L1Cache, "AMD Hammer-like protocol")
                }
              } else {
                // No room in the L1, so we need to make room
-              Address l1d_victim_addr := L1DcacheMemory.cacheProbe(in_msg.LineAddress);
-              if (L2cacheMemory.cacheAvail(l1d_victim_addr)) {
+              Address l1d_victim_addr := L1Dcache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1d_victim_addr)) {
                  // The L2 has room, so we move the line from the L1 to the L2
                  trigger(Event:L1_to_L2,
                          l1d_victim_addr,
                          getL1DCacheEntry(l1d_victim_addr),
                          TBEs[l1d_victim_addr]);
                } else {
-                Address l2_victim_addr := L2cacheMemory.cacheProbe(l1d_victim_addr);
+                Address l2_victim_addr := L2cache.cacheProbe(l1d_victim_addr);
                  // The L2 does not have room, so we replace a line from the L2
                  trigger(Event:L2_Replacement,
                          l2_victim_addr,
@@ -492,53 +533,76 @@ machine(L1Cache, "AMD Hammer-like protocol")
        }
      }
    }
-  
+
    // ACTIONS
  
    action(a_issueGETS, "a", desc="Issue GETS") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
        assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
        out_msg.Type := CoherenceRequestType:GETS;
        out_msg.Requestor := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
        out_msg.MessageSize := MessageSizeType:Request_Control;
-      out_msg.InitialRequestTime := get_time();
-      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
      }
    }
  
    action(b_issueGETX, "b", desc="Issue GETX") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
        assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
        out_msg.Type := CoherenceRequestType:GETX;
        out_msg.Requestor := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
        out_msg.MessageSize := MessageSizeType:Request_Control;
-      out_msg.InitialRequestTime := get_time();
-      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
      }
    }
  
+  action(b_issueGETXIfMoreThanOne, "bo", desc="Issue GETX if more than one L1 cache") {
+    // Validate the TBE up front: it is written below even when no GETX is sent.
+    assert(is_valid(tbe));
+    if (machineCount(MachineType:L1Cache) > 1) {
+      enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceRequestType:GETX;
+        out_msg.Requestor := machineID;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.InitialRequestTime := curCycle();
+      }
+    }
+    // One from each other cache (n-1) plus the memory (+1)
+    tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+  }
+
    action(bf_issueGETF, "bf", desc="Issue GETF") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
        assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
        out_msg.Type := CoherenceRequestType:GETF;
        out_msg.Requestor := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
        out_msg.MessageSize := MessageSizeType:Request_Control;
-      out_msg.InitialRequestTime := get_time();
-      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
      }
    }
  
    action(c_sendExclusiveData, "c", desc="Send exclusive data from cache to requestor") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -559,9 +623,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(ct_sendExclusiveDataFromTBE, "ct", desc="Send exclusive data from tbe to requestor") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(tbe));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -581,8 +645,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(d_issuePUT, "d", desc="Issue PUT") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
-      out_msg.Address := address;
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
        out_msg.Type := CoherenceRequestType:PUT;
        out_msg.Requestor := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
@@ -591,8 +655,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(df_issuePUTF, "df", desc="Issue PUTF") {
-    enqueue(requestNetwork_out, RequestMsg, latency=issue_latency) {
-      out_msg.Address := address;
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
        out_msg.Type := CoherenceRequestType:PUTF;
        out_msg.Requestor := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
@@ -602,9 +666,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(e_sendData, "e", desc="Send data from cache to requestor") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -625,9 +689,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(ee_sendDataShared, "\e", desc="Send data from cache to requestor, remaining the owner") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_SHARED;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -649,9 +713,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(et_sendDataSharedFromTBE, "\et", desc="Send data from TBE to requestor, keep a shared copy") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(tbe));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_SHARED;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -673,9 +737,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors, still the owner") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(cache_entry));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_SHARED;
          out_msg.Sender := machineID;
          out_msg.Destination := in_msg.MergedRequestors;
@@ -690,12 +754,12 @@ machine(L1Cache, "AMD Hammer-like protocol")
        }
      }
    }
-  
+
    action(emt_sendDataSharedMultipleFromTBE, "emt", desc="Send data from tbe to all requestors") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(tbe));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_SHARED;
          out_msg.Sender := machineID;
          out_msg.Destination := in_msg.MergedRequestors;
@@ -713,8 +777,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(f_sendAck, "f", desc="Send ack from cache to requestor") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
-        out_msg.Address := address;
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:ACK;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -730,8 +794,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(ff_sendAckShared, "\f", desc="Send shared ack from cache to requestor") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
-        out_msg.Address := address;
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:ACK_SHARED;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -746,8 +810,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(g_sendUnblock, "g", desc="Send unblock to memory") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
-      out_msg.Address := address;
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
        out_msg.Type := CoherenceResponseType:UNBLOCK;
        out_msg.Sender := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
@@ -756,8 +820,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
-      out_msg.Address := address;
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
        out_msg.Type := CoherenceResponseType:UNBLOCKM;
        out_msg.Sender := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
@@ -766,9 +830,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
        assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
        out_msg.Type := CoherenceResponseType:UNBLOCKS;
        out_msg.Sender := machineID;
        out_msg.CurOwner := tbe.CurOwner;
@@ -780,8 +844,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
    action(h_load_hit, "h", desc="Notify sequencer the load completed.") {
      assert(is_valid(cache_entry));
      DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
-    sequencer.readCallback(address, testAndClearLocalHit(cache_entry),
-                           cache_entry.DataBlk);
+    sequencer.readCallback(address, cache_entry.DataBlk, false,
+                           testAndClearLocalHit(cache_entry));
    }
  
    action(hx_external_load_hit, "hx", desc="load required external msgs") {
@@ -790,12 +854,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
      DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
      peek(responseToCache_in, ResponseMsg) {
  
-      sequencer.readCallback(address, 
-                             getNondirectHitMachType(in_msg.Address, in_msg.Sender),
-                             cache_entry.DataBlk,
-                             tbe.InitialRequestTime,
-                             tbe.ForwardRequestTime,
-                             tbe.FirstResponseTime);
+      sequencer.readCallback(address, cache_entry.DataBlk, true,
+                 machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+                 tbe.ForwardRequestTime, tbe.FirstResponseTime);
      }
    }
  
@@ -803,8 +864,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
      assert(is_valid(cache_entry));
      DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
      peek(mandatoryQueue_in, RubyRequest) {
-      sequencer.writeCallback(address, testAndClearLocalHit(cache_entry),
-                              cache_entry.DataBlk);
+      sequencer.writeCallback(address, cache_entry.DataBlk, false,
+                              testAndClearLocalHit(cache_entry));
  
        cache_entry.Dirty := true;
        if (in_msg.Type == RubyRequestType:ATOMIC) {
@@ -816,7 +877,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
    action(hh_flush_hit, "\hf", desc="Notify sequencer that flush completed.") {
      assert(is_valid(tbe));
      DPRINTF(RubySlicc, "%s\n", tbe.DataBlk);
-    sequencer.writeCallback(address, GenericMachineType:L1Cache,tbe.DataBlk);
+    sequencer.writeCallback(address, tbe.DataBlk, false, MachineType:L1Cache);
    }
  
    action(sx_external_store_hit, "sx", desc="store required external msgs.") {
@@ -825,13 +886,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
      DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
      peek(responseToCache_in, ResponseMsg) {
  
-      sequencer.writeCallback(address, 
-                              getNondirectHitMachType(address, in_msg.Sender),
-                              cache_entry.DataBlk,
-                              tbe.InitialRequestTime,
-                              tbe.ForwardRequestTime,
-                              tbe.FirstResponseTime);
+      sequencer.writeCallback(address, cache_entry.DataBlk, true,
+              machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+              tbe.ForwardRequestTime, tbe.FirstResponseTime);
      }
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
      cache_entry.Dirty := true;
    }
  
@@ -840,12 +899,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
      assert(is_valid(tbe));
      DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
  
-    sequencer.writeCallback(address, 
-                            getNondirectHitMachType(address, tbe.LastResponder),
-                            cache_entry.DataBlk,
-                            tbe.InitialRequestTime,
-                            tbe.ForwardRequestTime,
-                            tbe.FirstResponseTime);
+    sequencer.writeCallback(address, cache_entry.DataBlk, true,
+            machineIDToMachineType(tbe.LastResponder), tbe.InitialRequestTime,
+            tbe.ForwardRequestTime, tbe.FirstResponseTime);
  
      cache_entry.Dirty := true;
    }
@@ -897,7 +953,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") {
      peek(responseToCache_in, ResponseMsg) {
-      assert(in_msg.Acks > 0);
+      assert(in_msg.Acks >= 0);
        assert(is_valid(tbe));
        DPRINTF(RubySlicc, "Sender = %s\n", in_msg.Sender);
        DPRINTF(RubySlicc, "SilentAcks = %d\n", in_msg.SilentAcks);
@@ -924,7 +980,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
          tbe.ForwardRequestTime := in_msg.ForwardRequestTime;
        }
        if (tbe.FirstResponseTime == zero_time()) {
-        tbe.FirstResponseTime := get_time();
+        tbe.FirstResponseTime := curCycle();
        }
      }
    }
@@ -940,8 +996,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(ll_L2toL1Transfer, "ll", desc="") {
-    enqueue(triggerQueue_out, TriggerMsg, latency=l2_cache_hit_latency) {
-      out_msg.Address := address;
+    enqueue(triggerQueue_out, TriggerMsg, l2_cache_hit_latency) {
+      out_msg.Addr := address;
        out_msg.Type := TriggerType:L2_to_L1;
      }
    }
@@ -950,7 +1006,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
      assert(is_valid(tbe));
      if (tbe.NumPendingMsgs == 0) {
        enqueue(triggerQueue_out, TriggerMsg) {
-        out_msg.Address := address;
+        out_msg.Addr := address;
          if (tbe.Sharers) {
            out_msg.Type := TriggerType:ALL_ACKS;
          } else {
@@ -973,9 +1029,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
    action(q_sendDataFromTBEToCache, "q", desc="Send data from TBE to cache") {
      peek(forwardToCache_in, RequestMsg) {
          assert(in_msg.Requestor != machineID);
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(tbe));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -998,9 +1054,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
    action(sq_sendSharedDataFromTBEToCache, "sq", desc="Send shared data from TBE to cache, still the owner") {
      peek(forwardToCache_in, RequestMsg) {
          assert(in_msg.Requestor != machineID);
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(tbe));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_SHARED;
          out_msg.Sender := machineID;
          out_msg.Destination.add(in_msg.Requestor);
@@ -1022,9 +1078,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers, still the owner") {
      peek(forwardToCache_in, RequestMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
          assert(is_valid(tbe));
-        out_msg.Address := address;
+        out_msg.Addr := address;
          out_msg.Type := CoherenceResponseType:DATA_SHARED;
          out_msg.Sender := machineID;
          out_msg.Destination := in_msg.MergedRequestors;
@@ -1041,9 +1097,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
        assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
        out_msg.Sender := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
        out_msg.Dirty := tbe.Dirty;
@@ -1055,7 +1111,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
          out_msg.Type := CoherenceResponseType:WB_CLEAN;
          // NOTE: in a real system this would not send data.  We send
          // data here only so we can check it at the memory
-        out_msg.DataBlk := tbe.DataBlk; 
+        out_msg.DataBlk := tbe.DataBlk;
          out_msg.MessageSize := MessageSizeType:Writeback_Control;
        }
      }
@@ -1072,12 +1128,12 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    action(t_sendExclusiveDataFromTBEToMemory, "t", desc="Send exclusive data from TBE to memory") {
-    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
        assert(is_valid(tbe));
-      out_msg.Address := address;
+      out_msg.Addr := address;
        out_msg.Sender := machineID;
        out_msg.Destination.add(map_Address_to_Directory(address));
-      out_msg.DataBlk := tbe.DataBlk; 
+      out_msg.DataBlk := tbe.DataBlk;
        out_msg.Dirty := tbe.Dirty;
        if (tbe.Dirty) {
          out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY;
@@ -1130,54 +1186,77 @@ machine(L1Cache, "AMD Hammer-like protocol")
        tbe.Dirty := in_msg.Dirty || tbe.Dirty;
      }
    }
-  
+
    action(gg_deallocateL1CacheBlock, "\g", desc="Deallocate cache block.  Sets the cache to invalid, allowing a replacement in parallel with a fetch.") {
-    if (L1DcacheMemory.isTagPresent(address)) {
-      L1DcacheMemory.deallocate(address);
+    if (L1Dcache.isTagPresent(address)) {
+      L1Dcache.deallocate(address);
      } else {
-      L1IcacheMemory.deallocate(address);
+      L1Icache.deallocate(address);
      }
      unset_cache_entry();
    }
-  
+
    action(ii_allocateL1DCacheBlock, "\i", desc="Set L1 D-cache tag equal to tag of block B.") {
      if (is_invalid(cache_entry)) {
-      set_cache_entry(L1DcacheMemory.allocate(address, new Entry));
+      set_cache_entry(L1Dcache.allocate(address, new Entry));
      }
    }
  
    action(jj_allocateL1ICacheBlock, "\j", desc="Set L1 I-cache tag equal to tag of block B.") {
      if (is_invalid(cache_entry)) {
-      set_cache_entry(L1IcacheMemory.allocate(address, new Entry));
+      set_cache_entry(L1Icache.allocate(address, new Entry));
      }
    }
  
    action(vv_allocateL2CacheBlock, "\v", desc="Set L2 cache tag equal to tag of block B.") {
-    set_cache_entry(L2cacheMemory.allocate(address, new Entry));
+    set_cache_entry(L2cache.allocate(address, new Entry));
    }
  
    action(rr_deallocateL2CacheBlock, "\r", desc="Deallocate L2 cache block.  Sets the cache to not present, allowing a replacement in parallel with a fetch.") {
-    L2cacheMemory.deallocate(address);
+    L2cache.deallocate(address);
      unset_cache_entry();
    }
  
-  action(uu_profileMiss, "\u", desc="Profile the demand miss") {
-    peek(mandatoryQueue_in, RubyRequest) {
-      if (L1IcacheMemory.isTagPresent(address)) {
-        L1IcacheMemory.profileMiss(in_msg);
-      } else if (L1DcacheMemory.isTagPresent(address)) {
-        L1DcacheMemory.profileMiss(in_msg);
-      }
-      if (L2cacheMemory.isTagPresent(address) == false) {
-        L2cacheMemory.profileMiss(in_msg);
-      }
+  action(forward_eviction_to_cpu, "\cc", desc="sends eviction information to the processor") {
+    if (send_evictions) {  // does nothing unless the send_evictions machine parameter is set
+      DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+      sequencer.evictionCallback(address);  // hand the address to the sequencer's eviction callback
       }
     }
  
+  action(uu_profileL1DataMiss, "\udm", desc="Profile the demand miss") {
+      ++L1Dcache.demand_misses;  // increments the demand_misses counter of L1Dcache
+  }
+
+  action(uu_profileL1DataHit, "\udh", desc="Profile the demand hits") {
+      ++L1Dcache.demand_hits;  // increments the demand_hits counter of L1Dcache
+  }
+
+  action(uu_profileL1InstMiss, "\uim", desc="Profile the demand miss") {
+      ++L1Icache.demand_misses;  // increments the demand_misses counter of L1Icache
+  }
+
+  action(uu_profileL1InstHit, "\uih", desc="Profile the demand hits") {
+      ++L1Icache.demand_hits;  // increments the demand_hits counter of L1Icache
+  }
+
+  action(uu_profileL2Miss, "\um", desc="Profile the demand miss") {
+      ++L2cache.demand_misses;  // increments the demand_misses counter of L2cache
+  }
+
+  action(uu_profileL2Hit, "\uh", desc="Profile the demand hits ") {
+      ++L2cache.demand_hits;  // increments the demand_hits counter of L2cache
+  }
+
    action(zz_stallAndWaitMandatoryQueue, "\z", desc="Send the head of the mandatory queue to the back of the queue.") {
      stall_and_wait(mandatoryQueue_in, address);    
    }
  
+  action(z_stall, "z", desc="stall") {
+    // Intentionally empty: z_stall is a special action that makes the
+    // transition return a protocol stall, so that the next port is checked.
+  }
+
    action(kd_wakeUpDependents, "kd", desc="wake-up dependents") {
      wakeUpBuffers(address);
    }
@@ -1207,7 +1286,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
      zz_stallAndWaitMandatoryQueue;
    }
  
-  transition({IM, SM, ISM, OM, IS, SS, MM_W, M_W, OI, MI, II, IT, ST, OT, MT, MMT, IM_F, SM_F, ISM_F, OM_F, MM_WF, MI_F, MM_F}, L1_to_L2) {
+  transition({IM, SM, ISM, OM, IS, SS, MM_W, M_W, OI, MI, II, IT, ST, OT, MT, MMT, IM_F, SM_F, ISM_F, OM_F, MM_WF, MI_F, MM_F, IR, SR, OR, MR, MMR}, L1_to_L2) {
      zz_stallAndWaitMandatoryQueue;
    }
  
@@ -1220,7 +1299,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    transition({IT, ST, OT, MT, MMT}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate, Flush_line}) {
-    // stall
+    z_stall;
+  }
+
+  transition({IR, SR, OR, MR, MMR}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate}) {
+    z_stall;  // no next state and no pop action: the triggering message is not consumed in the *R states
     }
  
    // Transitions moving data between the L1 and L2 caches
@@ -1230,7 +1313,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      vv_allocateL2CacheBlock;
      hp_copyFromTBEToL2;
      s_deallocateTBE;
-    ka_wakeUpAllDependents;
    }
  
    transition(I, Trigger_L2_to_L1D, IT) {
@@ -1239,7 +1321,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      ii_allocateL1DCacheBlock;
      nb_copyFromTBEToL1; // Not really needed for state I
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1250,7 +1331,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      ii_allocateL1DCacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1261,7 +1341,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      ii_allocateL1DCacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1272,7 +1351,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      ii_allocateL1DCacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1283,7 +1361,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      ii_allocateL1DCacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1294,7 +1371,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      jj_allocateL1ICacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1305,7 +1381,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      jj_allocateL1ICacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1316,7 +1391,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      jj_allocateL1ICacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1327,7 +1401,6 @@ machine(L1Cache, "AMD Hammer-like protocol")
      jj_allocateL1ICacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
@@ -1338,65 +1411,66 @@ machine(L1Cache, "AMD Hammer-like protocol")
      jj_allocateL1ICacheBlock;
      nb_copyFromTBEToL1;
      s_deallocateTBE;
-    uu_profileMiss;
      zz_stallAndWaitMandatoryQueue;
      ll_L2toL1Transfer;
    }
  
-  transition(IT, Complete_L2_to_L1, I) {
+  transition(IT, Complete_L2_to_L1, IR) {
      j_popTriggerQueue;
      kd_wakeUpDependents;
    }
  
-  transition(ST, Complete_L2_to_L1, S) {
+  transition(ST, Complete_L2_to_L1, SR) {
      j_popTriggerQueue;
      kd_wakeUpDependents;
    }
  
-  transition(OT, Complete_L2_to_L1, O) {
+  transition(OT, Complete_L2_to_L1, OR) {
      j_popTriggerQueue;
      kd_wakeUpDependents;
    }
  
-  transition(MT, Complete_L2_to_L1, M) {
+  transition(MT, Complete_L2_to_L1, MR) {
      j_popTriggerQueue;
      kd_wakeUpDependents;
    }
  
-  transition(MMT, Complete_L2_to_L1, MM) {
+  transition(MMT, Complete_L2_to_L1, MMR) {
      j_popTriggerQueue;
      kd_wakeUpDependents;
    }
  
    // Transitions from Idle
-  transition(I, Load, IS) {
+  transition({I,IR}, Load, IS) {
      ii_allocateL1DCacheBlock;
      i_allocateTBE;
      a_issueGETS;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
      k_popMandatoryQueue;
    }
  
-  transition(I, Ifetch, IS) {
+  transition({I,IR}, Ifetch, IS) {
      jj_allocateL1ICacheBlock;
      i_allocateTBE;
      a_issueGETS;
-    uu_profileMiss;
+    uu_profileL1InstMiss;
+    uu_profileL2Miss;
      k_popMandatoryQueue;
    }
  
-  transition(I, Store, IM) {
+  transition({I,IR}, Store, IM) {
      ii_allocateL1DCacheBlock;
      i_allocateTBE;
      b_issueGETX;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
      k_popMandatoryQueue;
    }
  
-  transition(I, Flush_line, IM_F) {
+  transition({I, IR}, Flush_line, IM_F) {
      it_allocateTBE;
      bf_issueGETF;
-    uu_profileMiss;
      k_popMandatoryQueue;
    }
  
@@ -1411,33 +1485,59 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    // Transitions from Shared
-  transition({S, SM, ISM}, {Load, Ifetch}) {
+  transition({S, SM, ISM}, Load) {
      h_load_hit;
+    uu_profileL1DataHit;
      k_popMandatoryQueue;
    }
  
-  transition(S, Store, SM) {
+  transition({S, SM, ISM}, Ifetch) {  // same as the Load case, but profiled against the L1 I-cache
+    h_load_hit;
+    uu_profileL1InstHit;  // counted as an L1 instruction-cache demand hit
+    k_popMandatoryQueue;
+  }
+
+  transition(SR, Load, S) {  // SR is entered from ST on Complete_L2_to_L1
+    h_load_hit;
+    uu_profileL1DataMiss;  // profiled as an L1 data miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;  // NOTE(review): presumably releases requests stalled on this line; confirm against the action's definition
+  }
+
+  transition(SR, Ifetch, S) {  // SR is entered from ST on Complete_L2_to_L1
+    h_load_hit;
+    uu_profileL1InstMiss;  // profiled as an L1 instruction miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({S,SR}, Store, SM) {
      i_allocateTBE;
      b_issueGETX;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
      k_popMandatoryQueue;
    }
  
-  transition(S, Flush_line, SM_F) {
+  transition({S, SR}, Flush_line, SM_F) {
      i_allocateTBE;
      bf_issueGETF;
-    uu_profileMiss;
+    forward_eviction_to_cpu;
      gg_deallocateL1CacheBlock;
      k_popMandatoryQueue;
    }
  
    transition(S, L2_Replacement, I) {
+    forward_eviction_to_cpu;
      rr_deallocateL2CacheBlock;
      ka_wakeUpAllDependents;
    }
  
    transition(S, {Other_GETX, Invalidate}, I) {
      f_sendAck;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
@@ -1447,23 +1547,48 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    // Transitions from Owned
-  transition({O, OM, SS, MM_W, M_W}, {Load, Ifetch}) {
+  transition({O, OM, SS, MM_W, M_W}, {Load}) {  // load served locally; no state change
+    h_load_hit;
+    uu_profileL1DataHit;  // counted as an L1 data-cache demand hit
+    k_popMandatoryQueue;
+  }
+
+  transition({O, OM, SS, MM_W, M_W}, {Ifetch}) {
      h_load_hit;
+    uu_profileL1InstHit;
      k_popMandatoryQueue;
    }
  
-  transition(O, Store, OM) {
+  transition(OR, Load, O) {  // OR is entered from OT on Complete_L2_to_L1
+    h_load_hit;
+    uu_profileL1DataMiss;  // profiled as an L1 data miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(OR, Ifetch, O) {  // OR is entered from OT on Complete_L2_to_L1
+    h_load_hit;
+    uu_profileL1InstMiss;  // profiled as an L1 instruction miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({O,OR}, Store, OM) {
      i_allocateTBE;
      b_issueGETX;
      p_decrementNumberOfMessagesByOne;
-    uu_profileMiss;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
      k_popMandatoryQueue;
    }
-  transition(O, Flush_line, OM_F) {
+
+  transition({O, OR}, Flush_line, OM_F) {
      i_allocateTBE;
      bf_issueGETF;
      p_decrementNumberOfMessagesByOne;
-    uu_profileMiss;
+    forward_eviction_to_cpu;
      gg_deallocateL1CacheBlock;
      k_popMandatoryQueue;
    }
@@ -1471,12 +1596,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
    transition(O, L2_Replacement, OI) {
      i_allocateTBE;
      d_issuePUT;
+    forward_eviction_to_cpu;
      rr_deallocateL2CacheBlock;
      ka_wakeUpAllDependents;
    }
  
    transition(O, {Other_GETX, Invalidate}, I) {
      e_sendData;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
@@ -1491,20 +1618,53 @@ machine(L1Cache, "AMD Hammer-like protocol")
    }
  
    // Transitions from Modified
-  transition(MM, {Load, Ifetch}) {
+  transition({MM, M}, {Ifetch}) {  // instruction fetch served locally; no state change
+    h_load_hit;
+    uu_profileL1InstHit;  // counted as an L1 instruction-cache demand hit
+    k_popMandatoryQueue;
+  }
+
+  transition({MM, M}, {Load}) {
      h_load_hit;
+    uu_profileL1DataHit;
      k_popMandatoryQueue;
    }
  
    transition(MM, Store) {
      hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MMR, Load, MM) {  // MMR is entered from MMT on Complete_L2_to_L1
+    h_load_hit;
+    uu_profileL1DataMiss;  // profiled as an L1 data miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Ifetch, MM) {  // MMR is entered from MMT on Complete_L2_to_L1
+    h_load_hit;
+    uu_profileL1InstMiss;  // profiled as an L1 instruction miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Store, MM) {  // MMR is entered from MMT on Complete_L2_to_L1
+    hh_store_hit;
+    uu_profileL1DataMiss;  // profiled as an L1 data miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
       k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
     }
  
-  transition({MM, M}, Flush_line, MM_F) {
+  transition({MM, M, MMR, MR}, Flush_line, MM_F) {
      i_allocateTBE;
      bf_issueGETF;
      p_decrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
      gg_deallocateL1CacheBlock;
      k_popMandatoryQueue;
    }
@@ -1518,55 +1678,80 @@ machine(L1Cache, "AMD Hammer-like protocol")
    transition(MM, L2_Replacement, MI) {
      i_allocateTBE;
      d_issuePUT;
+    forward_eviction_to_cpu;
      rr_deallocateL2CacheBlock;
      ka_wakeUpAllDependents;
    }
  
    transition(MM, {Other_GETX, Invalidate}, I) {
      c_sendExclusiveData;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
    transition(MM, Other_GETS, I) {
      c_sendExclusiveData;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
-  
+
    transition(MM, NC_DMA_GETS, O) {
      ee_sendDataShared;
      l_popForwardQueue;
    }
-  
+
    transition(MM, Other_GETS_No_Mig, O) {
      ee_sendDataShared;
      l_popForwardQueue;
    }
-  
+
    transition(MM, Merged_GETS, O) {
      em_sendDataSharedMultiple;
      l_popForwardQueue;
    }
- 
+
    // Transitions from Dirty Exclusive
-  transition(M, {Load, Ifetch}) {
+  transition(M, Store, MM) {  // a store hit moves the block from M to MM
+    hh_store_hit;
+    uu_profileL1DataHit;  // counted as an L1 data-cache demand hit
+    k_popMandatoryQueue;
+  }
+
+  transition(MR, Load, M) {  // MR is entered from MT on Complete_L2_to_L1
       h_load_hit;
+    uu_profileL1DataMiss;  // profiled as an L1 data miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
       k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
     }
  
-  transition(M, Store, MM) {
+  transition(MR, Ifetch, M) {  // MR is entered from MT on Complete_L2_to_L1
+    h_load_hit;
+    uu_profileL1InstMiss;  // profiled as an L1 instruction miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MR, Store, MM) {  // MR is entered from MT on Complete_L2_to_L1; the store moves the block to MM
       hh_store_hit;
+    uu_profileL1DataMiss;  // profiled as an L1 data miss ...
+    uu_profileL2Hit;  // ... that hit in the L2
       k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
     }
  
    transition(M, L2_Replacement, MI) {
      i_allocateTBE;
      d_issuePUT;
+    forward_eviction_to_cpu;
      rr_deallocateL2CacheBlock;
      ka_wakeUpAllDependents;
    }
  
    transition(M, {Other_GETX, Invalidate}, I) {
      c_sendExclusiveData;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
@@ -1600,7 +1785,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    transition(IM, Data, ISM) {
      u_writeDataToCache;
-    m_decrementNumberOfMessages; 
+    m_decrementNumberOfMessages;
      o_checkForCompletion;
      n_popResponseQueue;
    }
@@ -1614,7 +1799,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    transition(IM, Exclusive_Data, MM_W) {
      u_writeDataToCache;
-    m_decrementNumberOfMessages; 
+    m_decrementNumberOfMessages;
      o_checkForCompletion;
      sx_external_store_hit;
      n_popResponseQueue;
@@ -1636,11 +1821,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    transition(SM, {Other_GETX, Invalidate}, IM) {
      f_sendAck;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
    transition(SM_F, {Other_GETX, Invalidate}, IM_F) {
      f_sendAck;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
@@ -1652,7 +1839,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    transition(SM, {Data, Exclusive_Data}, ISM) {
      v_writeDataToCacheVerify;
-    m_decrementNumberOfMessages; 
+    m_decrementNumberOfMessages;
      o_checkForCompletion;
      n_popResponseQueue;
    }
@@ -1690,12 +1877,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
    transition(OM, {Other_GETX, Invalidate}, IM) {
      e_sendData;
      pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
    transition(OM_F, {Other_GETX, Invalidate}, IM_F) {
      q_sendDataFromTBEToCache;
      pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
      l_popForwardQueue;
    }
  
@@ -1745,13 +1934,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
      l_popForwardQueue;
    }
  
-  transition(IS, Ack) {  
+  transition(IS, Ack) {
      m_decrementNumberOfMessages;
      o_checkForCompletion;
      n_popResponseQueue;
    }
  
-  transition(IS, Shared_Ack) {  
+  transition(IS, Shared_Ack) {
      m_decrementNumberOfMessages;
      r_setSharerBit;
      o_checkForCompletion;
@@ -1790,13 +1979,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    // Transitions from SS
  
-  transition(SS, Ack) {  
+  transition(SS, Ack) {
      m_decrementNumberOfMessages;
      o_checkForCompletion;
      n_popResponseQueue;
    }
  
-  transition(SS, Shared_Ack) {  
+  transition(SS, Shared_Ack) {
      m_decrementNumberOfMessages;
      r_setSharerBit;
      o_checkForCompletion;
@@ -1822,10 +2011,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    transition(MM_W, Store) {
      hh_store_hit;
+    uu_profileL1DataHit;
      k_popMandatoryQueue;
    }
  
-  transition({MM_W, MM_WF}, Ack) {  
+  transition({MM_W, MM_WF}, Ack) {
      m_decrementNumberOfMessages;
      o_checkForCompletion;
      n_popResponseQueue;
@@ -1847,10 +2037,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
  
    transition(M_W, Store, MM_W) {
      hh_store_hit;
+    uu_profileL1DataHit;
      k_popMandatoryQueue;
    }
  
-  transition(M_W, Ack) {  
+  transition(M_W, Ack) {
      m_decrementNumberOfMessages;
      o_checkForCompletion;
      n_popResponseQueue;