From: Tuan Ta Date: Fri, 4 May 2018 16:14:13 +0000 (-0400) Subject: mem-ruby: GCN3 and VIPER integration X-Git-Tag: v20.1.0.0~566 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=18ebe62598e09771f58d555faaed6e3572cc4d14;p=gem5.git mem-ruby: GCN3 and VIPER integration This patch modifies the Coalescer and VIPER protocol to support memory synchronization requests and write-completion responses that are required by upcoming GCN3 implementation. VIPER protocol is simplified to be a solely write-through protocol. Change-Id: Iccfa3d749a0301172a1cc567c59609bb548dace6 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29913 Reviewed-by: Anthony Gutierrez Reviewed-by: Jason Lowe-Power Reviewed-by: Bradford Beckmann Maintainer: Anthony Gutierrez Maintainer: Bradford Beckmann Tested-by: kokoro --- diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index f8da4abf1..5f05a605b 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -392,14 +392,15 @@ machine(MachineType:TCC, "TCC Cache") action(w_sendResponseWBAck, "w", desc="send WB Ack") { peek(responseFromNB_in, ResponseMsg) { - enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { - out_msg.addr := address; - out_msg.Type := CoherenceResponseType:TDSysWBAck; - out_msg.Destination.clear(); - out_msg.Destination.add(in_msg.WTRequestor); - out_msg.Sender := machineID; - out_msg.MessageSize := MessageSizeType:Writeback_Control; - } + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.instSeqNum := in_msg.instSeqNum; + } } } @@ -412,6 +413,7 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Destination.add(in_msg.Requestor); out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.instSeqNum := in_msg.instSeqNum; } } } @@ -486,6 +488,7 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Dirty := true; out_msg.DataBlk := in_msg.DataBlk; out_msg.writeMask.orMask(in_msg.writeMask); + out_msg.instSeqNum := in_msg.instSeqNum; } } } diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 4047dc689..3f6179159 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -56,9 +56,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { I, AccessPermission:Invalid, desc="Invalid"; V, AccessPermission:Read_Only, desc="Valid"; - W, AccessPermission:Read_Write, desc="Written"; - M, AccessPermission:Read_Write, desc="Written and Valid"; - L, AccessPermission:Read_Write, desc="Local access is modifable"; A, AccessPermission:Invalid, desc="Waiting on Atomic"; } @@ -67,7 +64,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") Load, desc="Load"; Store, desc="Store to L1 (L1 is dirty)"; StoreThrough, desc="Store directly to L2(L1 is clean)"; - StoreLocal, desc="Store to L1 but L1 is clean"; Atomic, desc="Atomic"; Flush, desc="Flush if dirty(wbL1 for Store Release)"; Evict, desc="Evict if clean(invL1 for Load Acquire)"; @@ -264,7 +260,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") if (in_msg.Type == CoherenceResponseType:TDSysResp) { // disable L1 cache if (disableL1) { - 
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); + trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); } else { if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe); @@ -291,18 +287,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") DPRINTF(RubySlicc, "%s\n", in_msg); if (in_msg.Type == RubyRequestType:LD) { trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); - } else if (in_msg.Type == RubyRequestType:ATOMIC) { + } else if (in_msg.Type == RubyRequestType:ATOMIC || + in_msg.Type == RubyRequestType:ATOMIC_RETURN || + in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) { trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe); } else if (in_msg.Type == RubyRequestType:ST) { if(disableL1) { trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); } else { if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { - if (WB) { - trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); - } else { - trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); - } + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); } else { Addr victim := L1cache.cacheProbe(in_msg.LineAddress); trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); @@ -314,16 +308,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); } else { error("Unexpected Request Message from VIC"); - if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { - if (WB) { - trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); - } else { - trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); - } - } else { - Addr victim := L1cache.cacheProbe(in_msg.LineAddress); - trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); - } } } } @@ -415,6 +399,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") out_msg.Type := CoherenceRequestType:WriteThrough; out_msg.InitialRequestTime := curCycle(); out_msg.Shared := false; + + // forward inst sequence number to lower TCC + peek(mandatoryQueue_in, RubyRequest) { + out_msg.instSeqNum := in_msg.instSeqNum; + } } } @@ -475,6 +464,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(ad_atomicDone, "ad", desc="atomic done") { + assert(is_valid(cache_entry)); + coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + action(s_storeDone, "s", desc="local store done") { assert(is_valid(cache_entry)); @@ -491,37 +485,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); assert(false); } else { - coalescer.invCallback(address); - } - } - - action(wb_wbDone, "wb", desc="local wb done") { - if (inFlush == true) { - Fcnt := Fcnt + 1; - if (Fcnt > WTcnt) { - if (use_seq_not_coal) { - DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n"); - assert(false); - } else { - coalescer.wbCallback(address); - } - Fcnt := Fcnt - 1; - } - if (WTcnt == 0 && Fcnt == 0) { - inFlush := false; - APPEND_TRANSITION_COMMENT(" inFlush is false"); - } + coalescer.invTCPCallback(address); } } action(wd_wtDone, "wd", desc="writethrough done") { - WTcnt := WTcnt - 1; - if (inFlush == true) { - Fcnt := Fcnt -1; + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n"); + assert(false); + } else { + peek(responseToTCP_in, ResponseMsg) { + coalescer.writeCompleteCallback(address, in_msg.instSeqNum); + } } - assert(WTcnt >= 
0); - APPEND_TRANSITION_COMMENT("write-- = "); - APPEND_TRANSITION_COMMENT(WTcnt); } action(dw_dirtyWrite, "dw", desc="update write mask"){ @@ -562,77 +538,28 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") // Stalling transitions do NOT check the tag array...and if they do, // they can cause a resource stall deadlock! - transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} { + transition({A}, {Load, Atomic, StoreThrough}) { //TagArrayRead} { z_stall; } - transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} { - l_loadDone; - mru_updateMRU; - p_popMandatoryQueue; - } - transition(I, Load) {TagArrayRead} { n_issueRdBlk; p_popMandatoryQueue; } - transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} { - t_allocateTBE; + transition(V, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; mru_updateMRU; - at_atomicThrough; p_popMandatoryQueue; } - transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} { - wt_writeThrough; + transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} { t_allocateTBE; - at_atomicThrough; - ic_invCache; - } - - transition(W, Load, I) {TagArrayRead, DataArrayRead} { - wt_writeThrough; - norl_issueRdBlkOrloadDone; - p_popMandatoryQueue; - } - - transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - a_allocate; - dw_dirtyWrite; - s_storeDone; - p_popMandatoryQueue; - } - - transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - dw_dirtyWrite; mru_updateMRU; - s_storeDone; - p_popMandatoryQueue; - } - - transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - a_allocate; - dw_dirtyWrite; - s_storeDone; - p_popMandatoryQueue; - } - - transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - dw_dirtyWrite; - mru_updateMRU; - s_storeDone; - p_popMandatoryQueue; - } - - transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - dw_dirtyWrite; - mru_updateMRU; - s_storeDone; + at_atomicThrough; p_popMandatoryQueue; } - //M,W should not see storeThrough transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} { a_allocate; dw_dirtyWrite; @@ -642,7 +569,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } - transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + transition(V, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} { dw_dirtyWrite; s_storeDone; wt_writeThrough; @@ -672,7 +599,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") d_deallocateTBE; a_allocate; w_writeCache; - s_storeDone; + ad_atomicDone; pr_popResponseQueue; ic_invCache; } @@ -683,12 +610,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") pr_popResponseQueue; } - transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { - w_writeCache; - l_loadDone; - pr_popResponseQueue; - } - transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { ic_invCache; } @@ -697,26 +618,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } - transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { - wt_writeThrough; - ic_invCache; - } - - transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { - wt_writeThrough; - ic_invCache; - } - - transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + transition({V, I, A},Flush) {TagArrayFlash} { sf_setFlush; - wt_writeThrough; - ic_invCache; - p_popMandatoryQueue; - } - - transition({V, I, A, L},Flush) {TagArrayFlash} { - sf_setFlush; - 
wb_wbDone; p_popMandatoryQueue; } @@ -726,20 +629,14 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } - transition({W, M}, Evict, W) {TagArrayFlash} { - inv_invDone; - p_popMandatoryQueue; - } - - transition({A, L}, Evict) {TagArrayFlash} { + transition(A, Evict) {TagArrayFlash} { inv_invDone; p_popMandatoryQueue; } // TCC_AckWB only snoops TBE - transition({V, I, A, M, W, L}, TCC_AckWB) { + transition({V, I, A}, TCC_AckWB) { wd_wtDone; - wb_wbDone; pr_popResponseQueue; } } diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm index 124ebbeda..60117482a 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-msg.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm @@ -62,7 +62,8 @@ structure (VIPERCoalescer, external = "yes") { Cycles, Cycles, Cycles); void writeCallback(Addr, MachineType, DataBlock, Cycles, Cycles, Cycles, bool); - void invCallback(Addr); - void wbCallback(Addr); + void atomicCallback(Addr, MachineType, DataBlock); + void invTCPCallback(Addr); + void writeCompleteCallback(Addr, uint64_t); void evictionCallback(Addr); } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 6f788ce5f..efbffbd8c 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -514,6 +514,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.InitialRequestTime := in_msg.InitialRequestTime; out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := curCycle(); + out_msg.instSeqNum := in_msg.instSeqNum; } } } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index f4f50cb32..f0705192d 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -136,6 +136,7 @@ structure(CPURequestMsg, desc="...", interface="Message") { WriteMask writeMask, desc="Write Through Data"; MachineID WTRequestor, desc="Node who initiated the write through"; int wfid, default="0", desc="wavefront id"; + uint64_t instSeqNum, desc="instruction sequence number"; bool NoWriteConflict, default="true", desc="write collided with CAB entry"; int ProgramCounter, desc="PC that accesses to this block"; @@ -188,6 +189,7 @@ structure(TDProbeRequestMsg, desc="...", interface="Message") { MessageSizeType MessageSize, desc="size category of the message"; int Phase, desc="Synchronization Phase"; int wfid, desc="wavefront id for Release"; + uint64_t instSeqNum, desc="instruction sequence number"; MachineID Requestor, desc="Node who initiated the request"; bool functionalRead(Packet *pkt) { @@ -242,6 +244,7 @@ structure(ResponseMsg, desc="...", interface="Message") { bool NoAckNeeded, default="false", desc="For short circuting acks"; bool isValid, default="false", desc="Is acked block valid"; int wfid, default="0", desc="wavefront id"; + uint64_t instSeqNum, desc="instruction sequence number"; int Phase, desc="Synchronization Phase"; int ProgramCounter, desc="PC that issues this request"; @@ -343,6 +346,7 @@ structure(FifoMsg, desc="...", interface="Message") { Addr addr, desc="Address"; FifoType Type, desc="WriteThrough/WriteFlush"; int wfid, default="0",desc="wavefront id"; + uint64_t instSeqNum, desc="instruction sequence number"; MachineID Requestor, desc="Flush Requestor"; MachineID oRequestor, desc="original Flush Requestor"; diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 76c45b9b0..71716f9fe 100644 --- 
a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -150,6 +150,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { WriteMask writeMask, desc="Writethrough mask"; DataBlock WTData, desc="Writethrough data block"; int wfid, desc="Writethrough wavefront"; + uint64_t instSeqNum, desc="Instruction sequence number"; PacketPtr pkt, desc="Packet associated with this request"; } diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index 29bedfa51..b3e239686 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -56,6 +56,7 @@ class RubyRequest : public Message WriteMask m_writeMask; DataBlock m_WTData; int m_wfid; + uint64_t m_instSeqNum; RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len, uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode, @@ -80,7 +81,8 @@ class RubyRequest : public Message RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb, unsigned _proc_id, unsigned _core_id, int _wm_size, std::vector & _wm_mask, - DataBlock & _Data) + DataBlock & _Data, + uint64_t _instSeqNum = 0) : Message(curTime), m_PhysicalAddress(_paddr), m_Type(_type), @@ -93,7 +95,8 @@ class RubyRequest : public Message m_contextId(_core_id), m_writeMask(_wm_size,_wm_mask), m_WTData(_Data), - m_wfid(_proc_id) + m_wfid(_proc_id), + m_instSeqNum(_instSeqNum) { m_LineAddress = makeLineAddress(m_PhysicalAddress); } @@ -104,7 +107,8 @@ class RubyRequest : public Message unsigned _proc_id, unsigned _core_id, int _wm_size, std::vector & _wm_mask, DataBlock & _Data, - std::vector< std::pair > _atomicOps) + std::vector< std::pair > _atomicOps, + uint64_t _instSeqNum = 0) : Message(curTime), m_PhysicalAddress(_paddr), m_Type(_type), @@ -117,7 +121,8 @@ class RubyRequest : public Message m_contextId(_core_id), m_writeMask(_wm_size,_wm_mask,_atomicOps), m_WTData(_Data), - m_wfid(_proc_id) + m_wfid(_proc_id), + m_instSeqNum(_instSeqNum) { m_LineAddress = makeLineAddress(m_PhysicalAddress); } diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 1eecb82ad..d9793fafd 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -506,8 +506,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, } } - - m_outstanding_count--; assert(m_outstanding_count >= 0); @@ -555,25 +553,24 @@ GPUCoalescer::makeRequest(PacketPtr pkt) assert(pkt->req->hasInstSeqNum()); if (pkt->cmd == MemCmd::MemSyncReq) { - // issue mem_sync requests immedidately to the cache system without - // going though uncoalescedTable like normal LD/ST/Atomic requests - issueMemSyncRequest(pkt); - } else { - // otherwise, this must be either read or write command - assert(pkt->isRead() || pkt->isWrite()); - - // the pkt is temporarily stored in the uncoalesced table until - // it's picked for coalescing process later in this cycle or in a - // future cycle - uncoalescedTable.insertPacket(pkt); - DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", - pkt->getAddr()); - - // we schedule an issue event here to process the uncoalesced table - // and try to issue Ruby request to cache system - if (!issueEvent.scheduled()) { - schedule(issueEvent, curTick()); - } + // let the child coalescer handle MemSyncReq because this is + // cache coherence protocol specific + return RequestStatus_Issued; + } + // otherwise, this must be either read or write command + assert(pkt->isRead() || 
pkt->isWrite()); + + // the pkt is temporarily stored in the uncoalesced table until + // it's picked for coalescing process later in this cycle or in a + // future cycle + uncoalescedTable.insertPacket(pkt); + DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", + pkt->getAddr()); + + // we schedule an issue event here to process the uncoalesced table + // and try to issue Ruby request to cache system + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); } // we always return RequestStatus_Issued in this coalescer @@ -582,107 +579,6 @@ GPUCoalescer::makeRequest(PacketPtr pkt) return RequestStatus_Issued; } -/** - * TODO: Figure out what do with this code. This code may go away - * and/or be merged into the VIPER coalescer once the VIPER - * protocol is re-integrated with GCN3 codes. - */ -/* -void -GPUCoalescer::issueRequest(CoalescedRequest* crequest) -{ - PacketPtr pkt = crequest->getFirstPkt(); - - int proc_id = -1; - if (pkt != NULL && pkt->req->hasContextId()) { - proc_id = pkt->req->contextId(); - } - - // If valid, copy the pc to the ruby request - Addr pc = 0; - if (pkt->req->hasPC()) { - pc = pkt->req->getPC(); - } - - // At the moment setting scopes only counts - // for GPU spill space accesses - // which is pkt->req->isStack() - // this scope is REPLACE since it - // does not need to be flushed at the end - // of a kernel Private and local may need - // to be visible at the end of the kernel - HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); - HSAScope accessScope = reqScopeToHSAScope(pkt->req); - - Addr line_addr = makeLineAddress(pkt->getAddr()); - - // Creating WriteMask that records written bytes - // and atomic operations. This enables partial writes - // and partial reads of those writes - DataBlock dataBlock; - dataBlock.clear(); - uint32_t blockSize = RubySystem::getBlockSizeBytes(); - std::vector accessMask(blockSize,false); - std::vector< std::pair > atomicOps; - uint32_t tableSize = crequest->getPackets().size(); - for (int i = 0; i < tableSize; i++) { - PacketPtr tmpPkt = crequest->getPackets()[i]; - uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; - uint32_t tmpSize = tmpPkt->getSize(); - if (tmpPkt->isAtomicOp()) { - std::pair tmpAtomicOp(tmpOffset, - tmpPkt->getAtomicOp()); - atomicOps.push_back(tmpAtomicOp); - } else if (tmpPkt->isWrite()) { - dataBlock.setData(tmpPkt->getPtr(), - tmpOffset, tmpSize); - } - for (int j = 0; j < tmpSize; j++) { - accessMask[tmpOffset + j] = true; - } - } - std::shared_ptr msg; - if (pkt->isAtomicOp()) { - msg = std::make_shared(clockEdge(), pkt->getAddr(), - pkt->getPtr(), - pkt->getSize(), pc, crequest->getRubyType(), - RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id, 100, - blockSize, accessMask, - dataBlock, atomicOps, - accessScope, accessSegment); - } else { - msg = std::make_shared(clockEdge(), pkt->getAddr(), - pkt->getPtr(), - pkt->getSize(), pc, crequest->getRubyType(), - RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id, 100, - blockSize, accessMask, - dataBlock, - accessScope, accessSegment); - } - DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", - curTick(), m_version, "Coal", "Begin", "", "", - printAddress(msg->getPhysicalAddress()), - RubyRequestType_to_string(crequest->getRubyType())); - - fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH, - "there should not be any I-Fetch requests in the GPU Coalescer"); - - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(crequest->getRubyType())); - assert(latency > 0); - 
- if (!deadlockCheckEvent.scheduled()) { - schedule(deadlockCheckEvent, - m_deadlock_threshold * clockPeriod() + - curTick()); - } - - assert(m_mandatory_q_ptr); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); -}*/ - template std::ostream & operator<<(ostream &out, const std::unordered_map &map) @@ -890,7 +786,13 @@ GPUCoalescer::completeHitCallback(std::vector & mylist) assert(port != NULL); pkt->senderState = ss->predecessor; - delete ss; + + if (pkt->cmd != MemCmd::WriteReq) { + // for WriteReq, we keep the original senderState until + // writeCompleteCallback + delete ss; + } + port->hitCallback(pkt); trySendRetries(); } diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 789ca308f..74236cb36 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -294,9 +294,11 @@ class GPUCoalescer : public RubyPort Cycles firstResponseTime, bool isRegion); - void atomicCallback(Addr address, - MachineType mach, - const DataBlock& data); + /* atomics need their own callback because the data + might be const coming from SLICC */ + virtual void atomicCallback(Addr address, + MachineType mach, + const DataBlock& data); RequestStatus makeRequest(PacketPtr pkt) override; int outstandingCount() const override { return m_outstanding_count; } @@ -365,7 +367,7 @@ class GPUCoalescer : public RubyPort // since the two following issue functions are protocol-specific, // they must be implemented in a derived coalescer virtual void issueRequest(CoalescedRequest* crequest) = 0; - virtual void issueMemSyncRequest(PacketPtr pkt) = 0; +// virtual void issueMemSyncRequest(PacketPtr pkt) = 0; void kernelCallback(int wavefront_id); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 7632bbb4e..83aaa1a50 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -272,6 +272,10 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt) RubySystem::getBlockSizeBytes()); } + // Save the port in the sender state object to be used later to + // route the response + pkt->pushSenderState(new SenderState(this)); + // Submit the ruby request RequestStatus requestStatus = ruby_port->makeRequest(pkt); @@ -279,16 +283,16 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt) // Otherwise, we need to tell the port to retry at a later point // and return false. if (requestStatus == RequestStatus_Issued) { - // Save the port in the sender state object to be used later to - // route the response - pkt->pushSenderState(new SenderState(this)); - - DPRINTF(RubyPort, "Request %s address %#x issued\n", pkt->cmdString(), + DPRINTF(RubyPort, "Request %s 0x%x issued\n", pkt->cmdString(), pkt->getAddr()); return true; } - if (pkt->cmd != MemCmd::MemFenceReq) { + // pop off sender state as this request failed to issue + SenderState *ss = safe_cast(pkt->popSenderState()); + delete ss; + + if (pkt->cmd != MemCmd::MemSyncReq) { DPRINTF(RubyPort, "Request %s for address %#x did not issue because %s\n", pkt->cmdString(), pkt->getAddr(), @@ -558,7 +562,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt) } // turn packet around to go back to requester if response expected - if (needsResponse) { + if (needsResponse || pkt->isResponse()) { DPRINTF(RubyPort, "Sending packet back over port\n"); // Send a response in the same cycle. 
There is no need to delay the // response because the response latency is already incurred in the diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index cdef2b1f3..eafce6da7 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -44,6 +44,7 @@ #include "cpu/testers/rubytest/RubyTester.hh" #include "debug/GPUCoalescer.hh" #include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" #include "mem/packet.hh" #include "mem/ruby/common/SubBlock.hh" #include "mem/ruby/network/MessageBuffer.hh" @@ -64,207 +65,228 @@ VIPERCoalescerParams::create() } VIPERCoalescer::VIPERCoalescer(const Params *p) - : GPUCoalescer(p) + : GPUCoalescer(p), + m_cache_inv_pkt(nullptr), + m_num_pending_invs(0) { - m_max_wb_per_cycle=p->max_wb_per_cycle; - m_max_inv_per_cycle=p->max_inv_per_cycle; - m_outstanding_inv = 0; - m_outstanding_wb = 0; } VIPERCoalescer::~VIPERCoalescer() { } -void -VIPERCoalescer::issueRequest(CoalescedRequest* crequest) -{ -} - -void -VIPERCoalescer::issueMemSyncRequest(PacketPtr pkt) -{ -} - // Places an uncoalesced packet in uncoalescedTable. If the packet is a // special type (MemFence, scoping, etc), it is issued immediately. RequestStatus VIPERCoalescer::makeRequest(PacketPtr pkt) { - if (m_outstanding_wb | m_outstanding_inv) { - DPRINTF(GPUCoalescer, - "There are %d Writebacks and %d Invalidatons\n", - m_outstanding_wb, m_outstanding_inv); - } - // Are we in the middle of a release - if ((m_outstanding_wb) > 0) { - if (pkt->req->isKernel()) { - // Everythign is fine - // Barriers and Kernel End scan coalesce - // If it is a Kerenl Begin flush the cache - if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) { - invL1(); - } - - if (pkt->req->isRelease()) { - insertKernel(pkt->req->contextId(), pkt); - } - - return RequestStatus_Issued; - } - } else if (pkt->req->isKernel() && pkt->req->isRelease()) { - // Flush Dirty Data on Kernel End - // isKernel + isRelease - insertKernel(pkt->req->contextId(), pkt); - wbL1(); - if (m_outstanding_wb == 0) { - for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { - newKernelEnds.push_back(it->first); - } - completeIssue(); - } - return RequestStatus_Issued; + // VIPER only supports following memory request types + // MemSyncReq & Acquire: TCP cache invalidation + // ReadReq : cache read + // WriteReq : cache write + // AtomicOp : cache atomic + // + // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit + // does not specify an equivalent type of memory request. + // TODO: future patches should rename Acquire and Release + assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) || + pkt->cmd == MemCmd::ReadReq || + pkt->cmd == MemCmd::WriteReq || + pkt->isAtomicOp()); + + if (pkt->req->isAcquire() && m_cache_inv_pkt) { + // In VIPER protocol, the coalescer is not able to handle two or + // more cache invalidation requests at a time. Cache invalidation + // requests must be serialized to ensure that all stale data in + // TCP are invalidated correctly. 
If there's already a pending + // cache invalidation request, we must retry this request later + return RequestStatus_Aliased; } GPUCoalescer::makeRequest(pkt); - if (pkt->req->isKernel() && pkt->req->isAcquire()) { - // Invalidate clean Data on Kernel Begin - // isKernel + isAcquire - invL1(); - } else if (pkt->req->isAcquire() && pkt->req->isRelease()) { - // Deschedule the AtomicAcqRel and - // Flush and Invalidate the L1 cache - invwbL1(); - if (m_outstanding_wb > 0 && issueEvent.scheduled()) { - DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); - deschedule(issueEvent); - } - } else if (pkt->req->isRelease()) { - // Deschedule the StoreRel and - // Flush the L1 cache - wbL1(); - if (m_outstanding_wb > 0 && issueEvent.scheduled()) { - DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); - deschedule(issueEvent); - } - } else if (pkt->req->isAcquire()) { - // LoadAcq or AtomicAcq - // Invalidate the L1 cache - invL1(); - } - // Request was successful - if (m_outstanding_wb == 0) { - if (!issueEvent.scheduled()) { - DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n"); - schedule(issueEvent, curTick()); - } + if (pkt->req->isAcquire()) { + // In VIPER protocol, a compute unit sends a MemSyncReq with Acquire + // flag to invalidate TCP. Upon receiving a request of this type, + // VIPERCoalescer starts a cache walk to invalidate all valid entries + // in TCP. The request is completed once all entries are invalidated. + assert(!m_cache_inv_pkt); + m_cache_inv_pkt = pkt; + invTCP(); } + return RequestStatus_Issued; } void -VIPERCoalescer::wbCallback(Addr addr) +VIPERCoalescer::issueRequest(CoalescedRequest* crequest) { - m_outstanding_wb--; - // if L1 Flush Complete - // attemnpt to schedule issueEvent - assert(((int) m_outstanding_wb) >= 0); - if (m_outstanding_wb == 0) { - for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { - newKernelEnds.push_back(it->first); + PacketPtr pkt = crequest->getFirstPkt(); + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector accessMask(blockSize,false); + std::vector< std::pair > atomicOps; + uint32_t tableSize = crequest->getPackets().size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = crequest->getPackets()[i]; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if (tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; } - completeIssue(); } - trySendRetries(); + std::shared_ptr msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared(clockEdge(), pkt->getAddr(), + pkt->getPtr(), + pkt->getSize(), pc, crequest->getRubyType(), + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, crequest->getSeqNum()); + } else { + msg = std::make_shared(clockEdge(), pkt->getAddr(), + pkt->getPtr(), + pkt->getSize(), pc, crequest->getRubyType(), + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, crequest->getSeqNum()); + } + + if (pkt->cmd == MemCmd::WriteReq) { + makeWriteCompletePkts(crequest); + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(crequest->getRubyType())); + + fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + if (!deadlockCheckEvent.scheduled()) { + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } + + assert(m_mandatory_q_ptr); + Tick latency = cyclesToTicks( + m_controller->mandatoryQueueLatency(crequest->getRubyType())); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); } void -VIPERCoalescer::invCallback(Addr addr) +VIPERCoalescer::makeWriteCompletePkts(CoalescedRequest* crequest) { - m_outstanding_inv--; - // if L1 Flush Complete - // attemnpt to schedule issueEvent - // This probably won't happen, since - // we dont wait on cache invalidations - if (m_outstanding_wb == 0) { - for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { - newKernelEnds.push_back(it->first); - } - completeIssue(); + // In VIPER protocol, for each write request, down-stream caches + // return two responses: writeCallback and writeCompleteCallback. + // We need to prepare a writeCompletePkt for each write request so + // that when writeCompleteCallback is called, we can respond + // requesting wavefront right away. + // writeCompletePkt inherits request and senderState of the original + // write request packet so that we can find the original requestor + // later. This assumes that request and senderState are not deleted + // before writeCompleteCallback is called. 
+ + auto key = crequest->getSeqNum(); + std::vector& req_pkts = crequest->getPackets(); + + for (auto pkt : req_pkts) { + DPRINTF(GPUCoalescer, "makeWriteCompletePkts: instSeqNum %d\n", + key); + assert(pkt->cmd == MemCmd::WriteReq); + + PacketPtr writeCompletePkt = new Packet(pkt->req, + MemCmd::WriteCompleteResp); + writeCompletePkt->setAddr(pkt->getAddr()); + writeCompletePkt->senderState = pkt->senderState; + m_writeCompletePktMap[key].push_back(writeCompletePkt); } - trySendRetries(); } -/** - * Invalidate L1 cache (Acquire) - */ void -VIPERCoalescer::invL1() +VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum) { - int size = m_dataCache_ptr->getNumBlocks(); - DPRINTF(GPUCoalescer, - "There are %d Invalidations outstanding before Cache Walk\n", - m_outstanding_inv); - // Walk the cache - for (int i = 0; i < size; i++) { - Addr addr = m_dataCache_ptr->getAddressAtIdx(i); - // Evict Read-only data - RubyRequestType request_type = RubyRequestType_REPLACEMENT; - std::shared_ptr msg = std::make_shared( - clockEdge(), addr, (uint8_t*) 0, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); - assert(m_mandatory_q_ptr != NULL); - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_inv++; + DPRINTF(GPUCoalescer, "writeCompleteCallback: instSeqNum %d addr 0x%x\n", + instSeqNum, addr); + + auto key = instSeqNum; + assert(m_writeCompletePktMap.count(key) == 1 && + !m_writeCompletePktMap[key].empty()); + + for (auto writeCompletePkt : m_writeCompletePktMap[key]) { + if (makeLineAddress(writeCompletePkt->getAddr()) == addr) { + RubyPort::SenderState *ss = + safe_cast + (writeCompletePkt->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + writeCompletePkt->senderState = ss->predecessor; + delete ss; + port->hitCallback(writeCompletePkt); + } } - DPRINTF(GPUCoalescer, - "There are %d Invalidatons outstanding after Cache Walk\n", - m_outstanding_inv); + + trySendRetries(); + + if (m_writeCompletePktMap[key].empty()) + m_writeCompletePktMap.erase(key); } -/** - * Writeback L1 cache (Release) - */ void -VIPERCoalescer::wbL1() +VIPERCoalescer::invTCPCallback(Addr addr) { - int size = m_dataCache_ptr->getNumBlocks(); - DPRINTF(GPUCoalescer, - "There are %d Writebacks outstanding before Cache Walk\n", - m_outstanding_wb); - // Walk the cache - for (int i = 0; i < size; i++) { - Addr addr = m_dataCache_ptr->getAddressAtIdx(i); - // Write dirty data back - RubyRequestType request_type = RubyRequestType_FLUSH; - std::shared_ptr msg = std::make_shared( - clockEdge(), addr, (uint8_t*) 0, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); - assert(m_mandatory_q_ptr != NULL); - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_wb++; + assert(m_cache_inv_pkt && m_num_pending_invs > 0); + + m_num_pending_invs--; + + if (m_num_pending_invs == 0) { + std::vector pkt_list { m_cache_inv_pkt }; + completeHitCallback(pkt_list); + m_cache_inv_pkt = nullptr; } - DPRINTF(GPUCoalescer, - "There are %d Writebacks outstanding after Cache Walk\n", - m_outstanding_wb); } /** - * Invalidate and Writeback L1 cache (Acquire&Release) + * Invalidate TCP (Acquire) */ void -VIPERCoalescer::invwbL1() +VIPERCoalescer::invTCP() { int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Invalidations 
outstanding before Cache Walk\n", + m_num_pending_invs); // Walk the cache for (int i = 0; i < size; i++) { Addr addr = m_dataCache_ptr->getAddressAtIdx(i); @@ -274,27 +296,14 @@ VIPERCoalescer::invwbL1() clockEdge(), addr, (uint8_t*) 0, 0, 0, request_type, RubyAccessMode_Supervisor, nullptr); + DPRINTF(GPUCoalescer, "Evicting addr 0x%x\n", addr); assert(m_mandatory_q_ptr != NULL); Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); + m_controller->mandatoryQueueLatency(request_type)); m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_inv++; - } - // Walk the cache - for (int i = 0; i< size; i++) { - Addr addr = m_dataCache_ptr->getAddressAtIdx(i); - // Write dirty data back - RubyRequestType request_type = RubyRequestType_FLUSH; - std::shared_ptr msg = std::make_shared( - clockEdge(), addr, (uint8_t*) 0, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); - assert(m_mandatory_q_ptr != NULL); - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_wb++; + m_num_pending_invs++; } + DPRINTF(GPUCoalescer, + "There are %d Invalidatons outstanding after Cache Walk\n", + m_num_pending_invs); } diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index 659c9fd34..2f68c10bc 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -57,19 +57,31 @@ class VIPERCoalescer : public GPUCoalescer typedef VIPERCoalescerParams Params; VIPERCoalescer(const Params *); ~VIPERCoalescer(); - - void issueMemSyncRequest(PacketPtr pkt) override; - void issueRequest(CoalescedRequest* crequest) override; - void wbCallback(Addr address); - void invCallback(Addr address); + void writeCompleteCallback(Addr address, uint64_t instSeqNum); + void invTCPCallback(Addr address); RequestStatus makeRequest(PacketPtr pkt) override; + void issueRequest(CoalescedRequest* crequest) override; + private: - void invL1(); - void wbL1(); - void invwbL1(); - uint64_t m_outstanding_inv; - uint64_t m_outstanding_wb; - uint64_t m_max_inv_per_cycle; - uint64_t m_max_wb_per_cycle; + void invTCP(); + + // make write-complete response packets from original write request packets + void makeWriteCompletePkts(CoalescedRequest* crequest); + + // current cache invalidation packet + // nullptr if there is no active cache invalidation request + PacketPtr m_cache_inv_pkt; + + // number of remaining cache lines to be invalidated in TCP + int m_num_pending_invs; + + // a map of instruction sequence number and corresponding pending + // write-complete response packets. Each write-complete response + // corresponds to a pending store request that is waiting for + // writeCompleteCallback. We may have multiple pending store requests per + // wavefront at a time. Each time writeCompleteCallback is called, an entry + // with a corresponding seqNum is popped off from map and returned to + // compute unit. + std::unordered_map> m_writeCompletePktMap; }; #endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
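
For readers tracing the new write-completion path in this patch (TCP action wd_wtDone -> VIPERCoalescer::writeCompleteCallback), the standalone C++ sketch below models the bookkeeping the patch introduces: pending write-complete responses are keyed by the store instruction's sequence number and handed back only when the protocol acknowledges the write. This is a simplified illustration and not gem5 code; Addr, WritePkt, WriteCompleteTracker, and the prints are stand-ins for the real Packet/SenderState machinery behind m_writeCompletePktMap.

// Standalone sketch (not gem5 code): pending write-complete responses
// keyed by instruction sequence number, in the spirit of
// VIPERCoalescer::makeWriteCompletePkts() / writeCompleteCallback().
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

using Addr = uint64_t;

// Minimal stand-in for a write-complete response packet.
struct WritePkt {
    Addr lineAddr;      // cache-line address of the original store
    uint64_t seqNum;    // instruction sequence number of the store
};

class WriteCompleteTracker {
  public:
    // Called when a coalesced WriteReq is issued: record one pending
    // write-complete entry per packet, keyed by the store's seqNum.
    void recordPendingWrite(uint64_t seqNum, Addr lineAddr) {
        pending_[seqNum].push_back({lineAddr, seqNum});
    }

    // Called when the protocol delivers a write-complete response
    // (TCC_AckWB -> wd_wtDone -> writeCompleteCallback in the patch).
    // Pops every pending entry for this (seqNum, lineAddr) pair.
    void writeComplete(uint64_t seqNum, Addr lineAddr) {
        auto it = pending_.find(seqNum);
        assert(it != pending_.end() && !it->second.empty());

        auto &vec = it->second;
        for (auto p = vec.begin(); p != vec.end();) {
            if (p->lineAddr == lineAddr) {
                std::cout << "store seqNum " << p->seqNum
                          << " complete at 0x" << std::hex
                          << p->lineAddr << std::dec << "\n";
                p = vec.erase(p);
            } else {
                ++p;
            }
        }
        if (vec.empty())
            pending_.erase(it);   // all writes of this instruction done
    }

    bool hasPending(uint64_t seqNum) const {
        return pending_.count(seqNum) != 0;
    }

  private:
    // instSeqNum -> pending write-complete entries, mirroring
    // m_writeCompletePktMap in VIPERCoalescer.hh.
    std::unordered_map<uint64_t, std::vector<WritePkt>> pending_;
};

int main() {
    WriteCompleteTracker tracker;

    // Two coalesced stores from the same instruction (seqNum 42)
    // touching two different cache lines.
    tracker.recordPendingWrite(42, 0x1000);
    tracker.recordPendingWrite(42, 0x1040);

    tracker.writeComplete(42, 0x1000);
    assert(tracker.hasPending(42));      // second line still outstanding

    tracker.writeComplete(42, 0x1040);
    assert(!tracker.hasPending(42));     // instruction fully retired
    return 0;
}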
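
A simpler piece of accounting backs the new Acquire handling: VIPERCoalescer::invTCP() walks every TCP block index, enqueues one REPLACEMENT request per block, and counts them in m_num_pending_invs; invTCPCallback() decrements that count and completes the saved MemSyncReq packet once it reaches zero, while makeRequest() returns RequestStatus_Aliased to serialize any second Acquire that arrives mid-walk. The sketch below is a hypothetical, self-contained model of that counter logic only; the class and method names are made up for illustration.

// Standalone sketch (not gem5 code): cache-invalidation accounting for
// an Acquire MemSyncReq, modeled on invTCP()/invTCPCallback().
#include <cassert>
#include <iostream>

class InvTracker {
  public:
    // Start a cache walk: one pending invalidation per cache block.
    void startInvalidate(int numBlocks) {
        assert(!inFlight_ && pending_ == 0);
        inFlight_ = true;
        pending_ = numBlocks;
    }

    // One block acknowledged by the protocol (invTCPCallback).
    void invAck() {
        assert(inFlight_ && pending_ > 0);
        if (--pending_ == 0) {
            inFlight_ = false;
            std::cout << "all lines invalidated; sync request done\n";
        }
    }

    // A second Acquire must be rejected (RequestStatus_Aliased in the
    // patch) while a walk is still in flight.
    bool busy() const { return inFlight_; }

  private:
    bool inFlight_ = false;
    int pending_ = 0;
};

int main() {
    InvTracker t;
    t.startInvalidate(3);          // e.g. a 3-block TCP
    assert(t.busy());
    t.invAck(); t.invAck(); t.invAck();
    assert(!t.busy());
    return 0;
}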