From: Matthew Poremba
Date: Fri, 8 May 2020 15:37:59 +0000 (-0500)
Subject: gpu-compute,mem-ruby: Refactor GPU coalescer
X-Git-Tag: v20.1.0.0~649
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=3d57eaf9f52813b2b89e2def8fe52c0c09544207;p=gem5.git

gpu-compute,mem-ruby: Refactor GPU coalescer

Remove the read/write request tables and the coalescing table, and
introduce two levels of tables for uncoalesced and coalesced packets.
Tokens are granted to GPU instructions for slots in the uncoalesced
table. If tokens are available, the operation always succeeds, so the
'Aliased' status is never returned. Coalesced accesses are placed in
the coalesced table while their requests are outstanding. Requests to
the same address are added as targets to the existing table entry,
similar to how MSHRs operate.

Change-Id: I44983610307b638a97472db3576d0a30df2de600
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/27429
Reviewed-by: Bradford Beckmann
Reviewed-by: Jason Lowe-Power
Maintainer: Bradford Beckmann
Tested-by: kokoro
---

diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index fee025435..7eaf65fec 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -129,6 +129,8 @@ class ComputeUnit(ClockedObject):
                                  "memory pipeline's queues")
     local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                  "memory pipeline's queues")
+    max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
+                              " of instructions that can be sent to the coalescer")
     ldsBus = Bridge() # the bridge between the CU and its LDS
     ldsPort = MasterPort("The port that goes to the LDS")
     localDataStore = Param.LdsState("the LDS for this CU")
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 59bc6a004..cd880d6cc 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -74,9 +74,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
     req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
     resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
     _masterId(p->system->getMasterId(this, "ComputeUnit")),
-    lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
-    globalSeqNum(0), wavefrontSize(p->wfSize),
-    kernelLaunchInst(new KernelLaunchStaticInst())
+    lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
+    _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
+    wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst())
 {
     /**
      * This check is necessary because std::bitset only provides conversion
@@ -139,6 +139,10 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
 
     memPort.resize(wfSize());
 
+    // Set up tokens for slave ports. The number of tokens in memPortTokens
+    // is the total token count for the entire vector port (i.e., this CU).
+    memPortTokens = new TokenManager(p->max_cu_tokens);
+
     // resize the tlbPort vectorArray
     int tlbPort_width = perLaneTLB ? wfSize() : 1;
     tlbPort.resize(tlbPort_width);
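The token handshake that max_cu_tokens sizes can be pictured with a small stand-alone model. This is an illustrative sketch only: SimpleTokenPool is a hypothetical stand-in for gem5's TokenManager (src/mem/token_port.hh), not the real class.

    #include <cassert>

    // Counting-semaphore model of the CU <-> coalescer token exchange.
    class SimpleTokenPool
    {
      public:
        explicit SimpleTokenPool(int max_tokens) : available(max_tokens) {}
        // CU side: checked in GlobalMemPipeline::coalescerReady().
        bool haveTokens(int num) const { return available >= num; }
        // CU side: taken when an instruction is sent to the coalescer.
        void acquireTokens(int num) { assert(haveTokens(num)); available -= num; }
        // Coalescer side: returned when an instruction's packets drain.
        void recvTokens(int num) { available += num; }
      private:
        int available;
    };

With the default max_cu_tokens of 4, at most four instructions' worth of packets can occupy the uncoalesced table before the global memory pipeline stalls.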
@@ -612,6 +616,8 @@ ComputeUnit::init()
     vectorAluInstAvail.resize(numSIMDs, false);
     shrMemInstAvail = 0;
     glbMemInstAvail = 0;
+
+    gmTokenPort.setTokenManager(memPortTokens);
 }
 
 bool
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index a023cb23c..49713e936 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -51,6 +51,7 @@
 #include "gpu-compute/schedule_stage.hh"
 #include "gpu-compute/scoreboard_check_stage.hh"
 #include "mem/port.hh"
+#include "mem/token_port.hh"
 #include "sim/clocked_object.hh"
 
 static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
@@ -415,6 +416,26 @@ class ComputeUnit : public ClockedObject
 
     CUExitCallback *cuExitCallback;
 
+    class GMTokenPort : public TokenMasterPort
+    {
+      public:
+        GMTokenPort(const std::string& name, SimObject *owner,
+                    PortID id = InvalidPortID)
+            : TokenMasterPort(name, owner, id)
+        { }
+        ~GMTokenPort() { }
+
+      protected:
+        bool recvTimingResp(PacketPtr) { return false; }
+        void recvReqRetry() { }
+    };
+
+    // Manager for the number of tokens available to this compute unit to
+    // send global memory request packets to the coalescer. This is only used
+    // between the global memory pipe and the TCP coalescer.
+    TokenManager *memPortTokens;
+    GMTokenPort gmTokenPort;
+
     /** Data access Port **/
     class DataPort : public MasterPort
     {
@@ -677,6 +698,12 @@ class ComputeUnit : public ClockedObject
         return ldsPort;
     }
 
+    TokenManager *
+    getTokenManager()
+    {
+        return memPortTokens;
+    }
+
     /** The memory port for SIMD data accesses.
      *  Can be connected to PhysMem for Ruby for timing simulations
      */
@@ -712,6 +739,8 @@ class ComputeUnit : public ClockedObject
         }
         ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
         return *ldsPort;
+    } else if (if_name == "gmTokenPort") {
+        return gmTokenPort;
     } else {
         panic("incorrect port name");
     }
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index d8e6d47df..64778f011 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -33,6 +33,7 @@
 
 #include "gpu-compute/global_memory_pipeline.hh"
 
+#include "debug/GPUCoalescer.hh"
 #include "debug/GPUMem.hh"
 #include "debug/GPUReg.hh"
 #include "gpu-compute/compute_unit.hh"
@@ -56,6 +57,25 @@ GlobalMemPipeline::init(ComputeUnit *cu)
     _name = computeUnit->name() + ".GlobalMemPipeline";
 }
 
+bool
+GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
+{
+    // We require one token from the coalescer's uncoalesced table to
+    // proceed.
+    int token_count = 1;
+
+    // Make sure the vector port has tokens. There is a single pool
+    // of tokens, so only one port in the vector port needs to be checked.
+    // Lane 0 is chosen arbitrarily.
+    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
+    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
+        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
+        return false;
+    }
+
+    return true;
+}
+
 void
 GlobalMemPipeline::exec()
 {
@@ -124,6 +144,14 @@ GlobalMemPipeline::exec()
         }
     }
 
+    DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
+            mp->disassemble(), mp->seqNum());
+    // Mem fences do not return tokens but must still be issued, so do not
+    // acquire a token for them; doing so would slowly deplete the token
+    // count and eventually deadlock.
+    if (!mp->isMemFence()) {
+        assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
+        mp->computeUnit()->getTokenManager()->acquireTokens(1);
+    }
     mp->initiateAcc(mp);
 
     if (!outOfOrderDataDelivery && !mp->isMemFence()) {
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
index 0bc8596c6..2f83185a9 100644
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -121,6 +121,8 @@ class GlobalMemPipeline
         loadVrfBankConflictCycles += num_cycles;
     }
 
+    bool coalescerReady(GPUDynInstPtr mp) const;
+
   private:
     ComputeUnit *computeUnit;
     std::string _name;
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index e70a874d2..46cce9ce8 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -434,6 +434,11 @@ Wavefront::ready(itype_e type)
             return 0;
         }
 
+        // Does the coalescer have space for our instruction?
+        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+            return 0;
+        }
+
         if (!computeUnit->globalMemoryPipe.
             isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
             // Can we insert a new request to the Global Mem Request FIFO?
@@ -504,6 +509,12 @@ Wavefront::ready(itype_e type)
         if (!locMemIssueRdy) {
             return 0;
         }
+
+        // Does the coalescer have space for our instruction?
+        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+            return 0;
+        }
+
         if (!computeUnit->globalMemoryPipe.
             isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
             // Can we insert a new request to the Global Mem Request FIFO?
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index a7b658ee1..0153b4c4b 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -113,11 +113,95 @@ reqSegmentToHSASegment(const RequestPtr &req)
     return accessSegment;
 }
 
+UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
+    : coalescer(gc)
+{
+}
+
+void
+UncoalescedTable::insertPacket(PacketPtr pkt)
+{
+    uint64_t seqNum = pkt->req->getReqInstSeqNum();
+
+    instMap[seqNum].push_back(pkt);
+    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
+            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
+}
+
+bool
+UncoalescedTable::packetAvailable()
+{
+    return !instMap.empty();
+}
+
+PerInstPackets*
+UncoalescedTable::getInstPackets(int offset)
+{
+    if (offset >= instMap.size()) {
+        return nullptr;
+    }
+
+    auto instMapIter = instMap.begin();
+    std::advance(instMapIter, offset);
+
+    return &(instMapIter->second);
+}
+
+void
+UncoalescedTable::updateResources()
+{
+    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
+        if (iter->second.empty()) {
+            instMap.erase(iter++);
+            coalescer->getGMTokenPort().sendTokens(1);
+        } else {
+            ++iter;
+        }
+    }
+}
+
+void
+UncoalescedTable::printRequestTable(std::stringstream& ss)
+{
+    ss << "UncoalescedTable contains " << instMap.size()
+       << " instruction entries." << std::endl;
+    for (auto& inst : instMap) {
+        ss << "Seq num 0x" << std::hex << inst.first << std::dec
+           << " with " << inst.second.size() << " packets"
+           << std::endl;
+    }
+}
+
+void
+UncoalescedTable::checkDeadlock(Tick threshold)
+{
+    Tick current_time = curTick();
+
+    for (auto &it : instMap) {
+        for (auto &pkt : it.second) {
+            if (current_time - pkt->req->time() > threshold) {
+                std::stringstream ss;
+                printRequestTable(ss);
+
+                panic("Possible Deadlock detected. Aborting!\n"
+                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
+                      "current time: %u issue_time: %d difference: %d\n"
+                      "Request Tables:\n\n%s", coalescer->getId(),
+                      pkt->getAddr(), instMap.size(), current_time,
+                      pkt->req->time(), current_time - pkt->req->time(),
+                      ss.str());
+            }
+        }
+    }
+}
+
 GPUCoalescer::GPUCoalescer(const Params *p)
     : RubyPort(p),
       issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                  false, Event::Progress_Event_Pri),
-      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
+      uncoalescedTable(this),
+      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
+      gmTokenPort(name() + ".gmTokenPort", this)
 {
     m_store_waiting_on_load_cycles = 0;
     m_store_waiting_on_store_cycles = 0;
@@ -126,8 +210,9 @@ GPUCoalescer::GPUCoalescer(const Params *p)
 
     m_outstanding_count = 0;
 
+    coalescingWindow = p->max_coalesces_per_cycle;
+
     m_max_outstanding_requests = 0;
-    m_deadlock_threshold = 0;
 
     m_instCache_ptr = nullptr;
     m_dataCache_ptr = nullptr;
@@ -149,58 +234,72 @@ GPUCoalescer::~GPUCoalescer()
 {
 }
 
+Port &
+GPUCoalescer::getPort(const std::string &if_name, PortID idx)
+{
+    if (if_name == "gmTokenPort") {
+        return gmTokenPort;
+    }
+
+    // delegate to RubyPort otherwise
+    return RubyPort::getPort(if_name, idx);
+}
+
 void
 GPUCoalescer::wakeup()
 {
-    // Check for deadlock of any of the requests
     Cycles current_time = curCycle();
-
-    // Check across all outstanding requests
-    int total_outstanding = 0;
-
-    RequestTable::iterator read = m_readRequestTable.begin();
-    RequestTable::iterator read_end = m_readRequestTable.end();
-    for (; read != read_end; ++read) {
-        GPUCoalescerRequest* request = read->second;
-        if (current_time - request->issue_time < m_deadlock_threshold)
-            continue;
-
-        panic("Possible Deadlock detected. Aborting!\n"
-              "version: %d request.paddr: 0x%x m_readRequestTable: %d "
-              "current time: %u issue_time: %d difference: %d\n", m_version,
-              request->pkt->getAddr(), m_readRequestTable.size(),
-              current_time * clockPeriod(), request->issue_time * clockPeriod(),
-              (current_time - request->issue_time)*clockPeriod());
-    }
-
-    RequestTable::iterator write = m_writeRequestTable.begin();
-    RequestTable::iterator write_end = m_writeRequestTable.end();
-    for (; write != write_end; ++write) {
-        GPUCoalescerRequest* request = write->second;
-        if (current_time - request->issue_time < m_deadlock_threshold)
-            continue;
-
-        panic("Possible Deadlock detected. Aborting!\n"
-              "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
-              "current time: %u issue_time: %d difference: %d\n", m_version,
-              request->pkt->getAddr(), m_writeRequestTable.size(),
-              current_time * clockPeriod(), request->issue_time * clockPeriod(),
-              (current_time - request->issue_time) * clockPeriod());
+    for (auto& requestList : coalescedTable) {
+        for (auto& req : requestList.second) {
+            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
+                std::stringstream ss;
+                printRequestTable(ss);
+                ss << "Outstanding requests: " << m_outstanding_count
+                   << std::endl;
+
+                panic("Possible Deadlock detected. 
Aborting!\n" + "version: %d request.paddr: 0x%x coalescedTable: %d " + "current time: %u issue_time: %d difference: %d\n" + "Request Tables:\n %s", m_version, + req->getFirstPkt()->getAddr(), + coalescedTable.size(), cyclesToTicks(current_time), + cyclesToTicks(req->getIssueTime()), + cyclesToTicks(current_time - req->getIssueTime()), + ss.str()); + } + } } - total_outstanding += m_writeRequestTable.size(); - total_outstanding += m_readRequestTable.size(); - - assert(m_outstanding_count == total_outstanding); + Tick tick_threshold = cyclesToTicks(m_deadlock_threshold); + uncoalescedTable.checkDeadlock(tick_threshold); if (m_outstanding_count > 0) { - // If there are still outstanding requests, keep checking schedule(deadlockCheckEvent, m_deadlock_threshold * clockPeriod() + curTick()); } } +void +GPUCoalescer::printRequestTable(std::stringstream& ss) +{ + uncoalescedTable.printRequestTable(ss); + + ss << "CoalescedTable contains " << coalescedTable.size() + << " address entries." << std::endl; + for (auto& requestList : coalescedTable) { + ss << "Addr 0x" << std::hex << requestList.first << std::dec + << ": type-"; + for (auto& request : requestList.second) { + ss << RubyRequestType_to_string(request->getRubyType()) + << " pkts-" << request->getPackets().size() + << " issued-" << request->getIssueTime() << " seqNum-" + << request->getSeqNum() << "; "; + } + ss << std::endl; + } +} + void GPUCoalescer::resetStats() { @@ -229,65 +328,6 @@ GPUCoalescer::printProgress(ostream& out) const { } -RequestStatus -GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) -{ - Addr line_addr = makeLineAddress(pkt->getAddr()); - - if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { - return RequestStatus_BufferFull; - } - - if (m_controller->isBlocked(line_addr) && - request_type != RubyRequestType_Locked_RMW_Write) { - return RequestStatus_Aliased; - } - - if ((request_type == RubyRequestType_ST) || - (request_type == RubyRequestType_ATOMIC) || - (request_type == RubyRequestType_ATOMIC_RETURN) || - (request_type == RubyRequestType_ATOMIC_NO_RETURN) || - (request_type == RubyRequestType_RMW_Read) || - (request_type == RubyRequestType_RMW_Write) || - (request_type == RubyRequestType_Load_Linked) || - (request_type == RubyRequestType_Store_Conditional) || - (request_type == RubyRequestType_Locked_RMW_Read) || - (request_type == RubyRequestType_Locked_RMW_Write) || - (request_type == RubyRequestType_FLUSH)) { - - // Check if there is any outstanding read request for the same - // cache line. - if (m_readRequestTable.count(line_addr) > 0) { - m_store_waiting_on_load_cycles++; - return RequestStatus_Aliased; - } - - if (m_writeRequestTable.count(line_addr) > 0) { - // There is an outstanding write request for the cache line - m_store_waiting_on_store_cycles++; - return RequestStatus_Aliased; - } - } else { - // Check if there is any outstanding write request for the same - // cache line. 
- if (m_writeRequestTable.count(line_addr) > 0) { - m_load_waiting_on_store_cycles++; - return RequestStatus_Aliased; - } - - if (m_readRequestTable.count(line_addr) > 0) { - // There is an outstanding read request for the cache line - m_load_waiting_on_load_cycles++; - return RequestStatus_Aliased; - } - } - - return RequestStatus_Ready; - -} - - - // sets the kernelEndList void GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) @@ -303,153 +343,6 @@ GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) kernelEndList.size()); } - -// Insert the request on the correct request table. Return true if -// the entry was already present. -bool -GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) -{ - assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || - pkt->req->isLockedRMW() || - !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); - - int total_outstanding M5_VAR_USED = - m_writeRequestTable.size() + m_readRequestTable.size(); - - assert(m_outstanding_count == total_outstanding); - - // See if we should schedule a deadlock check - if (!deadlockCheckEvent.scheduled()) { - schedule(deadlockCheckEvent, m_deadlock_threshold + curTick()); - } - - Addr line_addr = makeLineAddress(pkt->getAddr()); - if ((request_type == RubyRequestType_ST) || - (request_type == RubyRequestType_ATOMIC) || - (request_type == RubyRequestType_ATOMIC_RETURN) || - (request_type == RubyRequestType_ATOMIC_NO_RETURN) || - (request_type == RubyRequestType_RMW_Read) || - (request_type == RubyRequestType_RMW_Write) || - (request_type == RubyRequestType_Load_Linked) || - (request_type == RubyRequestType_Store_Conditional) || - (request_type == RubyRequestType_Locked_RMW_Read) || - (request_type == RubyRequestType_Locked_RMW_Write) || - (request_type == RubyRequestType_FLUSH)) { - - pair r = - m_writeRequestTable.insert(RequestTable::value_type(line_addr, - (GPUCoalescerRequest*) NULL)); - if (r.second) { - RequestTable::iterator i = r.first; - i->second = new GPUCoalescerRequest(pkt, request_type, - curCycle()); - DPRINTF(GPUCoalescer, - "Inserting write request for paddr %#x for type %d\n", - pkt->req->getPaddr(), i->second->m_type); - m_outstanding_count++; - } else { - return true; - } - } else { - pair r = - m_readRequestTable.insert(RequestTable::value_type(line_addr, - (GPUCoalescerRequest*) NULL)); - - if (r.second) { - RequestTable::iterator i = r.first; - i->second = new GPUCoalescerRequest(pkt, request_type, - curCycle()); - DPRINTF(GPUCoalescer, - "Inserting read request for paddr %#x for type %d\n", - pkt->req->getPaddr(), i->second->m_type); - m_outstanding_count++; - } else { - return true; - } - } - - m_outstandReqHist.sample(m_outstanding_count); - - total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); - assert(m_outstanding_count == total_outstanding); - - return false; -} - -void -GPUCoalescer::markRemoved() -{ - m_outstanding_count--; - assert(m_outstanding_count == - m_writeRequestTable.size() + m_readRequestTable.size()); -} - -void -GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) -{ - assert(m_outstanding_count == - m_writeRequestTable.size() + m_readRequestTable.size()); - - Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); - if ((srequest->m_type == RubyRequestType_ST) || - (srequest->m_type == RubyRequestType_RMW_Read) || - (srequest->m_type == RubyRequestType_RMW_Write) || - (srequest->m_type == RubyRequestType_Load_Linked) || - (srequest->m_type == RubyRequestType_Store_Conditional) || - 
(srequest->m_type == RubyRequestType_Locked_RMW_Read) || - (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { - m_writeRequestTable.erase(line_addr); - } else { - m_readRequestTable.erase(line_addr); - } - - markRemoved(); -} - -bool -GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) -{ - // - // The success flag indicates whether the LLSC operation was successful. - // LL ops will always succeed, but SC may fail if the cache line is no - // longer locked. - // - bool success = true; - if (request->m_type == RubyRequestType_Store_Conditional) { - if (!m_dataCache_ptr->isLocked(address, m_version)) { - // - // For failed SC requests, indicate the failure to the cpu by - // setting the extra data to zero. - // - request->pkt->req->setExtraData(0); - success = false; - } else { - // - // For successful SC requests, indicate the success to the cpu by - // setting the extra data to one. - // - request->pkt->req->setExtraData(1); - } - // - // Independent of success, all SC operations must clear the lock - // - m_dataCache_ptr->clearLocked(address); - } else if (request->m_type == RubyRequestType_Load_Linked) { - // - // Note: To fully follow Alpha LLSC semantics, should the LL clear any - // previously locked cache lines? - // - m_dataCache_ptr->setLocked(address, m_version); - } else if ((m_dataCache_ptr->isTagPresent(address)) && - (m_dataCache_ptr->isLocked(address, m_version))) { - // - // Normal writes should clear the locked address - // - m_dataCache_ptr->clearLocked(address); - } - return success; -} - void GPUCoalescer::writeCallback(Addr address, DataBlock& data) { @@ -487,49 +380,22 @@ GPUCoalescer::writeCallback(Addr address, bool isRegion) { assert(address == makeLineAddress(address)); + assert(coalescedTable.count(address)); - DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); - assert(m_writeRequestTable.count(makeLineAddress(address))); - - RequestTable::iterator i = m_writeRequestTable.find(address); - assert(i != m_writeRequestTable.end()); - GPUCoalescerRequest* request = i->second; - - m_writeRequestTable.erase(i); - markRemoved(); + auto crequest = coalescedTable.at(address).front(); - assert((request->m_type == RubyRequestType_ST) || - (request->m_type == RubyRequestType_ATOMIC) || - (request->m_type == RubyRequestType_ATOMIC_RETURN) || - (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || - (request->m_type == RubyRequestType_RMW_Read) || - (request->m_type == RubyRequestType_RMW_Write) || - (request->m_type == RubyRequestType_Load_Linked) || - (request->m_type == RubyRequestType_Store_Conditional) || - (request->m_type == RubyRequestType_Locked_RMW_Read) || - (request->m_type == RubyRequestType_Locked_RMW_Write) || - (request->m_type == RubyRequestType_FLUSH)); + hitCallback(crequest, mach, data, true, crequest->getIssueTime(), + forwardRequestTime, firstResponseTime, isRegion); + delete crequest; + coalescedTable.at(address).pop_front(); - // - // For Alpha, properly handle LL, SC, and write requests with respect to - // locked cache blocks. 
- // - // Not valid for Garnet_standalone protocl - // - bool success = true; - if (!m_runningGarnetStandalone) - success = handleLlsc(address, request); - - if (request->m_type == RubyRequestType_Locked_RMW_Read) { - m_controller->blockOnQueue(address, m_mandatory_q_ptr); - } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { - m_controller->unblock(address); + if (coalescedTable.at(address).empty()) { + coalescedTable.erase(address); + } else { + auto nextRequest = coalescedTable.at(address).front(); + issueRequest(nextRequest); } - - hitCallback(request, mach, data, success, - request->issue_time, forwardRequestTime, firstResponseTime, - isRegion); } void @@ -570,26 +436,37 @@ GPUCoalescer::readCallback(Addr address, bool isRegion) { assert(address == makeLineAddress(address)); - assert(m_readRequestTable.count(makeLineAddress(address))); - - DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); - RequestTable::iterator i = m_readRequestTable.find(address); - assert(i != m_readRequestTable.end()); - GPUCoalescerRequest* request = i->second; - - m_readRequestTable.erase(i); - markRemoved(); + assert(coalescedTable.count(address)); + + auto crequest = coalescedTable.at(address).front(); + fatal_if(crequest->getRubyType() != RubyRequestType_LD, + "readCallback received non-read type response\n"); + + // Iterate over the coalesced requests to respond to as many loads as + // possible until another request type is seen. Models MSHR for TCP. + while (crequest->getRubyType() == RubyRequestType_LD) { + hitCallback(crequest, mach, data, true, crequest->getIssueTime(), + forwardRequestTime, firstResponseTime, isRegion); + + delete crequest; + coalescedTable.at(address).pop_front(); + if (coalescedTable.at(address).empty()) { + break; + } - assert((request->m_type == RubyRequestType_LD) || - (request->m_type == RubyRequestType_IFETCH)); + crequest = coalescedTable.at(address).front(); + } - hitCallback(request, mach, data, true, - request->issue_time, forwardRequestTime, firstResponseTime, - isRegion); + if (coalescedTable.at(address).empty()) { + coalescedTable.erase(address); + } else { + auto nextRequest = coalescedTable.at(address).front(); + issueRequest(nextRequest); + } } void -GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, +GPUCoalescer::hitCallback(CoalescedRequest* crequest, MachineType mach, DataBlock& data, bool success, @@ -598,22 +475,15 @@ GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, Cycles firstResponseTime, bool isRegion) { - PacketPtr pkt = srequest->pkt; + PacketPtr pkt = crequest->getFirstPkt(); Addr request_address = pkt->getAddr(); Addr request_line_address = makeLineAddress(request_address); - RubyRequestType type = srequest->m_type; + RubyRequestType type = crequest->getRubyType(); - // Set this cache entry to the most recently used - if (type == RubyRequestType_IFETCH) { - if (m_instCache_ptr->isTagPresent(request_line_address)) - m_instCache_ptr->setMRU(request_line_address); - } else { - if (m_dataCache_ptr->isTagPresent(request_line_address)) - m_dataCache_ptr->setMRU(request_line_address); - } + DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address); - recordMissLatency(srequest, mach, + recordMissLatency(crequest, mach, initialRequestTime, forwardRequestTime, firstResponseTime, @@ -621,13 +491,11 @@ GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, // update the data // // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER - int len = reqCoalescer[request_line_address].size(); - std::vector 
mylist; - for (int i = 0; i < len; ++i) { - PacketPtr pkt = reqCoalescer[request_line_address][i].pkt; - assert(type == reqCoalescer[request_line_address][i].primaryType); + std::vector pktList = crequest->getPackets(); + DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n", + pktList.size(), request_line_address); + for (auto& pkt : pktList) { request_address = pkt->getAddr(); - request_line_address = makeLineAddress(pkt->getAddr()); if (pkt->getPtr()) { if ((type == RubyRequestType_LD) || (type == RubyRequestType_ATOMIC) || @@ -658,36 +526,56 @@ GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, RubyPort::SenderState *requestSenderState = safe_cast(pkt->senderState); RubyTester::SenderState* testerSenderState = - safe_cast(requestSenderState->predecessor); + safe_cast + (requestSenderState->predecessor); testerSenderState->subBlock.mergeFrom(data); } - - mylist.push_back(pkt); } - delete srequest; - reqCoalescer.erase(request_line_address); - assert(!reqCoalescer.count(request_line_address)); - completeHitCallback(mylist, len); + m_outstanding_count--; + assert(m_outstanding_count >= 0); + + completeHitCallback(pktList); } bool GPUCoalescer::empty() const { - return m_writeRequestTable.empty() && m_readRequestTable.empty(); + return coalescedTable.empty(); +} + +RubyRequestType +GPUCoalescer::getRequestType(PacketPtr pkt) +{ + RubyRequestType req_type = RubyRequestType_NULL; + + // These types are not support or not used in GPU caches. + assert(!pkt->req->isLLSC()); + assert(!pkt->req->isLockedRMW()); + assert(!pkt->req->isInstFetch()); + assert(!pkt->isFlush()); + + if (pkt->req->isAtomicReturn()) { + req_type = RubyRequestType_ATOMIC_RETURN; + } else if (pkt->req->isAtomicNoReturn()) { + req_type = RubyRequestType_ATOMIC_NO_RETURN; + } else if (pkt->isRead()) { + req_type = RubyRequestType_LD; + } else if (pkt->isWrite()) { + req_type = RubyRequestType_ST; + } else { + // Acquire and release packets will have been issued by + // makeRequest, so we do not need to check for it here. + panic("Unsupported ruby packet type\n"); + } + + return req_type; } -// Analyzes the packet to see if this request can be coalesced. -// If request can be coalesced, this request is added to the reqCoalescer table -// and makeRequest returns RequestStatus_Issued; -// If this is the first request to a cacheline, request is added to both -// newRequests queue and to the reqCoalescer table; makeRequest -// returns RequestStatus_Issued. -// If there is a pending request to this cacheline and this request -// can't be coalesced, RequestStatus_Aliased is returned and -// the packet needs to be reissued. +// Places an uncoalesced packet in uncoalescedTable. If the packet is a +// special type (MemFence, scoping, etc), it is issued immediately. RequestStatus GPUCoalescer::makeRequest(PacketPtr pkt) { @@ -719,147 +607,37 @@ GPUCoalescer::makeRequest(PacketPtr pkt) } } - // If number of outstanding requests greater than the max allowed, - // return RequestStatus_BufferFull. This logic can be extended to - // support proper backpressure. - if (m_outstanding_count >= m_max_outstanding_requests) { - return RequestStatus_BufferFull; - } - - RubyRequestType primary_type = RubyRequestType_NULL; - RubyRequestType secondary_type = RubyRequestType_NULL; - - if (pkt->isLLSC()) { - // - // Alpha LL/SC instructions need to be handled carefully by the cache - // coherence protocol to ensure they follow the proper semantics. 
In - // particular, by identifying the operations as atomic, the protocol - // should understand that migratory sharing optimizations should not - // be performed (i.e. a load between the LL and SC should not steal - // away exclusive permission). - // - if (pkt->isWrite()) { - primary_type = RubyRequestType_Store_Conditional; - } else { - assert(pkt->isRead()); - primary_type = RubyRequestType_Load_Linked; - } - secondary_type = RubyRequestType_ATOMIC; - } else if (pkt->req->isLockedRMW()) { - // - // x86 locked instructions are translated to store cache coherence - // requests because these requests should always be treated as read - // exclusive operations and should leverage any migratory sharing - // optimization built into the protocol. - // - if (pkt->isWrite()) { - primary_type = RubyRequestType_Locked_RMW_Write; - } else { - assert(pkt->isRead()); - primary_type = RubyRequestType_Locked_RMW_Read; - } - secondary_type = RubyRequestType_ST; - } else if (pkt->isAtomicOp()) { - // - // GPU Atomic Operation - // - primary_type = RubyRequestType_ATOMIC; - secondary_type = RubyRequestType_ATOMIC; - } else { - if (pkt->isRead()) { - if (pkt->req->isInstFetch()) { - primary_type = secondary_type = RubyRequestType_IFETCH; - } else { -#if THE_ISA == X86_ISA - uint32_t flags = pkt->req->getFlags(); - bool storeCheck = flags & - (TheISA::StoreCheck << TheISA::FlagShift); -#else - bool storeCheck = false; -#endif // X86_ISA - if (storeCheck) { - primary_type = RubyRequestType_RMW_Read; - secondary_type = RubyRequestType_ST; - } else { - primary_type = secondary_type = RubyRequestType_LD; - } + if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() && + !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() && + (pkt->req->isRelease() || pkt->req->isAcquire())) { + if (assumingRfOCoherence) { + // If we reached here, this request must be a memFence + // and the protocol implements RfO, the coalescer can + // assume sequentially consistency and schedule the callback + // immediately. + // Currently the code implements fence callbacks + // by reusing the mechanism for kernel completions. + // This should be fixed. + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); } - } else if (pkt->isWrite()) { - // - // Note: M5 packets do not differentiate ST from RMW_Write - // - primary_type = secondary_type = RubyRequestType_ST; - } else if (pkt->isFlush()) { - primary_type = secondary_type = RubyRequestType_FLUSH; - } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { - if (assumingRfOCoherence) { - // If we reached here, this request must be a memFence - // and the protocol implements RfO, the coalescer can - // assume sequentially consistency and schedule the callback - // immediately. - // Currently the code implements fence callbacks - // by reusing the mechanism for kernel completions. - // This should be fixed. - int wf_id = 0; - if (pkt->req->hasContextId()) { - wf_id = pkt->req->contextId(); - } - insertKernel(wf_id, pkt); - newKernelEnds.push_back(wf_id); - if (!issueEvent.scheduled()) { - schedule(issueEvent, curTick()); - } - return RequestStatus_Issued; - } else { - // If not RfO, return issued here and let the child coalescer - // take care of it. 
- return RequestStatus_Issued; + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); } + return RequestStatus_Issued; } else { - panic("Unsupported ruby packet type\n"); + // If not RfO, return issued here and let the child coalescer + // take care of it. + return RequestStatus_Issued; } } - // Check if there is any pending request to this cache line from - // previous cycles. - // If there is a pending request, return aliased. Since coalescing - // across time is not permitted, aliased requests are not coalesced. - // If a request for this address has already been issued, we must block - RequestStatus status = getRequestStatus(pkt, primary_type); - if (status != RequestStatus_Ready) - return status; - - Addr line_addr = makeLineAddress(pkt->getAddr()); - - // Check if this request can be coalesced with previous - // requests from this cycle. - if (!reqCoalescer.count(line_addr)) { - // This is the first access to this cache line. - // A new request to the memory subsystem has to be - // made in the next cycle for this cache line, so - // add this line addr to the "newRequests" queue - newRequests.push_back(line_addr); - - // There was a request to this cache line in this cycle, - // let us see if we can coalesce this request with the previous - // requests from this cycle - } else if (primary_type != - reqCoalescer[line_addr][0].primaryType) { - // can't coalesce loads, stores and atomics! - return RequestStatus_Aliased; - } else if (pkt->req->isLockedRMW() || - reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) { - // can't coalesce locked accesses, but can coalesce atomics! - return RequestStatus_Aliased; - } else if (pkt->req->hasContextId() && pkt->req->isRelease() && - pkt->req->contextId() != - reqCoalescer[line_addr][0].pkt->req->contextId()) { - // can't coalesce releases from different wavefronts - return RequestStatus_Aliased; - } + uncoalescedTable.insertPacket(pkt); + DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr()); - // in addition to the packet, we need to save both request types - reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type); if (!issueEvent.scheduled()) schedule(issueEvent, curTick()); // TODO: issue hardware prefetches here @@ -867,8 +645,9 @@ GPUCoalescer::makeRequest(PacketPtr pkt) } void -GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) +GPUCoalescer::issueRequest(CoalescedRequest* crequest) { + PacketPtr pkt = crequest->getFirstPkt(); int proc_id = -1; if (pkt != NULL && pkt->req->hasContextId()) { @@ -901,9 +680,9 @@ GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) uint32_t blockSize = RubySystem::getBlockSizeBytes(); std::vector accessMask(blockSize,false); std::vector< std::pair > atomicOps; - uint32_t tableSize = reqCoalescer[line_addr].size(); + uint32_t tableSize = crequest->getPackets().size(); for (int i = 0; i < tableSize; i++) { - PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt; + PacketPtr tmpPkt = crequest->getPackets()[i]; uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; uint32_t tmpSize = tmpPkt->getSize(); if (tmpPkt->isAtomicOp()) { @@ -922,7 +701,7 @@ GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) if (pkt->isAtomicOp()) { msg = std::make_shared(clockEdge(), pkt->getAddr(), pkt->getPtr(), - pkt->getSize(), pc, secondary_type, + pkt->getSize(), pc, crequest->getRubyType(), RubyAccessMode_Supervisor, pkt, PrefetchBit_No, proc_id, 100, blockSize, 
accessMask, @@ -931,7 +710,7 @@ GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) } else { msg = std::make_shared(clockEdge(), pkt->getAddr(), pkt->getPtr(), - pkt->getSize(), pc, secondary_type, + pkt->getSize(), pc, crequest->getRubyType(), RubyAccessMode_Supervisor, pkt, PrefetchBit_No, proc_id, 100, blockSize, accessMask, @@ -941,15 +720,21 @@ GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", curTick(), m_version, "Coal", "Begin", "", "", printAddress(msg->getPhysicalAddress()), - RubyRequestType_to_string(secondary_type)); + RubyRequestType_to_string(crequest->getRubyType())); - fatal_if(secondary_type == RubyRequestType_IFETCH, + fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH, "there should not be any I-Fetch requests in the GPU Coalescer"); Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(secondary_type)); + m_controller->mandatoryQueueLatency(crequest->getRubyType())); assert(latency > 0); + if (!deadlockCheckEvent.scheduled()) { + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } + assert(m_mandatory_q_ptr); m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); } @@ -971,8 +756,6 @@ GPUCoalescer::print(ostream& out) const { out << "[GPUCoalescer: " << m_version << ", outstanding requests: " << m_outstanding_count - << ", read request table: " << m_readRequestTable - << ", write request table: " << m_writeRequestTable << "]"; } @@ -983,40 +766,96 @@ GPUCoalescer::recordRequestType(SequencerRequestType requestType) { SequencerRequestType_to_string(requestType)); } +bool +GPUCoalescer::coalescePacket(PacketPtr pkt) +{ + uint64_t seqNum = pkt->req->getReqInstSeqNum(); + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // If the packet has the same line address as a request already in the + // coalescedTable and has the same sequence number, it can be coalesced. + if (coalescedTable.count(line_addr)) { + // Search for a previous coalesced request with the same seqNum. + auto& creqQueue = coalescedTable.at(line_addr); + auto citer = std::find_if(creqQueue.begin(), creqQueue.end(), + [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; } + ); + if (citer != creqQueue.end()) { + (*citer)->insertPacket(pkt); + return true; + } + } + + if (m_outstanding_count < m_max_outstanding_requests) { + // This is an "aliased" or new request. Create a RubyRequest and + // append it to the list of "targets" in the coalescing table. + DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n", + line_addr); + + CoalescedRequest *creq = new CoalescedRequest(seqNum); + creq->insertPacket(pkt); + creq->setRubyType(getRequestType(pkt)); + creq->setIssueTime(curCycle()); + + if (!coalescedTable.count(line_addr)) { + // If there is no outstanding request for this line address, + // create a new coalecsed request and issue it immediately. + auto reqList = std::deque { creq }; + coalescedTable.insert(std::make_pair(line_addr, reqList)); + + DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n", + RubyRequestType_to_string(creq->getRubyType()), seqNum); + issueRequest(creq); + } else { + // The request is for a line address that is already outstanding + // but for a different instruction. Add it as a new request to be + // issued when the current outstanding request is completed. 
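In short, coalescePacket performs a two-key lookup: first by line address in coalescedTable, then by instruction sequence number within that entry's list of targets. The following self-contained sketch models just that decision; ToyCoalescedRequest and tryJoin are hypothetical toy names, not the real gem5 classes:

    #include <cstdint>
    #include <deque>
    #include <map>
    #include <vector>

    using Addr = uint64_t;

    struct ToyCoalescedRequest
    {
        uint64_t seqNum;                 // issuing instruction
        std::vector<int> pkts;           // stand-in for the packet list
    };

    // True if pkt joined an existing request (same line address *and* same
    // instruction); false means a new target entry must be allocated.
    bool tryJoin(std::map<Addr, std::deque<ToyCoalescedRequest>>& table,
                 Addr lineAddr, uint64_t seqNum, int pkt)
    {
        auto it = table.find(lineAddr);
        if (it == table.end())
            return false;                // no outstanding request on the line
        for (auto& cr : it->second) {
            if (cr.seqNum == seqNum) {   // same instruction: coalesce
                cr.pkts.push_back(pkt);
                return true;
            }
        }
        return false;                    // aliased: becomes a queued target
    }

A false return corresponds to the path above: allocate a new CoalescedRequest, issue it immediately if the line was idle, or queue it as an aliased target otherwise.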
+ coalescedTable.at(line_addr).push_back(creq); + DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n", + line_addr, seqNum); + } + + // In both cases, requests are added to the coalescing table and will + // be counted as outstanding requests. + m_outstanding_count++; + + return true; + } + + // The maximum number of outstanding requests have been issued. + return false; +} void GPUCoalescer::completeIssue() { - // newRequests has the cacheline addresses of all the - // requests which need to be issued to the memory subsystem - // in this cycle - int len = newRequests.size(); - DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); - for (int i = 0; i < len; ++i) { - // Get the requests from reqCoalescer table. Get only the - // first request for each cacheline, the remaining requests - // can be coalesced with the first request. So, only - // one request is issued per cacheline. - RequestDesc info = reqCoalescer[newRequests[i]][0]; - PacketPtr pkt = info.pkt; - DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", - i, pkt->req->getPaddr()); - // Insert this request to the read/writeRequestTables. These tables - // are used to track aliased requests in makeRequest subroutine - bool found = insertRequest(pkt, info.primaryType); - - if (found) { - panic("GPUCoalescer::makeRequest should never be called if the " - "request is already outstanding\n"); + // Iterate over the maximum number of instructions we can coalesce + // per cycle (coalescingWindow). + for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) { + PerInstPackets *pktList = + uncoalescedTable.getInstPackets(instIdx); + + // getInstPackets will return nullptr if no instruction + // exists at the current offset. + if (!pktList) { + break; + } else { + // Since we have a pointer to the list of packets in the inst, + // erase them from the list if coalescing is successful and + // leave them in the list otherwise. This aggressively attempts + // to coalesce as many packets as possible from the current inst. + pktList->remove_if( + [&](PacketPtr pkt) { return coalescePacket(pkt); } + ); } - - // Issue request to ruby subsystem - issueRequest(pkt, info.secondaryType); } - newRequests.clear(); + + // Clean up any instructions in the uncoalesced table that have had + // all of their packets coalesced and return a token for that column. 
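To make the two-table lifecycle concrete, here is a toy walk-through (hypothetical addresses, 64-byte lines, and a token pool of four; simplified containers rather than the real tables) following one vector instruction through insertion, coalescing, and token return:

    #include <cassert>
    #include <cstdint>
    #include <list>
    #include <map>

    int main()
    {
        const uint64_t lineMask = ~uint64_t(63);              // 64B cache lines
        int tokens = 4;                                       // max_cu_tokens
        std::map<uint64_t, std::list<uint64_t>> uncoalesced;  // seqNum -> addrs
        std::map<uint64_t, int> coalesced;                    // line -> pkt count

        // One instruction (seqNum 7), four lanes, two distinct lines.
        --tokens;                                             // acquired at issue
        for (uint64_t a : {0x1000ull, 0x1008ull, 0x1040ull, 0x1048ull})
            uncoalesced[7].push_back(a);

        // completeIssue(): drain the instruction's packets into line entries.
        for (uint64_t a : uncoalesced[7])
            ++coalesced[a & lineMask];
        uncoalesced[7].clear();

        // updateResources(): empty list -> erase it and return the token.
        uncoalesced.erase(7);
        ++tokens;

        assert(coalesced.size() == 2 && coalesced[0x1000] == 2 && tokens == 4);
    }

Four lane packets under one sequence number collapse into two line-granular entries, and the instruction's token comes back only once its packet list has fully drained.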
+ uncoalescedTable.updateResources(); // have Kernel End releases been issued this cycle - len = newKernelEnds.size(); + int len = newKernelEnds.size(); for (int i = 0; i < len; i++) { kernelCallback(newKernelEnds[i]); } @@ -1045,71 +884,27 @@ GPUCoalescer::atomicCallback(Addr address, const DataBlock& data) { assert(address == makeLineAddress(address)); + assert(coalescedTable.count(address)); - DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); - assert(m_writeRequestTable.count(makeLineAddress(address))); - - RequestTable::iterator i = m_writeRequestTable.find(address); - assert(i != m_writeRequestTable.end()); - GPUCoalescerRequest* srequest = i->second; - - m_writeRequestTable.erase(i); - markRemoved(); + auto crequest = coalescedTable.at(address).front(); - assert((srequest->m_type == RubyRequestType_ATOMIC) || - (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || - (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); + fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC && + crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN && + crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN), + "atomicCallback saw non-atomic type response\n"); + hitCallback(crequest, mach, (DataBlock&)data, true, + crequest->getIssueTime(), Cycles(0), Cycles(0), false); - // Atomics don't write to cache, so there is no MRU update... + delete crequest; + coalescedTable.at(address).pop_front(); - recordMissLatency(srequest, mach, - srequest->issue_time, Cycles(0), Cycles(0), true, false); - - PacketPtr pkt = srequest->pkt; - Addr request_address = pkt->getAddr(); - Addr request_line_address = makeLineAddress(pkt->getAddr()); - - int len = reqCoalescer[request_line_address].size(); - std::vector mylist; - for (int i = 0; i < len; ++i) { - PacketPtr pkt = reqCoalescer[request_line_address][i].pkt; - assert(srequest->m_type == - reqCoalescer[request_line_address][i].primaryType); - request_address = (pkt->getAddr()); - request_line_address = makeLineAddress(request_address); - if (pkt->getPtr() && - srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { - /* atomics are done in memory, and return the data *before* the atomic op... */ - pkt->setData( - data.getData(getOffset(request_address), pkt->getSize())); - } else { - DPRINTF(MemoryAccess, - "WARNING. Data not transfered from Ruby to M5 for type " \ - "%s\n", - RubyRequestType_to_string(srequest->m_type)); - } - - // If using the RubyTester, update the RubyTester sender state's - // subBlock with the recieved data. The tester will later access - // this state. - // Note: RubyPort will access it's sender state before the - // RubyTester. 
- if (m_usingRubyTester) { - RubyPort::SenderState *requestSenderState = - safe_cast(pkt->senderState); - RubyTester::SenderState* testerSenderState = - safe_cast(requestSenderState->predecessor); - testerSenderState->subBlock.mergeFrom(data); - } - - mylist.push_back(pkt); + if (coalescedTable.at(address).empty()) { + coalescedTable.erase(address); + } else { + auto nextRequest = coalescedTable.at(address).front(); + issueRequest(nextRequest); } - delete srequest; - reqCoalescer.erase(request_line_address); - assert(!reqCoalescer.count(request_line_address)); - - completeHitCallback(mylist, len); } void @@ -1141,42 +936,42 @@ GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) } void -GPUCoalescer::completeHitCallback(std::vector & mylist, int len) +GPUCoalescer::completeHitCallback(std::vector & mylist) { - for (int i = 0; i < len; ++i) { + for (auto& pkt : mylist) { RubyPort::SenderState *ss = - safe_cast(mylist[i]->senderState); + safe_cast(pkt->senderState); MemSlavePort *port = ss->port; assert(port != NULL); - mylist[i]->senderState = ss->predecessor; + pkt->senderState = ss->predecessor; delete ss; - port->hitCallback(mylist[i]); + port->hitCallback(pkt); trySendRetries(); } - testDrainComplete(); -} + // We schedule an event in the same tick as hitCallback (similar to + // makeRequest) rather than calling completeIssue directly to reduce + // function calls to complete issue. This can only happen if the max + // outstanding requests is less than the number of slots in the + // uncoalesced table and makeRequest is not called again. + if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } -PacketPtr -GPUCoalescer::mapAddrToPkt(Addr address) -{ - RequestTable::iterator i = m_readRequestTable.find(address); - assert(i != m_readRequestTable.end()); - GPUCoalescerRequest* request = i->second; - return request->pkt; + testDrainComplete(); } void -GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, +GPUCoalescer::recordMissLatency(CoalescedRequest* crequest, MachineType mach, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool success, bool isRegion) { - RubyRequestType type = srequest->m_type; - Cycles issued_time = srequest->issue_time; + RubyRequestType type = crequest->getRubyType(); + Cycles issued_time = crequest->getIssueTime(); Cycles completion_time = curCycle(); assert(completion_time >= issued_time); Cycles total_lat = completion_time - issued_time; @@ -1242,7 +1037,7 @@ GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", curTick(), m_version, "Coal", success ? 
"Done" : "SC_Failed", "", "", - printAddress(srequest->pkt->getAddr()), total_lat); + printAddress(crequest->getFirstPkt()->getAddr()), total_lat); } void diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 3230ef1ee..56a207906 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -48,6 +48,7 @@ #include "mem/ruby/protocol/RubyRequestType.hh" #include "mem/ruby/protocol/SequencerRequestType.hh" #include "mem/ruby/system/Sequencer.hh" +#include "mem/token_port.hh" class DataBlock; class CacheMsg; @@ -59,47 +60,99 @@ class RubyGPUCoalescerParams; HSAScope reqScopeToHSAScope(const RequestPtr &req); HSASegment reqSegmentToHSASegment(const RequestPtr &req); -struct GPUCoalescerRequest -{ - PacketPtr pkt; - RubyRequestType m_type; - Cycles issue_time; +// List of packets that belongs to a specific instruction. +typedef std::list PerInstPackets; - GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type, - Cycles _issue_time) - : pkt(_pkt), m_type(_m_type), issue_time(_issue_time) - {} +class UncoalescedTable +{ + public: + UncoalescedTable(GPUCoalescer *gc); + ~UncoalescedTable() {} + + void insertPacket(PacketPtr pkt); + bool packetAvailable(); + void printRequestTable(std::stringstream& ss); + + // Returns a pointer to the list of packets corresponding to an + // instruction in the instruction map or nullptr if there are no + // instructions at the offset. + PerInstPackets* getInstPackets(int offset); + void updateResources(); + + // Check if a packet hasn't been removed from instMap in too long. + // Panics if a deadlock is detected and returns nothing otherwise. + void checkDeadlock(Tick threshold); + + private: + GPUCoalescer *coalescer; + + // Maps an instructions unique sequence number to a queue of packets + // which need responses. This data structure assumes the sequence number + // is monotonically increasing (which is true for CU class) in order to + // issue packets in age order. 
+ std::map instMap; }; -class RequestDesc +class CoalescedRequest { public: - RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type) - : pkt(pkt), primaryType(p_type), secondaryType(s_type) - { - } - - RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL), - secondaryType(RubyRequestType_NULL) - { - } - - PacketPtr pkt; - RubyRequestType primaryType; - RubyRequestType secondaryType; + CoalescedRequest(uint64_t _seqNum) + : seqNum(_seqNum), issueTime(Cycles(0)), + rubyType(RubyRequestType_NULL) + {} + ~CoalescedRequest() {} + + void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); } + void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; } + void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; } + void setRubyType(RubyRequestType type) { rubyType = type; } + + uint64_t getSeqNum() const { return seqNum; } + PacketPtr getFirstPkt() const { return pkts[0]; } + Cycles getIssueTime() const { return issueTime; } + RubyRequestType getRubyType() const { return rubyType; } + std::vector& getPackets() { return pkts; } + + private: + uint64_t seqNum; + Cycles issueTime; + RubyRequestType rubyType; + std::vector pkts; }; -std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj); - class GPUCoalescer : public RubyPort { public: + class GMTokenPort : public TokenSlavePort + { + public: + GMTokenPort(const std::string& name, ClockedObject *owner, + PortID id = InvalidPortID) + : TokenSlavePort(name, owner, id) + { } + ~GMTokenPort() { } + + protected: + Tick recvAtomic(PacketPtr) { return Tick(0); } + void recvFunctional(PacketPtr) { } + bool recvTimingReq(PacketPtr) { return false; } + AddrRangeList getAddrRanges() const + { + AddrRangeList ranges; + return ranges; + } + }; + typedef RubyGPUCoalescerParams Params; GPUCoalescer(const Params *); ~GPUCoalescer(); + Port &getPort(const std::string &if_name, + PortID idx = InvalidPortID) override; + // Public Methods void wakeup(); // Used only for deadlock detection + void printRequestTable(std::stringstream& ss); void printProgress(std::ostream& out) const; void resetStats() override; @@ -177,13 +230,13 @@ class GPUCoalescer : public RubyPort void print(std::ostream& out) const; - void markRemoved(); - void removeRequest(GPUCoalescerRequest* request); void evictionCallback(Addr address); void completeIssue(); void insertKernel(int wavefront_id, PacketPtr pkt); + GMTokenPort& getGMTokenPort() { return gmTokenPort; } + void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } @@ -224,11 +277,11 @@ class GPUCoalescer : public RubyPort Addr pc, RubyAccessMode access_mode, int size, DataBlock*& data_ptr); // Alternate implementations in VIPER Coalescer - virtual void issueRequest(PacketPtr pkt, RubyRequestType type); + virtual void issueRequest(CoalescedRequest* crequest); void kernelCallback(int wavfront_id); - void hitCallback(GPUCoalescerRequest* request, + void hitCallback(CoalescedRequest* crequest, MachineType mach, DataBlock& data, bool success, @@ -236,21 +289,23 @@ class GPUCoalescer : public RubyPort Cycles forwardRequestTime, Cycles firstResponseTime, bool isRegion); - void recordMissLatency(GPUCoalescerRequest* request, + void recordMissLatency(CoalescedRequest* crequest, MachineType mach, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool success, bool isRegion); - void completeHitCallback(std::vector & mylist, int len); - PacketPtr mapAddrToPkt(Addr address); + void 
completeHitCallback(std::vector<PacketPtr>& mylist);
 
-    RequestStatus getRequestStatus(PacketPtr pkt,
-                                   RubyRequestType request_type);
-    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
+    virtual RubyRequestType getRequestType(PacketPtr pkt);
 
-    bool handleLlsc(Addr address, GPUCoalescerRequest* request);
+    // Attempt to remove a packet from the uncoalescedTable and coalesce
+    // with a previous request from the same instruction. If there is no
+    // previous instruction and the max number of outstanding requests has
+    // not been reached, a new coalesced request is created and added to the
+    // "target" list of the coalescedTable.
+    bool coalescePacket(PacketPtr pkt);
 
     EventFunctionWrapper issueEvent;
 
 
 // Changed to protected to enable inheritance by VIPER Coalescer
   protected:
     int m_max_outstanding_requests;
-    int m_deadlock_threshold;
+    Cycles m_deadlock_threshold;
 
     CacheMemory* m_dataCache_ptr;
     CacheMemory* m_instCache_ptr;
 
-    // We need to track both the primary and secondary request types.
-    // The secondary request type comprises a subset of RubyRequestTypes that
-    // are understood by the L1 Controller. A primary request type can be any
-    // RubyRequestType.
-    typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable;
-    CoalescingTable reqCoalescer;
-    std::vector<Addr> newRequests;
-
-    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
-    RequestTable m_writeRequestTable;
-    RequestTable m_readRequestTable;
+    // coalescingWindow is the maximum number of instructions that are
+    // allowed to be coalesced in a single cycle.
+    int coalescingWindow;
+
+    // The uncoalescedTable contains several "columns" which hold memory
+    // request packets for an instruction. The maximum size is the number of
+    // columns * the wavefront size.
+    UncoalescedTable uncoalescedTable;
+
+    // An MSHR-like struct for holding coalesced requests. The requests in
+    // this table may or may not be outstanding in the memory hierarchy. The
+    // maximum size is equal to the maximum outstanding requests for a CU
+    // (typically the number of blocks in TCP). If there are duplicates of
+    // an address, they are serviced in age order.
+    std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+
     // Global outstanding request count, across all request tables
     int m_outstanding_count;
     bool m_deadlock_check_scheduled;
@@ -334,7 +394,12 @@ class GPUCoalescer : public RubyPort
     std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
     std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
 
-private:
+  private:
+    // Token port used to send/receive tokens to/from the GPU's global
+    // memory pipeline across the port boundary. There is a single token
+    // port for all of the CU's data ports.
+    GMTokenPort gmTokenPort;
+
     // Private copy constructor and assignment operator
     GPUCoalescer(const GPUCoalescer& obj);
     GPUCoalescer& operator=(const GPUCoalescer& obj);
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py
index c02fb7554..0335981c0 100644
--- a/src/mem/ruby/system/GPUCoalescer.py
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -42,6 +42,8 @@ class RubyGPUCoalescer(RubyPort):
    # max_outstanding_requests = (wave front slots) x (wave front size)
    max_outstanding_requests = Param.Int(40*64, "max requests (incl.
prefetches) outstanding") + max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \ + "coalesced in a single cycle") assume_rfo = Param.Bool(True, "assume protocol implementes Read for " "Ownership coherence"); diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index feb13c555..d8977ac85 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -76,15 +76,8 @@ VIPERCoalescer::~VIPERCoalescer() { } -// Analyzes the packet to see if this request can be coalesced. -// If request can be coalesced, this request is added to the reqCoalescer table -// and makeRequest returns RequestStatus_Issued; -// If this is the first request to a cacheline, request is added to both -// newRequests queue and to the reqCoalescer table; makeRequest -// returns RequestStatus_Issued. -// If there is a pending request to this cacheline and this request -// can't be coalesced, RequestStatus_Aliased is returned and -// the packet needs to be reissued. +// Places an uncoalesced packet in uncoalescedTable. If the packet is a +// special type (MemFence, scoping, etc), it is issued immediately. RequestStatus VIPERCoalescer::makeRequest(PacketPtr pkt) { @@ -109,7 +102,6 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) return RequestStatus_Issued; } -// return RequestStatus_Aliased; } else if (pkt->req->isKernel() && pkt->req->isRelease()) { // Flush Dirty Data on Kernel End // isKernel + isRelease @@ -123,13 +115,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) } return RequestStatus_Issued; } - RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt); - if (requestStatus!=RequestStatus_Issued) { - // Request not isssued - // enqueue Retry - DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n"); - return requestStatus; - } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + + GPUCoalescer::makeRequest(pkt); + + if (pkt->req->isKernel() && pkt->req->isAcquire()) { // Invalidate clean Data on Kernel Begin // isKernel + isAcquire invL1();
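Taken together, the new request path has a simple shape: fence-like packets are issued immediately, and every other packet lands in the uncoalesced table, which the upstream token gate guarantees has room. A minimal sketch under simplified stand-in types (not the actual gem5 classes):

    #include <cstdint>
    #include <deque>

    using Packet = uint64_t;               // stand-in for PacketPtr
    enum class Status { Issued };          // stand-in for RequestStatus

    std::deque<Packet> uncoalescedTable;   // sized by the token count

    void issueImmediately(Packet) {}       // fence / kernel-end fast path
    void scheduleIssueEvent() {}           // completeIssue() coalesces later

    Status makeRequest(Packet pkt, bool isFenceLike)
    {
        if (isFenceLike) {
            issueImmediately(pkt);
            return Status::Issued;
        }
        // The CU acquired a token before sending, so insertion always
        // succeeds and an 'Aliased' status can no longer arise.
        uncoalescedTable.push_back(pkt);
        scheduleIssueEvent();
        return Status::Issued;
    }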