From: Tony Gutierrez Date: Tue, 1 May 2018 20:59:35 +0000 (-0400) Subject: gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model X-Git-Tag: v20.1.0.0~589 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b8da9abba7b7ec710a749a893ed698fc41f2edcf;p=gem5.git gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912 Reviewed-by: Anthony Gutierrez Reviewed-by: Jason Lowe-Power Maintainer: Anthony Gutierrez Tested-by: kokoro --- diff --git a/build_opts/GCN3_X86 b/build_opts/GCN3_X86 new file mode 100644 index 000000000..21e3cf0e4 --- /dev/null +++ b/build_opts/GCN3_X86 @@ -0,0 +1,5 @@ +PROTOCOL = 'GPU_VIPER' +TARGET_ISA = 'x86' +TARGET_GPU_ISA = 'gcn3' +BUILD_GPU = True +CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU' diff --git a/configs/common/GPUTLBConfig.py b/configs/common/GPUTLBConfig.py index ced8aa198..8e2b1e46e 100644 --- a/configs/common/GPUTLBConfig.py +++ b/configs/common/GPUTLBConfig.py @@ -48,7 +48,7 @@ def TLB_constructor(level): maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\ accessDistance = options.L%(level)dAccessDistanceStat,\ clk_domain = SrcClockDomain(\ - clock = options.GPUClock,\ + clock = options.gpu_clock,\ voltage_domain = VoltageDomain(\ voltage = options.gpu_voltage)))" % locals() return constructor_call @@ -60,23 +60,22 @@ def Coalescer_constructor(level): coalescingWindow = options.L%(level)dCoalescingWindow,\ disableCoalescing = options.L%(level)dDisableCoalescing,\ clk_domain = SrcClockDomain(\ - clock = options.GPUClock,\ + clock = options.gpu_clock,\ voltage_domain = VoltageDomain(\ voltage = options.gpu_voltage)))" % locals() return constructor_call -def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name): - # arguments: options, TLB level, number of private structures for this Level, - # TLB name and Coalescer name +def create_TLB_Coalescer(options, my_level, my_index, tlb_name, + coalescer_name): + # arguments: options, TLB level, number of private structures for this + # Level, TLB name and Coalescer name for i in range(my_index): - TLB_name.append(eval(TLB_constructor(my_level))) - Coalescer_name.append(eval(Coalescer_constructor(my_level))) + tlb_name.append(eval(TLB_constructor(my_level))) + coalescer_name.append(eval(Coalescer_constructor(my_level))) def config_tlb_hierarchy(options, system, shader_idx): - n_cu = options.num_compute_units - # Make this configurable now, instead of the hard coded val. The dispatcher - # is always the last item in the system.cpu list. 
- dispatcher_idx = len(system.cpu) - 1 + n_cu = options.cu_per_sa * options.sa_per_complex * \ + options.num_gpu_complexes if options.TLB_config == "perLane": num_TLBs = 64 * n_cu @@ -90,21 +89,26 @@ def config_tlb_hierarchy(options, system, shader_idx): print("Bad option for TLB Configuration.") sys.exit(1) - #---------------------------------------------------------------------------------------- + #------------------------------------------------------------------------- # A visual representation of the TLB hierarchy # for ease of configuration - # < Modify here the width and the number of levels if you want a different configuration > - # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level - L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []}, - {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}, - {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}] + # < Modify here the width and the number of levels if you want a different + # configuration > + # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) + # for this level + L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], + 'CoalescerArray': []}, + {'name': 'scalar', 'width' : options.num_scalar_cache, + 'TLBarray': [], 'CoalescerArray': []}, + {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], + 'CoalescerArray': []}] L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}] L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}] TLB_hierarchy = [L1, L2, L3] - #---------------------------------------------------------------------------------------- + #------------------------------------------------------------------------- # Create the hiearchy # Call the appropriate constructors and add objects to the system @@ -164,17 +168,14 @@ def config_tlb_hierarchy(options, system, shader_idx): for tlb in range(tlb_per_cu): exec('system.cpu[%d].CUs[%d].translation_port[%d] = \ system.l1_coalescer[%d].slave[%d]' % \ - (shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0)) + (shader_idx, cu_idx, tlb, + cu_idx*tlb_per_cu+tlb, 0)) else: exec('system.cpu[%d].CUs[%d].translation_port[%d] = \ system.l1_coalescer[%d].slave[%d]' % \ - (shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs))) - - elif name == 'dispatcher': # Dispatcher TLB - for index in range(TLB_type['width']): - exec('system.cpu[%d].translation_port = \ - system.dispatcher_coalescer[%d].slave[0]' % \ - (dispatcher_idx, index)) + (shader_idx, cu_idx, tlb_per_cu, + cu_idx / (n_cu / num_TLBs), + cu_idx % (n_cu / num_TLBs))) elif name == 'sqc': # I-TLB for index in range(n_cu): sqc_tlb_index = index / options.cu_per_sqc @@ -182,7 +183,14 @@ def config_tlb_hierarchy(options, system, shader_idx): exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \ system.sqc_coalescer[%d].slave[%d]' % \ (shader_idx, index, sqc_tlb_index, sqc_tlb_port_id)) - + elif name == 'scalar': # Scalar D-TLB + for index in range(n_cu): + scalar_tlb_index = index / options.cu_per_scalar_cache + scalar_tlb_port_id = index % options.cu_per_scalar_cache + exec('system.cpu[%d].CUs[%d].scalar_tlb_port = \ + system.scalar_coalescer[%d].slave[%d]' % \ + (shader_idx, index, scalar_tlb_index, + scalar_tlb_port_id)) # Connect the memSidePorts (masters) of all the TLBs with the # cpuSidePorts (slaves) of the Coalescers of the next level diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 
88095e7ad..7578694b6 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -3728,7 +3728,7 @@ namespace Gcn3ISA DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", wf->computeUnit->cu_id, wf->wgId, refCount); - wf->computeUnit->registerManager.freeRegisters(wf); + wf->computeUnit->registerManager->freeRegisters(wf); wf->computeUnit->completedWfs++; wf->computeUnit->activeWaves--; diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 8bb49c0b7..3197dc078 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -192,7 +192,7 @@ namespace Gcn3ISA */ bool misaligned_acc = split_addr > vaddr; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -208,7 +208,6 @@ namespace Gcn3ISA pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - delete req; } else { gpuDynInst->numScalarReqs = 1; gpuDynInst->setRequestFlags(req); @@ -243,7 +242,7 @@ namespace Gcn3ISA */ bool misaligned_acc = split_addr > vaddr; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -259,7 +258,6 @@ namespace Gcn3ISA pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - delete req; } else { gpuDynInst->numScalarReqs = 1; gpuDynInst->setRequestFlags(req); @@ -574,7 +572,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -600,7 +599,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -619,7 +619,7 @@ namespace Gcn3ISA { // create request and set flags gpuDynInst->statusBitVector = VectorMask(1); - Request *req = new Request(0, 0, 0, 0, + RequestPtr req = std::make_shared(0, 0, 0, gpuDynInst->computeUnit()-> masterId(), 0, gpuDynInst->wfDynId); @@ -777,7 +777,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -802,7 +803,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, + 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -826,7 +828,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -851,7 +854,8 @@ namespace Gcn3ISA if 
(gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, + 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -875,7 +879,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId, gpuDynInst->makeAtomicOpFunctor( diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index ac340f19b..218faf8cc 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -153,7 +153,7 @@ namespace Gcn3ISA ComputeUnit *cu = _gpuDynInst->computeUnit(); for (auto i = 0; i < NumDwords; ++i) { - int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx + i); + int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i); vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx); DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx); @@ -207,7 +207,7 @@ namespace Gcn3ISA ? _gpuDynInst->exec_mask : wf->execMask(); if (NumDwords == 1) { - int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx); + int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx); vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx); assert(vrfData[0]); auto reg_file_vgpr = vrfData[0]->template as(); @@ -223,8 +223,8 @@ namespace Gcn3ISA DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx); cu->vrf[wf->simdId]->printReg(wf, vgprIdx); } else if (NumDwords == 2) { - int vgprIdx0 = cu->registerManager.mapVgpr(wf, _opIdx); - int vgprIdx1 = cu->registerManager.mapVgpr(wf, _opIdx + 1); + int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx); + int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1); vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0); vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1); assert(vrfData[0]); @@ -605,16 +605,16 @@ namespace Gcn3ISA if (_opIdx == REG_VCC_LO) { sgprIdx = cu->registerManager - .mapSgpr(wf, wf->reservedScalarRegs - 2 + dword); + ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword); } else if (_opIdx == REG_FLAT_SCRATCH_HI) { sgprIdx = cu->registerManager - .mapSgpr(wf, wf->reservedScalarRegs - 3 + dword); + ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword); } else if (_opIdx == REG_FLAT_SCRATCH_LO) { assert(NumDwords == 1); sgprIdx = cu->registerManager - .mapSgpr(wf, wf->reservedScalarRegs - 4 + dword); + ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword); } else { - sgprIdx = cu->registerManager.mapSgpr(wf, _opIdx + dword); + sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword); } assert(sgprIdx > -1); diff --git a/src/dev/hsa/hsa_device.cc b/src/dev/hsa/hsa_device.cc index 78ec8e8b4..094623dd8 100644 --- a/src/dev/hsa/hsa_device.cc +++ b/src/dev/hsa/hsa_device.cc @@ -101,7 +101,7 @@ HSADevice::translateOrDie(Addr vaddr, Addr &paddr) * with new extensions, it will likely be wrong to just arbitrarily * grab context zero. 
*/ - auto process = sys->getThreadContext(0)->getProcessPtr(); + auto process = sys->threads[0]->getProcessPtr(); if (!process->pTable->translate(vaddr, paddr)) { fatal("failed translation: vaddr 0x%x\n", vaddr); diff --git a/src/dev/hsa/hsa_driver.cc b/src/dev/hsa/hsa_driver.cc index 3f5c8eb0a..459043d93 100644 --- a/src/dev/hsa/hsa_driver.cc +++ b/src/dev/hsa/hsa_driver.cc @@ -92,3 +92,28 @@ HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot, DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start); return start; } + +/** + * Forward relevant parameters to packet processor; queueID + * is used to link doorbell. The queueIDs are not re-used + * in current implementation, and we allocate only one page + * (4096 bytes) for doorbells, so check if this queue ID can + * be mapped into that page. + */ +void +HSADriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf) +{ + TypedBufferArg args(ioc_buf); + args.copyIn(mem_proxy); + + if (queueId >= 0x1000) { + fatal("%s: Exceeded maximum number of HSA queues allowed\n", name()); + } + + args->queue_id = queueId++; + auto &hsa_pp = device->hsaPacketProc(); + hsa_pp.setDeviceQueueDesc(args->read_pointer_address, + args->ring_base_address, args->queue_id, + args->ring_size); + args.copyOut(mem_proxy); +} diff --git a/src/dev/hsa/hsa_driver.hh b/src/dev/hsa/hsa_driver.hh index b3c7ee2af..abf79abfc 100644 --- a/src/dev/hsa/hsa_driver.hh +++ b/src/dev/hsa/hsa_driver.hh @@ -56,7 +56,7 @@ struct HSADriverParams; class HSADevice; -class SETranslatingPortProxy; +class PortProxy; class ThreadContext; class HSADriver : public EmulatedDriver @@ -74,8 +74,7 @@ class HSADriver : public EmulatedDriver HSADevice *device; uint32_t queueId; - void allocateQueue(const SETranslatingPortProxy &mem_proxy, - Addr ioc_buf_addr); + void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf); }; #endif // __DEV_HSA_HSA_DRIVER_HH__ diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index bd050163b..f9880e40e 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -151,7 +151,7 @@ HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr) // Grab the process and try to translate the virtual address with it; with // new extensions, it will likely be wrong to just arbitrarily grab context // zero. - auto process = sys->getThreadContext(0)->getProcessPtr(); + auto process = sys->threads[0]->getProcessPtr(); if (!process->pTable->translate(vaddr, paddr)) fatal("failed translation: vaddr 0x%x\n", vaddr); @@ -393,7 +393,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr) * The reason for this is that the DMASequencer does * not support atomic operations. 
*/ - auto tc = sys->getThreadContext(0); + auto tc = sys->threads[0]; auto &virt_proxy = tc->getVirtProxy(); TypedBufferArg prev_signal(signal_addr); prev_signal.copyIn(virt_proxy); diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc index 57cf6d1b1..8523be9cc 100644 --- a/src/dev/hsa/hw_scheduler.cc +++ b/src/dev/hsa/hw_scheduler.cc @@ -92,7 +92,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer, // We use the same mapping function used by hsa runtime to do this mapping // // Originally - // #define VOID_PTR_ADD32(ptr,n) \ + // #define VOID_PTR_ADD32(ptr,n) // (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/ // (Addr)VOID_PTR_ADD32(0, queue_id) Addr db_offset = queue_id; @@ -343,7 +343,7 @@ HWScheduler::unregisterQueue(uint64_t queue_id) // `(Addr)(VOID_PRT_ADD32(0, queue_id))` // // Originally - // #define VOID_PTR_ADD32(ptr,n) \ + // #define VOID_PTR_ADD32(ptr,n) // (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/ // (Addr)VOID_PTR_ADD32(0, queue_id) Addr db_offset = queue_id; diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 7eaf65fec..6b033f403 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -1,48 +1,48 @@ +# Copyright (c) 2015-2018 Advanced Micro Devices, Inc. +# All rights reserved. # -# Copyright (c) 2015 Advanced Micro Devices, Inc. -# All rights reserved. +# For use for simulation and test purposes only # -# For use for simulation and test purposes only +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: # -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. # -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. # -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. # -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# Author: Steve Reinhardt +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. # +# Authors: Steve Reinhardt from m5.defines import buildEnv from m5.params import * from m5.proxy import * from m5.SimObject import SimObject +from m5.objects.Bridge import Bridge from m5.objects.ClockedObject import ClockedObject from m5.objects.Device import DmaDevice -from m5.objects.Process import EmulatedDriver -from m5.objects.Bridge import Bridge +from m5.objects.HSADevice import HSADevice +from m5.objects.HSADriver import HSADriver from m5.objects.LdsState import LdsState +from m5.objects.Process import EmulatedDriver class PrefetchType(Enum): vals = [ 'PF_CU', @@ -52,15 +52,48 @@ class PrefetchType(Enum): vals = [ 'PF_END', ] -class VectorRegisterFile(SimObject): +class PoolManager(SimObject): + type = 'PoolManager' + abstract = True + cxx_header = "gpu-compute/pool_manager.hh" + + min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + pool_size = Param.Int(2048, 'number of vector registers per SIMD') + +# The simple pool manage only allows one workgroup to +# be executing on a CU at any given time. 
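For illustration only, and not part of this patch: a configuration script built against the new GCN3_X86 target could wire the pool managers, register files, and RegisterManager declared just below in this file into a ComputeUnit roughly as sketched here. The class and parameter names are the SimObjects defined in this GPU.py; the wiring, the variable names, and the reliance on default sizes are assumptions, and the shipped apu configuration scripts may differ.

# Illustrative sketch, assuming a gem5 GCN3_X86 build that provides the
# SimObjects declared in this file; not part of the patch itself.
from m5.objects import (RegisterManager, ScalarRegisterFile,
                        SimplePoolManager, VectorRegisterFile)

n_simds = 4            # ComputeUnit.num_SIMDs default
vgprs_per_simd = 2048  # PoolManager.pool_size / RegisterFile.num_regs default
sgprs_per_simd = 2048

# one vector and one scalar register file per SIMD unit
vrfs = [VectorRegisterFile(simd_id=i, num_regs=vgprs_per_simd)
        for i in range(n_simds)]
srfs = [ScalarRegisterFile(simd_id=i, num_regs=sgprs_per_simd)
        for i in range(n_simds)]

# the register manager owns one pool manager per register file and hands
# out VGPR/SGPR regions to wavefronts when a workgroup is dispatched
reg_mgr = RegisterManager(
    policy='static',
    vrf_pool_managers=[SimplePoolManager(pool_size=vgprs_per_simd)
                       for _ in range(n_simds)],
    srf_pool_managers=[SimplePoolManager(pool_size=sgprs_per_simd)
                       for _ in range(n_simds)])

# these objects would then be assigned to ComputeUnit.vector_register_file,
# ComputeUnit.scalar_register_file, and ComputeUnit.register_manager.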
+class SimplePoolManager(PoolManager): + type = 'SimplePoolManager' + cxx_class = 'SimplePoolManager' + cxx_header = "gpu-compute/simple_pool_manager.hh" + +class RegisterFile(SimObject): + type = 'RegisterFile' + cxx_class = 'RegisterFile' + cxx_header = 'gpu-compute/register_file.hh' + + simd_id = Param.Int(-1, 'SIMD ID associated with this Register File') + num_regs = Param.Int(2048, 'number of registers in this RF') + wf_size = Param.Int(64, 'Wavefront size (in work items)') + +class ScalarRegisterFile(RegisterFile): + type = 'ScalarRegisterFile' + cxx_class = 'ScalarRegisterFile' + cxx_header = 'gpu-compute/scalar_register_file.hh' + +class VectorRegisterFile(RegisterFile): type = 'VectorRegisterFile' cxx_class = 'VectorRegisterFile' cxx_header = 'gpu-compute/vector_register_file.hh' - simd_id = Param.Int(0, 'SIMD ID associated with this VRF') - num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') - wfSize = Param.Int(64, 'Wavefront size (in work items)') - min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') +class RegisterManager(SimObject): + type = 'RegisterManager' + cxx_class = 'RegisterManager' + cxx_header = 'gpu-compute/register_manager.hh' + + policy = Param.String("static", "Register Manager Policy") + vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers') + srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers') class Wavefront(SimObject): type = 'Wavefront' @@ -69,45 +102,68 @@ class Wavefront(SimObject): simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') - wfSize = Param.Int(64, 'Wavefront size (in work items)') + wf_size = Param.Int(64, 'Wavefront size (in work items)') + max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the ' + 'instruction buffer (IB).') +# Most of the default values here are obtained from the +# AMD Graphics Core Next (GCN) Architecture whitepaper. class ComputeUnit(ClockedObject): type = 'ComputeUnit' cxx_class = 'ComputeUnit' cxx_header = 'gpu-compute/compute_unit.hh' wavefronts = VectorParam.Wavefront('Number of wavefronts') - wfSize = Param.Int(64, 'Wavefront size (in work items)') + # Wavefront size is 64. This is configurable, however changing + # this value to anything other than 64 will likely cause errors. 
+ wf_size = Param.Int(64, 'Wavefront size (in work items)') num_SIMDs = Param.Int(4, 'number of SIMD units per CU') + num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU') + num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\ + 'per CU') + simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit') + + operand_network_length = Param.Int(1, 'number of pipe stages of operand '\ + 'network') spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ 'latency') - dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ + dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '\ 'latency') - + scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU') issue_period = Param.Int(4, 'number of cycles per issue period') + + vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\ + 'GM bus') + srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '\ + 'to Scalar Mem bus') + vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\ + 'LM bus') + num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') - n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') - mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\ - "Represents the pipeline to reach the TCP and "\ - "specified in GPU clock cycles") - mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ - "cu. Represents the pipeline between the TCP "\ - "and cu as well as TCP data array access. "\ - "Specified in GPU clock cycles") + n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') + mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "\ + "Represents the pipeline to reach the TCP "\ + "and specified in GPU clock cycles") + mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "\ + "cu. Represents the pipeline between the "\ + "TCP and cu as well as TCP data array "\ + "access. 
Specified in GPU clock cycles") system = Param.System(Parent.any, "system object") cu_id = Param.Int('CU id') - vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ - "in bytes") - coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ - "in bytes") + vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "\ + "width in bytes") + coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "\ + "width in bytes") memory_port = VectorMasterPort("Port to the memory system") translation_port = VectorMasterPort('Port to the TLB hierarchy') sqc_port = MasterPort("Port to the SQC (I-cache") sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") + scalar_port = MasterPort("Port to the scalar data cache") + scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache") perLaneTLB = Param.Bool(False, "enable per-lane TLB") prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ "(0 turns off prefetching)") @@ -116,19 +172,22 @@ class ComputeUnit(ClockedObject): "from last mem req in lane of "\ "CU|Phase|Wavefront") execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); - xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") functionalTLB = Param.Bool(False, "Assume TLB causes no delay") localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ "kernel end") - countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ - "and how many times") + countPages = Param.Bool(False, "Generate per-CU file of all pages "\ + "touched and how many times") + scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "\ + "memory pipeline's queues") global_mem_queue_size = Param.Int(256, "Number of entries in the global " "memory pipeline's queues") local_mem_queue_size = Param.Int(256, "Number of entries in the local " "memory pipeline's queues") + max_wave_requests = Param.Int(64, "number of pending vector memory "\ + "requests per wavefront") max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\ " of instructions that can be sent to coalescer") ldsBus = Bridge() # the bridge between the CU and its LDS @@ -137,72 +196,54 @@ class ComputeUnit(ClockedObject): vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ "file") + + scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "\ + "file") out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery" " in the GM pipeline") + register_manager = Param.RegisterManager("Register Manager") + fetch_depth = Param.Int(2, 'number of i-cache lines that may be ' + 'buffered in the fetch unit.') class Shader(ClockedObject): type = 'Shader' cxx_class = 'Shader' cxx_header = 'gpu-compute/shader.hh' - CUs = VectorParam.ComputeUnit('Number of compute units') - n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU') + dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher') + n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into - ruby at kernel boundaries""") - separate_acquire_release = Param.Bool(False, - """Do ld_acquire/st_release generate separate requests for the - acquire and release?""") + ruby at kernel boundaries""") globalmem = Param.MemorySize('64kB', 'Memory size') timing = Param.Bool(False, 
'timing memory accesses') cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") translation = Param.Bool(False, "address translation"); + timer_period = Param.Clock('10us', "system timer period") + idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold") + max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting") -class ClDriver(EmulatedDriver): - type = 'ClDriver' - cxx_header = 'gpu-compute/cl_driver.hh' - codefile = VectorParam.String('code file name(s)') +class GPUComputeDriver(HSADriver): + type = 'GPUComputeDriver' + cxx_header = 'gpu-compute/gpu_compute_driver.hh' -class GpuDispatcher(DmaDevice): - type = 'GpuDispatcher' +class GPUDispatcher(SimObject): + type = 'GPUDispatcher' cxx_header = 'gpu-compute/dispatcher.hh' - # put at 8GB line for now - pio_addr = Param.Addr(0x200000000, "Device Address") - pio_latency = Param.Latency('1ns', "Programmed IO latency") - shader_pointer = Param.Shader('pointer to shader') - translation_port = MasterPort('Port to the dispatcher TLB') - cpu = Param.BaseCPU("CPU to wake up on kernel completion") - - cl_driver = Param.ClDriver('pointer to driver') - -class MemType(Enum): vals = [ - 'M_U8', - 'M_U16', - 'M_U32', - 'M_U64', - 'M_S8', - 'M_S16', - 'M_S32', - 'M_S64', - 'M_F16', - 'M_F32', - 'M_F64', - ] + +class GPUCommandProcessor(HSADevice): + type = 'GPUCommandProcessor' + cxx_header = 'gpu-compute/gpu_command_processor.hh' + dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU') class StorageClassType(Enum): vals = [ 'SC_SPILL', 'SC_GLOBAL', - 'SC_SHARED', + 'SC_GROUP', 'SC_PRIVATE', 'SC_READONLY', 'SC_KERNARG', + 'SC_ARG', 'SC_NONE', ] - -class RegisterType(Enum): vals = [ - 'RT_VECTOR', - 'RT_SCALAR', - 'RT_CONDITION', - 'RT_HARDWARE', - 'RT_NONE', - ] diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py index e12db7107..ad4c6c3f7 100644 --- a/src/gpu-compute/GPUStaticInstFlags.py +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -13,9 +13,9 @@ # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -40,15 +40,18 @@ class GPUStaticInstFlags(Enum): # Op types 'ALU', # ALU op 'Branch', # Branch instruction + 'CondBranch', # Conditinal Branch instruction 'Nop', # No-op (no effect at all) - 'Return', # Return instruction + 'Return', # Subroutine return instruction + 'EndOfKernel', # Kernel termination instruction + 'KernelLaunch', # Kernel launch inst 'UnconditionalJump', # 'SpecialOp', # Special op 'Waitcnt', # Is a waitcnt instruction # Memory ops 'MemBarrier', # Barrier instruction - 'MemFence', # Memory fence instruction + 'MemSync', # Synchronizing instruction 'MemoryRef', # References memory (load, store, or atomic) 'Flat', # Flat memory op 'Load', # Reads from memory @@ -64,6 +67,13 @@ class GPUStaticInstFlags(Enum): 'WritesSCC', # The instruction writes SCC 'ReadsVCC', # The instruction reads VCC 'WritesVCC', # The instruction writes VCC + 'ReadsEXEC', # The instruction reads Exec Mask + 'WritesEXEC', # The instruction writes Exec Mask + 'ReadsMode', # The instruction reads Mode register + 'WritesMode', # The instruction writes Mode register + 'IgnoreExec', # The instruction ignores the Exec Mask + 'IsSDWA', # The instruction is a SDWA instruction + 'IsDPP', # The instruction is a DPP instruction # Atomic OP types 'AtomicAnd', @@ -78,13 +88,6 @@ class GPUStaticInstFlags(Enum): 'AtomicMax', 'AtomicMin', - # Memory order flags - 'RelaxedOrder', - 'Acquire', # Has acquire semantics - 'Release', # Has release semantics - 'AcquireRelease', # Has acquire and release semantics - 'NoOrder', # Has no ordering restrictions - # Segment access flags 'ArgSegment', # Accesses the arg segment 'GlobalSegment', # Accesses global memory @@ -95,15 +98,17 @@ class GPUStaticInstFlags(Enum): 'SpillSegment', # Accesses the spill segment 'NoSegment', # Does not have an associated segment - # Scope flags - 'WorkitemScope', - 'WavefrontScope', - 'WorkgroupScope', - 'DeviceScope', - 'SystemScope', - 'NoScope', # Does not have an associated scope - # Coherence flags - 'GloballyCoherent', # Coherent with other workitems on same device - 'SystemCoherent' # Coherent with a different device, or the host + 'GloballyCoherent', # Coherent with other work-items on same device + 'SystemCoherent', # Coherent with a different device, or the host + + # Floating-point flags + 'F16', # F16 operation + 'F32', # F32 operation + 'F64', # F64 operation + + # MAC, MAD, FMA + 'FMA', # FMA + 'MAC', # MAC + 'MAD' # MAD ] diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index b0ffebf0b..244791b9b 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,56 +41,62 @@ SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') SimObject('X86GPUTLB.py') -if env['TARGET_GPU_ISA'] == 'hsail': - Source('brig_object.cc') - Source('hsail_code.cc') - -Source('cl_driver.cc') Source('compute_unit.cc') -Source('condition_register_state.cc') Source('dispatcher.cc') Source('exec_stage.cc') Source('fetch_stage.cc') Source('fetch_unit.cc') Source('global_memory_pipeline.cc') +Source('gpu_command_processor.cc') +Source('gpu_compute_driver.cc') Source('gpu_dyn_inst.cc') Source('gpu_exec_context.cc') Source('gpu_static_inst.cc') Source('gpu_tlb.cc') -Source('hsa_object.cc') -Source('kernel_cfg.cc') Source('lds_state.cc') Source('local_memory_pipeline.cc') Source('pool_manager.cc') +Source('register_file.cc') +Source('register_manager.cc') 
+Source('scalar_memory_pipeline.cc') +Source('scalar_register_file.cc') Source('schedule_stage.cc') Source('scheduler.cc') Source('scoreboard_check_stage.cc') Source('shader.cc') Source('simple_pool_manager.cc') +Source('static_register_manager_policy.cc') Source('tlb_coalescer.cc') Source('vector_register_file.cc') -Source('vector_register_state.cc') Source('wavefront.cc') -DebugFlag('BRIG') DebugFlag('GPUCoalescer') +DebugFlag('GPUCommandProc') +DebugFlag('GPUDriver') +DebugFlag('GPUInitAbi') DebugFlag('GPUDisp') DebugFlag('GPUExec') DebugFlag('GPUFetch') -DebugFlag('GPUHsailCFInfo') +DebugFlag('GPUKernelInfo') DebugFlag('GPUMem') DebugFlag('GPUPort') DebugFlag('GPUPrefetch') DebugFlag('GPUReg') +DebugFlag('GPURename') +DebugFlag('GPURF') +DebugFlag('GPURfState') +DebugFlag('GPUSched') +DebugFlag('GPUShader') +DebugFlag('GPUSRF') DebugFlag('GPUSync') DebugFlag('GPUTLB') DebugFlag('GPUVRF') -DebugFlag('HSALoader') -DebugFlag('HSAIL') -DebugFlag('HSAILObject') +DebugFlag('GPUVRFSched') +DebugFlag('GPUWgLatency') DebugFlag('Predictor') DebugFlag('WavefrontStack') CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', - 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL', - 'GPUVRF']) + 'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync', + 'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo', + 'GPUInitAbi']) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index cd880d6cc..feeb803e1 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -42,53 +42,69 @@ #include "debug/GPUMem.hh" #include "debug/GPUPort.hh" #include "debug/GPUPrefetch.hh" +#include "debug/GPUReg.hh" +#include "debug/GPURename.hh" #include "debug/GPUSync.hh" #include "debug/GPUTLB.hh" #include "gpu-compute/dispatcher.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_static_inst.hh" -#include "gpu-compute/ndrange.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" #include "mem/page_table.hh" #include "sim/process.hh" - -ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), - scoreboardCheckStage(p), scheduleStage(p), execStage(p), - globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0), - cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs), +#include "sim/sim_exit.hh" + +ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), + numVectorGlobalMemUnits(p->num_global_mem_pipes), + numVectorSharedMemUnits(p->num_shared_mem_pipes), + numScalarMemUnits(p->num_scalar_mem_pipes), + numVectorALUs(p->num_SIMDs), + numScalarALUs(p->num_scalar_cores), + vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), + coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), + registerManager(p->register_manager), fetchStage(p), + scoreboardCheckStage(p), scheduleStage(p, this), execStage(p), + globalMemoryPipe(p), localMemoryPipe(p), scalarMemoryPipe(p), + tickEvent([this]{ exec(); }, "Compute unit tick event", + false, Event::CPU_Tick_Pri), + cu_id(p->cu_id), + vrf(p->vector_register_file), srf(p->scalar_register_file), + simdWidth(p->simd_width), spBypassPipeLength(p->spbypass_pipe_length), dpBypassPipeLength(p->dpbypass_pipe_length), + scalarPipeStages(p->scalar_pipe_length), + operandNetworkLength(p->operand_network_length), issuePeriod(p->issue_period), - numGlbMemUnits(p->num_global_mem_pipes), - 
numLocMemUnits(p->num_shared_mem_pipes), + vrf_gm_bus_latency(p->vrf_gm_bus_latency), + srf_scm_bus_latency(p->srf_scm_bus_latency), + vrf_lm_bus_latency(p->vrf_lm_bus_latency), perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), - xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault), + debugSegFault(p->debugSegFault), functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), countPages(p->countPages), barrier_id(0), - vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), - coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(this, "ComputeUnit")), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0), - wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst()) + wavefrontSize(p->wf_size) { /** * This check is necessary because std::bitset only provides conversion * to unsigned long or unsigned long long via to_ulong() or to_ullong(). - * there are * a few places in the code where to_ullong() is used, however - * if VSZ is larger than a value the host can support then bitset will - * throw a runtime exception. we should remove all use of to_long() or - * to_ullong() so we can have VSZ greater than 64b, however until that is - * done this assert is required. + * there are a few places in the code where to_ullong() is used, however + * if wavefrontSize is larger than a value the host can support then + * bitset will throw a runtime exception. We should remove all use of + * to_long() or to_ullong() so we can have wavefrontSize greater than 64b, + * however until that is done this assert is required. 
*/ - fatal_if(p->wfSize > std::numeric_limits::digits || - p->wfSize <= 0, + fatal_if(p->wf_size > std::numeric_limits::digits || + p->wf_size <= 0, "WF size is larger than the host can support"); fatal_if(!isPowerOf2(wavefrontSize), "Wavefront size should be a power of 2"); @@ -101,10 +117,12 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t)) / coalescerToVrfBusWidth; - lastVaddrWF.resize(numSIMDs); - wfList.resize(numSIMDs); + // Initialization: all WF slots are assumed STOPPED + idleWfs = p->n_wf * numVectorALUs; + lastVaddrWF.resize(numVectorALUs); + wfList.resize(numVectorALUs); - for (int j = 0; j < numSIMDs; ++j) { + for (int j = 0; j < numVectorALUs; ++j) { lastVaddrWF[j].resize(p->n_wf); for (int i = 0; i < p->n_wf; ++i) { @@ -119,9 +137,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), } } - lastVaddrSimd.resize(numSIMDs); + lastVaddrSimd.resize(numVectorALUs); - for (int i = 0; i < numSIMDs; ++i) { + for (int i = 0; i < numVectorALUs; ++i) { lastVaddrSimd[i].resize(wfSize(), 0); } @@ -150,20 +168,33 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), cuExitCallback = new CUExitCallback(this); registerExitCallback(cuExitCallback); - xactCasLoadMap.clear(); - lastExecCycle.resize(numSIMDs, 0); + lastExecCycle.resize(numVectorALUs, 0); for (int i = 0; i < vrf.size(); ++i) { vrf[i]->setParent(this); } - + for (int i = 0; i < srf.size(); ++i) { + srf[i]->setParent(this); + } numVecRegsPerSimd = vrf[0]->numRegs(); + numScalarRegsPerSimd = srf[0]->numRegs(); + + registerManager->setParent(this); + + activeWaves = 0; + + instExecPerSimd.resize(numVectorALUs, 0); + + // Calculate the number of bits to address a cache line + panic_if(!isPowerOf2(_cacheLineSize), + "Cache line size should be a power of two."); + cacheLineBits = floorLog2(_cacheLineSize); } ComputeUnit::~ComputeUnit() { // Delete wavefront slots - for (int j = 0; j < numSIMDs; ++j) { + for (int j = 0; j < numVectorALUs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { delete wfList[j][i]; } @@ -171,63 +202,110 @@ ComputeUnit::~ComputeUnit() } lastVaddrCU.clear(); readyList.clear(); - waveStatusList.clear(); dispatchList.clear(); - vectorAluInstAvail.clear(); delete cuExitCallback; delete ldsPort; } -void -ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr) +int +ComputeUnit::numExeUnits() const +{ + return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + + numVectorSharedMemUnits + numScalarMemUnits; +} + +// index into readyList of the first memory unit +int +ComputeUnit::firstMemUnit() const +{ + return numVectorALUs + numScalarALUs; +} + +// index into readyList of the last memory unit +int +ComputeUnit::lastMemUnit() const { - w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); + return numExeUnits() - 1; +} - w->workGroupSz[0] = ndr->q.wgSize[0]; - w->workGroupSz[1] = ndr->q.wgSize[1]; - w->workGroupSz[2] = ndr->q.wgSize[2]; +// index into scalarALUs vector of SALU used by the wavefront +int +ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const +{ + if (numScalarALUs == 1) { + return 0; + } else { + return w->simdId % numScalarALUs; + } +} + +// index into readyList of Scalar ALU unit used by wavefront +int +ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const +{ + return numVectorALUs + mapWaveToScalarAlu(w); +} + +// index into readyList of Global Memory unit used by wavefront +int +ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const 
+{ + // TODO: FIXME if more than 1 GM pipe supported + return numVectorALUs + numScalarALUs; +} + +// index into readyList of Local Memory unit used by wavefront +int +ComputeUnit::mapWaveToLocalMem(Wavefront *w) const +{ + // TODO: FIXME if more than 1 LM pipe supported + return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits; +} + +// index into readyList of Scalar Memory unit used by wavefront +int +ComputeUnit::mapWaveToScalarMem(Wavefront *w) const +{ + // TODO: FIXME if more than 1 ScM pipe supported + return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + + numVectorSharedMemUnits; +} + +void +ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task) +{ + w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs()); + w->workGroupSz[0] = task->wgSize(0); + w->workGroupSz[1] = task->wgSize(1); + w->workGroupSz[2] = task->wgSize(2); w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2]; - w->gridSz[0] = ndr->q.gdSize[0]; - w->gridSz[1] = ndr->q.gdSize[1]; - w->gridSz[2] = ndr->q.gdSize[2]; - w->kernelArgs = ndr->q.args; - w->privSizePerItem = ndr->q.privMemPerItem; - w->spillSizePerItem = ndr->q.spillMemPerItem; - w->roBase = ndr->q.roMemStart; - w->roSize = ndr->q.roMemTotal; - w->computeActualWgSz(ndr); + w->gridSz[0] = task->gridSize(0); + w->gridSz[1] = task->gridSize(1); + w->gridSz[2] = task->gridSize(2); + w->computeActualWgSz(task); } +// delete all wavefronts that have been marked as ready at SCB stage +// but are found to have empty instruction buffers at SCH stage void -ComputeUnit::updateEvents() { - - if (!timestampVec.empty()) { - uint32_t vecSize = timestampVec.size(); - uint32_t i = 0; - while (i < vecSize) { - if (timestampVec[i] <= shader->tick_cnt) { - std::pair regInfo = regIdxVec[i]; - vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t), - statusVec[i]); - timestampVec.erase(timestampVec.begin() + i); - regIdxVec.erase(regIdxVec.begin() + i); - statusVec.erase(statusVec.begin() + i); - --vecSize; - --i; +ComputeUnit::updateReadyList(int unitId) +{ + if (!readyList[unitId].empty()) { + for (std::vector::iterator it = readyList[unitId].begin(); + it != readyList[unitId].end();) { + if ((*it)->instructionBuffer.empty()) { + it = readyList[unitId].erase(it); + } + else { + ++it; } - ++i; } } - - for (int i = 0; i< numSIMDs; ++i) { - vrf[i]->updateEvents(); - } } - void ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, - NDRange *ndr) + HSAQueueEntry *task, bool fetchContext) { static int _n_wave = 0; @@ -239,7 +317,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, init_mask[k] = 1; } - w->kernId = ndr->dispatchId; + w->execMask() = init_mask; + + w->kernId = task->dispatchId(); w->wfId = waveId; w->initMask = init_mask.to_ullong(); @@ -263,29 +343,21 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, w->oldBarrierCnt = 0; w->barrierCnt = 0; - w->privBase = ndr->q.privMemStart; - ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); - - w->spillBase = ndr->q.spillMemStart; - ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); - - w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong()); - // WG state - w->wgId = ndr->globalWgId; - w->dispatchId = ndr->dispatchId; - w->workGroupId[0] = w->wgId % ndr->numWg[0]; - w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1]; - w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]); + w->wgId = task->globalWgId(); + w->dispatchId = task->dispatchId(); + w->workGroupId[0] 
= w->wgId % task->numWg(0); + w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1); + w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1)); w->barrierId = barrier_id; - w->stalledAtBarrier = false; + w->stalledAtBarrier = (w->oldBarrierCnt == w->barrierCnt) ? false : true; // set the wavefront context to have a pointer to this section of the LDS w->ldsChunk = ldsChunk; int32_t refCount M5_VAR_USED = - lds.increaseRefCounter(w->dispatchId, w->wgId); + lds.increaseRefCounter(w->dispatchId, w->wgId); DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", cu_id, w->wgId, refCount); @@ -294,85 +366,134 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, if (w->pendingFetch) w->dropFetch = true; - // is this the last wavefront in the workgroup - // if set the spillWidth to be the remaining work-items - // so that the vector access is correct - if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) { - w->spillWidth = w->actualWgSzTotal - (waveId * wfSize()); - } else { - w->spillWidth = wfSize(); - } - DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " - "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); + "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId, + w->wfSlotId); + + w->initRegState(task, w->actualWgSzTotal); + w->start(_n_wave++, task->codeAddr()); - w->start(++_n_wave, ndr->q.code_ptr); + waveLevelParallelism.sample(activeWaves); + activeWaves++; +} + +/** + * trigger invalidate operation in the cu + * + * req: request initialized in shader, carrying the invlidate flags + */ +void +ComputeUnit::doInvalidate(RequestPtr req, int kernId){ + GPUDynInstPtr gpuDynInst + = std::make_shared(this, nullptr, + new KernelLaunchStaticInst(), getAndIncSeqNum()); + + // kern_id will be used in inv responses + gpuDynInst->kern_id = kernId; + // update contextId field + req->setContext(gpuDynInst->wfDynId); + + injectGlobalMemFence(gpuDynInst, true, req); +} + +/** + * trigger flush operation in the cu + * + * gpuDynInst: inst passed to the request + */ +void +ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) { + injectGlobalMemFence(gpuDynInst, true); } void -ComputeUnit::StartWorkgroup(NDRange *ndr) +ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler) { - // reserve the LDS capacity allocated to the work group - // disambiguated by the dispatch ID and workgroup ID, which should be - // globally unique - LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId, - ndr->q.ldsSize); - - // Send L1 cache acquire - // isKernel + isAcquire = Kernel Begin - if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = - std::make_shared(this, nullptr, kernelLaunchInst, - getAndIncSeqNum()); - - gpuDynInst->useContinuation = false; - injectGlobalMemFence(gpuDynInst, true); + // If we aren't ticking, start it up! 
+ if (!tickEvent.scheduled()) { + DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id); + schedule(tickEvent, nextCycle()); } - // calculate the number of 32-bit vector registers required by wavefront - int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); - int wave_id = 0; - - // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time - for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { - Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; - // Check if this wavefront slot is available: - // It must be stopped and not waiting - // for a release to complete S_RETURNING - if (w->status == Wavefront::S_STOPPED) { - fillKernelState(w, ndr); - // if we have scheduled all work items then stop - // scheduling wavefronts - if (wave_id * wfSize() >= w->actualWgSzTotal) - break; + // the kernel's invalidate must have finished before any wg dispatch + assert(task->isInvDone()); - // reserve vector registers for the scheduled wavefront - assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); - uint32_t normSize = 0; + // reserve the LDS capacity allocated to the work group + // disambiguated by the dispatch ID and workgroup ID, which should be + // globally unique + LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(), + task->globalWgId(), + task->ldsSize()); - w->startVgprIndex = vrf[m % numSIMDs]->manager-> - allocateRegion(vregDemand, &normSize); + panic_if(!ldsChunk, "was not able to reserve space for this WG"); - w->reservedVectorRegs = normSize; - vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; + // calculate the number of 32-bit vector registers required + // by each work item + int vregDemand = task->numVectorRegs(); + int sregDemand = task->numScalarRegs(); + int wave_id = 0; - startWavefront(w, wave_id, ldsChunk, ndr); - ++wave_id; + // Assign WFs according to numWfsToSched vector, which is computed by + // hasDispResources() + for (int j = 0; j < shader->n_wf; ++j) { + for (int i = 0; i < numVectorALUs; ++i) { + Wavefront *w = wfList[i][j]; + // Check if this wavefront slot is available and there are WFs + // remaining to be dispatched to current SIMD: + // WF slot must be stopped and not waiting + // for a release to complete S_RETURNING + if (w->getStatus() == Wavefront::S_STOPPED && + numWfsToSched[i] > 0) { + // decrement number of WFs awaiting dispatch to current SIMD + numWfsToSched[i] -= 1; + + fillKernelState(w, task); + + DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] " + "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId, + vregDemand, sregDemand); + + registerManager->allocateRegisters(w, vregDemand, sregDemand); + + startWavefront(w, wave_id, ldsChunk, task); + ++wave_id; + } } } ++barrier_id; } -int -ComputeUnit::ReadyWorkgroup(NDRange *ndr) +void +ComputeUnit::insertInPipeMap(Wavefront *w) +{ + panic_if(w->instructionBuffer.empty(), + "Instruction Buffer of WF%d can't be empty", w->wgId); + GPUDynInstPtr ii = w->instructionBuffer.front(); + pipeMap.emplace(ii->seqNum()); +} + +void +ComputeUnit::deleteFromPipeMap(Wavefront *w) +{ + panic_if(w->instructionBuffer.empty(), + "Instruction Buffer of WF%d can't be empty", w->wgId); + GPUDynInstPtr ii = w->instructionBuffer.front(); + // delete the dynamic instruction from the pipeline map + auto it = pipeMap.find(ii->seqNum()); + panic_if(it == pipeMap.end(), "Pipeline Map is empty\n"); + pipeMap.erase(it); +} + +bool +ComputeUnit::hasDispResources(HSAQueueEntry *task) { - // Get true size of workgroup (after clamping to grid size) - int trueWgSize[3]; + // compute true size 
of workgroup (after clamping to grid size) + int trueWgSize[HSAQueueEntry::MAX_DIM]; int trueWgSizeTotal = 1; - for (int d = 0; d < 3; ++d) { - trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - - ndr->wgId[d] * ndr->q.wgSize[d]); + for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) { + trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) - + task->wgId(d) * task->wgSize(d)); trueWgSizeTotal *= trueWgSize[d]; DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); @@ -380,69 +501,104 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr) DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); + // calculate the number of WFs in this WG + int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); + // calculate the number of 32-bit vector registers required by each // work item of the work group - int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); - bool vregAvail = true; - int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); + int vregDemandPerWI = task->numVectorRegs(); + // calculate the number of 32-bit scalar registers required by each + // work item of the work group + int sregDemandPerWI = task->numScalarRegs(); + + // check if the total number of VGPRs snd SGPRs required by all WFs + // of the WG fit in the VRFs of all SIMD units and the CU's SRF + panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd), + "WG with %d WFs and %d VGPRs per WI can not be allocated to CU " + "that has %d VGPRs\n", + numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd); + panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd, + "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU " + "with %d SGPRs\n", + numWfs, sregDemandPerWI, numScalarRegsPerSimd); + + // number of WF slots that are not occupied int freeWfSlots = 0; - // check if the total number of VGPRs required by all WFs of the WG - // fit in the VRFs of all SIMD units - assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd)); + // number of Wfs from WG that were successfully mapped to a SIMD int numMappedWfs = 0; - std::vector numWfsPerSimd; - numWfsPerSimd.resize(numSIMDs, 0); - // find how many free WF slots we have across all SIMDs + numWfsToSched.clear(); + numWfsToSched.resize(numVectorALUs, 0); + + // attempt to map WFs to the SIMDs, based on WF slot availability + // and register file availability for (int j = 0; j < shader->n_wf; ++j) { - for (int i = 0; i < numSIMDs; ++i) { - if (wfList[i][j]->status == Wavefront::S_STOPPED) { - // count the number of free WF slots + for (int i = 0; i < numVectorALUs; ++i) { + if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) { ++freeWfSlots; - if (numMappedWfs < numWfs) { - // count the WFs to be assigned per SIMD - numWfsPerSimd[i]++; + // check if current WF will fit onto current SIMD/VRF + // if all WFs have not yet been mapped to the SIMDs + if (numMappedWfs < numWfs && + registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1, + sregDemandPerWI) && + registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1, + vregDemandPerWI)) { + numWfsToSched[i]++; + numMappedWfs++; } - numMappedWfs++; } } } - // if there are enough free WF slots then find if there are enough - // free VGPRs per SIMD based on the WF->SIMD mapping - if (freeWfSlots >= numWfs) { - for (int j = 0; j < numSIMDs; ++j) { - // find if there are enough free VGPR regions in the SIMD's VRF - // to accommodate the WFs of the new WG that would be mapped to - // this SIMD unit - vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j], - 
vregDemandPerWI); - - // stop searching if there is at least one SIMD - // whose VRF does not have enough free VGPR pools. - // This is because a WG is scheduled only if ALL - // of its WFs can be scheduled - if (!vregAvail) - break; + // check that the number of mapped WFs is not greater + // than the actual number of WFs + assert(numMappedWfs <= numWfs); + + bool vregAvail = true; + bool sregAvail = true; + // if a WF to SIMD mapping was not found, find the limiting resource + if (numMappedWfs < numWfs) { + + for (int j = 0; j < numVectorALUs; ++j) { + // find if there are enough free VGPRs in the SIMD's VRF + // to accomodate the WFs of the new WG that would be mapped + // to this SIMD unit + vregAvail &= registerManager-> + canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI); + // find if there are enough free SGPRs in the SIMD's SRF + // to accomodate the WFs of the new WG that would be mapped + // to this SIMD unit + sregAvail &= registerManager-> + canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI); } } - DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n", - freeWfSlots, vregAvail); + DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \ + VGPR Availability = %d, SGPR Availability = %d\n", + freeWfSlots, numMappedWfs, vregAvail, sregAvail); if (!vregAvail) { ++numTimesWgBlockedDueVgprAlloc; } + if (!sregAvail) { + ++numTimesWgBlockedDueSgprAlloc; + } + // Return true if enough WF slots to submit workgroup and if there are // enough VGPRs to schedule all WFs to their SIMD units - if (!lds.canReserve(ndr->q.ldsSize)) { + bool ldsAvail = lds.canReserve(task->ldsSize()); + if (!ldsAvail) { wgBlockedDueLdsAllocation++; } - // Return true if (a) there are enough free WF slots to submit - // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their - // SIMD units and (c) if there is enough space in LDS - return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize); + // Return true if the following are all true: + // (a) all WFs of the WG were mapped to free WF slots + // (b) there are enough VGPRs to schedule all WFs to their SIMD units + // (c) there are enough SGPRs on the CU to schedule all WFs + // (d) there is enough space in LDS to allocate for all WFs + bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail + && ldsAvail; + return can_dispatch; } int @@ -451,21 +607,24 @@ ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); int ccnt = 0; - for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) { + for (int i_simd = 0; i_simd < numVectorALUs; ++i_simd) { for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { Wavefront *w = wfList[i_simd][i_wf]; - if (w->status == Wavefront::S_RUNNING) { + if (w->getStatus() == Wavefront::S_RUNNING) { DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", w->barrierId, _barrier_id); - DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n", + DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n", w->barrierCnt, bcnt); + + DPRINTF(GPUSync, "outstanding Reqs = %d\n", + w->outstandingReqs); } - if (w->status == Wavefront::S_RUNNING && + if (w->getStatus() == Wavefront::S_RUNNING && w->barrierId == _barrier_id && w->barrierCnt == bcnt && !w->outstandingReqs) { ++ccnt; @@ -482,61 +641,22 @@ ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) return ccnt == bslots; } -// Check if the current wavefront is blocked on additional 
resources. -bool -ComputeUnit::cedeSIMD(int simdId, int wfSlotId) -{ - bool cede = false; - - // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld - // magic instructions will impact the scheduling of wavefronts - if (xact_cas_mode) { - /* - * When a wavefront calls xact_cas_ld, it adds itself to a per address - * queue. All per address queues are managed by the xactCasLoadMap. - * - * A wavefront is not blocked if: it is not in ANY per address queue or - * if it is at the head of a per address queue. - */ - for (auto itMap : xactCasLoadMap) { - std::list curWaveIDQueue = itMap.second.waveIDQueue; - - if (!curWaveIDQueue.empty()) { - for (auto it : curWaveIDQueue) { - waveIdentifier cur_wave = it; - - if (cur_wave.simdId == simdId && - cur_wave.wfSlotId == wfSlotId) { - // 2 possibilities - // 1: this WF has a green light - // 2: another WF has a green light - waveIdentifier owner_wave = curWaveIDQueue.front(); - - if (owner_wave.simdId != cur_wave.simdId || - owner_wave.wfSlotId != cur_wave.wfSlotId) { - // possibility 2 - cede = true; - break; - } else { - // possibility 1 - break; - } - } - } - } - } - } - - return cede; -} - // Execute one clock worth of work on the ComputeUnit. void ComputeUnit::exec() { - updateEvents(); + // process reads and writes in the RFs + for (auto &vecRegFile : vrf) { + vecRegFile->exec(); + } + + for (auto &scRegFile : srf) { + scRegFile->exec(); + } + // Execute pipeline stages in reverse order to simulate // the pipeline latency + scalarMemoryPipe.exec(); globalMemoryPipe.exec(); localMemoryPipe.exec(); execStage.exec(); @@ -545,65 +665,62 @@ ComputeUnit::exec() fetchStage.exec(); totalCycles++; + + // Put this CU to sleep if there is no more work to be done. + if (!isDone()) { + schedule(tickEvent, nextCycle()); + } else { + shader->notifyCuSleep(); + DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id); + } } void ComputeUnit::init() { - // Initialize CU Bus models - glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1)); - locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1)); - nextGlbMemBus = 0; - nextLocMemBus = 0; - fatal_if(numGlbMemUnits > 1, - "No support for multiple Global Memory Pipelines exists!!!"); - vrfToGlobalMemPipeBus.resize(numGlbMemUnits); - for (int j = 0; j < numGlbMemUnits; ++j) { - vrfToGlobalMemPipeBus[j] = WaitClass(); - vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1)); - } + // Initialize CU Bus models and execution resources - fatal_if(numLocMemUnits > 1, - "No support for multiple Local Memory Pipelines exists!!!"); - vrfToLocalMemPipeBus.resize(numLocMemUnits); - for (int j = 0; j < numLocMemUnits; ++j) { - vrfToLocalMemPipeBus[j] = WaitClass(); - vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1)); + // Vector ALUs + vectorALUs.clear(); + for (int i = 0; i < numVectorALUs; i++) { + vectorALUs.emplace_back(this, clockPeriod()); } - vectorRegsReserved.resize(numSIMDs, 0); - aluPipe.resize(numSIMDs); - wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits); - for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) { - wfWait[i] = WaitClass(); - wfWait[i].init(&shader->tick_cnt, shader->ticks(1)); + // Scalar ALUs + scalarALUs.clear(); + for (int i = 0; i < numScalarALUs; i++) { + scalarALUs.emplace_back(this, clockPeriod()); } - for (int i = 0; i < numSIMDs; ++i) { - aluPipe[i] = WaitClass(); - aluPipe[i].init(&shader->tick_cnt, shader->ticks(1)); - } + // Vector Global Memory + fatal_if(numVectorGlobalMemUnits > 1, + "No support for multiple Global Memory 
Pipelines exists!!!"); + vectorGlobalMemUnit.init(this, clockPeriod()); + vrfToGlobalMemPipeBus.init(this, clockPeriod()); + glbMemToVrfBus.init(this, clockPeriod()); - // Setup space for call args - for (int j = 0; j < numSIMDs; ++j) { - for (int i = 0; i < shader->n_wf; ++i) { - wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize); - } - } + // Vector Local/Shared Memory + fatal_if(numVectorSharedMemUnits > 1, + "No support for multiple Local Memory Pipelines exists!!!"); + vectorSharedMemUnit.init(this, clockPeriod()); + vrfToLocalMemPipeBus.init(this, clockPeriod()); + locMemToVrfBus.init(this, clockPeriod()); - // Initializing pipeline resources - readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits); - waveStatusList.resize(numSIMDs); + // Scalar Memory + fatal_if(numScalarMemUnits > 1, + "No support for multiple Scalar Memory Pipelines exists!!!"); + scalarMemUnit.init(this, clockPeriod()); + srfToScalarMemPipeBus.init(this, clockPeriod()); + scalarMemToSrfBus.init(this, clockPeriod()); - for (int j = 0; j < numSIMDs; ++j) { - for (int i = 0; i < shader->n_wf; ++i) { - waveStatusList[j].push_back( - std::make_pair(wfList[j][i], BLOCKED)); - } - } + vectorRegsReserved.resize(numVectorALUs, 0); + scalarRegsReserved.resize(numVectorALUs, 0); + + // Initializing pipeline resources + readyList.resize(numExeUnits()); - for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) { - dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY)); + for (int j = 0; j < numExeUnits(); ++j) { + dispatchList.push_back(std::make_pair(nullptr, EMPTY)); } fetchStage.init(this); @@ -612,10 +729,7 @@ ComputeUnit::init() execStage.init(this); globalMemoryPipe.init(this); localMemoryPipe.init(this); - // initialize state for statistics calculation - vectorAluInstAvail.resize(numSIMDs, false); - shrMemInstAvail = 0; - glbMemInstAvail = 0; + scalarMemoryPipe.init(this); gmTokenPort.setTokenManager(memPortTokens); } @@ -629,61 +743,176 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) SenderState *sender_state = safe_cast(pkt->senderState); int index = sender_state->port_index; GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + GPUDispatcher &dispatcher = computeUnit->shader->dispatcher(); + + // MemSyncResp + WriteAckResp are handled completely here and we don't + // schedule a MemRespEvent to process the responses further + if (pkt->cmd == MemCmd::MemSyncResp) { + // This response is for 1 of the following request types: + // - kernel launch + // - kernel end + // - non-kernel mem sync + + // Kernel Launch + // wavefront was nullptr when launching kernel, so it is meaningless + // here (simdId=-1, wfSlotId=-1) + if (gpuDynInst->isKernelLaunch()) { + // for kernel launch, the original request must be both kernel-type + // and acquire + assert(pkt->req->isKernel()); + assert(pkt->req->isAcquire()); + + // one D-Cache inv is done, decrement counter + dispatcher.updateInvCounter(gpuDynInst->kern_id); + + delete pkt->senderState; + delete pkt; + return true; + } - // Is the packet returned a Kernel End or Barrier - if (pkt->req->isKernel() && pkt->req->isRelease()) { - Wavefront *w = - computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + // retrieve wavefront from inst + Wavefront *w = gpuDynInst->wavefront(); // Check if we are waiting on Kernel End Release - if (w->status == Wavefront::S_RETURNING) { - DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n", + if (w->getStatus() == Wavefront::S_RETURNING + && gpuDynInst->isEndOfKernel()) 
{ + // for kernel end, the original request must be both kernel-type + // and release + assert(pkt->req->isKernel()); + assert(pkt->req->isRelease()); + + // one wb done, decrement counter, and return whether all wbs are + // done for the kernel + bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id); + + // not all wbs are done for the kernel, just release pkt + // resources + if (!isWbDone) { + delete pkt->senderState; + delete pkt; + return true; + } + + // all wbs are completed for the kernel, do retirement work + // for the workgroup + DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n", computeUnit->cu_id, w->simdId, w->wfSlotId, - w->wfDynId, w->kernId); + w->wfDynId, w->wgId); - computeUnit->shader->dispatcher->notifyWgCompl(w); - w->status = Wavefront::S_STOPPED; - } else { - w->outstandingReqs--; + dispatcher.notifyWgCompl(w); + w->setStatus(Wavefront::S_STOPPED); } - DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n", + if (!pkt->req->isKernel()) { + w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing " + "outstanding reqs %d => %d\n", gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->wfDynId, + gpuDynInst->disassemble(), w->outstandingReqs, + w->outstandingReqs - 1); + computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); + } + + DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, w->barrierCnt); - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), - gpuDynInst); - } - delete pkt->senderState; delete pkt; return true; - } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), - gpuDynInst); + } else if (pkt->cmd == MemCmd::WriteCompleteResp) { + // this is for writeComplete callback + // we simply get decrement write-related wait counters + assert(gpuDynInst); + Wavefront *w M5_VAR_USED = + computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + assert(w); + DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing " + "outstanding reqs %d => %d\n", gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->wfDynId, + gpuDynInst->disassemble(), w->outstandingReqs, + w->outstandingReqs - 1); + if (gpuDynInst->statusBitVector.none()) { + // ask gm pipe to decrement request counters, instead of directly + // performing here, to avoid asynchronous counter update and + // instruction retirement (which may hurt waincnt effects) + computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); } delete pkt->senderState; delete pkt; + return true; } EventFunctionWrapper *mem_resp_event = computeUnit->memPort[index]->createMemRespEvent(pkt); - DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n", + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - index, pkt->req->getPaddr()); + gpuDynInst->seqNum(), index, pkt->req->getPaddr()); computeUnit->schedule(mem_resp_event, curTick() + computeUnit->resp_tick_latency); + + return true; +} + +bool +ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt) +{ + assert(!pkt->req->isKernel()); + 
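// The kernel-launch and kernel-end paths above both lean on a per-kernel
// ack counter in the dispatcher: updateInvCounter() ticks down outstanding
// D-Cache invalidations for a launch, and updateWbCounter() ticks down
// outstanding write-backs for a kernel end, reporting when the final ack
// has arrived so that workgroup retirement runs exactly once. The sketch
// below shows that counting pattern in isolation; KernelAckTracker and its
// members are hypothetical names for illustration, not the dispatcher's
// actual interface.
//
//     #include <cassert>
//     #include <unordered_map>
//
//     class KernelAckTracker
//     {
//       public:
//         // arm the counter when the acquire/release requests are issued
//         void arm(int kern_id, int num_acks) { pending[kern_id] = num_acks; }
//
//         // called once per ack; returns true only for the final ack,
//         // at which point the caller may do its retirement work
//         bool ack(int kern_id)
//         {
//             auto it = pending.find(kern_id);
//             assert(it != pending.end() && it->second > 0);
//             if (--it->second == 0) {
//                 pending.erase(it);
//                 return true;
//             }
//             return false;
//         }
//
//       private:
//         std::unordered_map<int, int> pending;
//     };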
+ // retrieve sender state + SenderState *sender_state = safe_cast(pkt->senderState); + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + + assert(pkt->isRead() || pkt->isWrite()); + assert(gpuDynInst->numScalarReqs > 0); + + gpuDynInst->numScalarReqs--; + + /** + * for each returned scalar request we decrement the + * numScalarReqs counter that is associated with this + * gpuDynInst, which should have been set to correspond + * to the number of packets sent for the memory op. + * once all packets return, the memory op is finished + * and we can push it into the response queue. + */ + if (!gpuDynInst->numScalarReqs) { + if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { + computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push( + gpuDynInst); + } else { + computeUnit->scalarMemoryPipe.getGMStRespFIFO().push( + gpuDynInst); + } + } + + delete pkt->senderState; + delete pkt; + return true; } +void +ComputeUnit::ScalarDataPort::recvReqRetry() +{ + for (const auto &pkt : retries) { + if (!sendTimingReq(pkt)) { + break; + } else { + retries.pop_front(); + } + } +} + void ComputeUnit::DataPort::recvReqRetry() { @@ -715,7 +944,6 @@ bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { computeUnit->fetchStage.processFetchReturn(pkt); - return true; } @@ -759,9 +987,12 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) BaseTLB::Mode TLB_mode; assert(pkt->isRead() || pkt->isWrite()); + // only do some things if actually accessing data + bool isDataAccess = pkt->isWrite() || pkt->isRead(); + // Check write before read for atomic operations // since atomic operations should use BaseTLB::Write - if (pkt->isWrite()){ + if (pkt->isWrite()) { TLB_mode = BaseTLB::Write; } else if (pkt->isRead()) { TLB_mode = BaseTLB::Read; @@ -825,8 +1056,6 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) assert(pkt->req->hasPaddr()); assert(pkt->req->hasSize()); - uint8_t *tmpData = pkt->getPtr(); - // this is necessary because the GPU TLB receives packets instead // of requests. when the translation is complete, all relevent // fields in the request will be populated, but not in the packet. @@ -834,13 +1063,17 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) // and proper flags. 
PacketPtr oldPkt = pkt; pkt = new Packet(oldPkt->req, oldPkt->cmd); + if (isDataAccess) { + uint8_t *tmpData = oldPkt->getPtr(); + pkt->dataStatic(tmpData); + } delete oldPkt; - pkt->dataStatic(tmpData); // New SenderState for the memory access - pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, - index, nullptr); + pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, index, + nullptr); gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); gpuDynInst->tlbHitLevel[index] = hit_level; @@ -860,8 +1093,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) assert(tlbPort[tlbPort_index]->retries.size() > 0); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " - "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - tmp_vaddr); + "failed!\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index]->retries.push_back(pkt); } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { @@ -872,8 +1105,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) tlbPort[tlbPort_index]->stallPort(); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " - "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - tmp_vaddr); + "failed!\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index]->retries.push_back(pkt); } else { @@ -882,7 +1115,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); } } else { - if (pkt->cmd == MemCmd::MemFenceReq) { + if (pkt->cmd == MemCmd::MemSyncReq) { gpuDynInst->statusBitVector = VectorMask(0); } else { gpuDynInst->statusBitVector &= (~(1ll << index)); @@ -907,6 +1140,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) // Translation is done. It is safe to send the packet to memory. memPort[0]->sendFunctional(new_pkt); + DPRINTF(GPUMem, "Functional sendRequest\n"); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, new_pkt->req->getPaddr()); @@ -923,56 +1157,105 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) } void -ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt) { - EventFunctionWrapper *mem_req_event = - memPort[index]->createMemReqEvent(pkt); - + assert(pkt->isWrite() || pkt->isRead()); - // New SenderState for the memory access - pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, - nullptr); + BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write; - DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", - cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, - pkt->req->getPaddr()); + pkt->senderState = + new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst); - schedule(mem_req_event, curTick() + req_tick_latency); + pkt->senderState = + new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false, + pkt->senderState); + + if (scalarDTLBPort->isStalled()) { + assert(scalarDTLBPort->retries.size()); + scalarDTLBPort->retries.push_back(pkt); + } else if (!scalarDTLBPort->sendTimingReq(pkt)) { + scalarDTLBPort->stallPort(); + scalarDTLBPort->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n", + tlb_mode == BaseTLB::Read ? 
"read" : "write", + pkt->req->getVaddr()); + } } void -ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, +ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, + bool kernelMemSync, RequestPtr req) { - assert(gpuDynInst->isGlobalSeg()); + assert(gpuDynInst->isGlobalSeg() || + gpuDynInst->executedAs() == Enums::SC_GLOBAL); if (!req) { req = std::make_shared( 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId); } + + // all mem sync requests have Paddr == 0 req->setPaddr(0); - if (kernelLaunch) { - req->setFlags(Request::KERNEL); - } - // for non-kernel MemFence operations, memorder flags are set depending - // on which type of request is currently being sent, so this - // should be set by the caller (e.g. if an inst has acq-rel - // semantics, it will send one acquire req an one release req) - gpuDynInst->setRequestFlags(req, kernelLaunch); + PacketPtr pkt = nullptr; - // a mem fence must correspond to an acquire/release request - assert(req->isAcquire() || req->isRelease()); + if (kernelMemSync) { + if (gpuDynInst->isKernelLaunch()) { + req->setCacheCoherenceFlags(Request::ACQUIRE); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); - // create packet - PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq); + EventFunctionWrapper *mem_req_event = + memPort[0]->createMemReqEvent(pkt); - // set packet's sender state - pkt->senderState = - new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr); + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " + "an acquire\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } else { + assert(gpuDynInst->isEndOfKernel()); + + req->setCacheCoherenceFlags(Request::RELEASE); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); + + EventFunctionWrapper *mem_req_event = + memPort[0]->createMemReqEvent(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " + "a release\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } + } else { + gpuDynInst->setRequestFlags(req); + + req->setReqInstSeqNum(gpuDynInst->seqNum()); + + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); - // send the packet - sendSyncRequest(gpuDynInst, 0, pkt); + EventFunctionWrapper *mem_req_event = + memPort[0]->createMemReqEvent(pkt); + + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, + pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } } void @@ -992,69 +1275,60 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) Addr paddr = pkt->req->getPaddr(); - if (pkt->cmd != MemCmd::MemFenceResp) { - int index = gpuDynInst->memStatusVector[paddr].back(); - - DPRINTF(GPUMem, "Response for addr %#x, index %d\n", - pkt->req->getPaddr(), index); + // mem sync resp and write-complete callback must be handled already in + // DataPort::recvTimingResp + assert(pkt->cmd != MemCmd::MemSyncResp); + assert(pkt->cmd != 
MemCmd::WriteCompleteResp); - gpuDynInst->memStatusVector[paddr].pop_back(); - gpuDynInst->pAddr = pkt->req->getPaddr(); + // this is for read, write and atomic + int index = gpuDynInst->memStatusVector[paddr].back(); - if (pkt->isRead() || pkt->isWrite()) { - - if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) { - gpuDynInst->statusBitVector &= (~(1ULL << index)); - } else { - assert(gpuDynInst->statusVector[index] > 0); - gpuDynInst->statusVector[index]--; + DPRINTF(GPUMem, "Response for addr %#x, index %d\n", + pkt->req->getPaddr(), index); - if (!gpuDynInst->statusVector[index]) - gpuDynInst->statusBitVector &= (~(1ULL << index)); - } + gpuDynInst->memStatusVector[paddr].pop_back(); + gpuDynInst->pAddr = pkt->req->getPaddr(); - DPRINTF(GPUMem, "bitvector is now %#x\n", - gpuDynInst->statusBitVector); + gpuDynInst->statusBitVector &= (~(1ULL << index)); - if (gpuDynInst->statusBitVector == VectorMask(0)) { - auto iter = gpuDynInst->memStatusVector.begin(); - auto end = gpuDynInst->memStatusVector.end(); + DPRINTF(GPUMem, "bitvector is now %#x\n", + gpuDynInst->statusBitVector); - while (iter != end) { - assert(iter->second.empty()); - ++iter; - } + if (gpuDynInst->statusBitVector == VectorMask(0)) { + auto iter = gpuDynInst->memStatusVector.begin(); + auto end = gpuDynInst->memStatusVector.end(); - gpuDynInst->memStatusVector.clear(); + while (iter != end) { + assert(iter->second.empty()); + ++iter; + } - if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) - gpuDynInst->statusVector.clear(); + // Calculate the difference between the arrival of the first cache + // block and the last cache block to arrive if we have the time + // for the first cache block. + if (compute_unit->headTailMap.count(gpuDynInst)) { + Tick headTick = compute_unit->headTailMap.at(gpuDynInst); + compute_unit->headTailLatency.sample(curTick() - headTick); + compute_unit->headTailMap.erase(gpuDynInst); + } - compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); + gpuDynInst->memStatusVector.clear(); - DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", - compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId); + // note: only handle read response here; for write, the response + // is separately handled when writeComplete callback is received + if (pkt->isRead()) { + gpuDynInst-> + profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue); + compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); - // after clearing the status vectors, - // see if there is a continuation to perform - // the continuation may generate more work for - // this memory request - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation( - gpuDynInst->staticInstruction(), - gpuDynInst); - } - } + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); } } else { - gpuDynInst->statusBitVector = VectorMask(0); - - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), - gpuDynInst); + if (!compute_unit->headTailMap.count(gpuDynInst)) { + compute_unit->headTailMap.insert( + std::make_pair(gpuDynInst, curTick())); } } @@ -1192,8 +1466,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) // Because it's atomic operation, only need TLB translation state prefetch_pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, - computeUnit->shader->gpuTc, - true); + computeUnit->shader->gpuTc, true); // 
Currently prefetches are zero-latency, hence the sendFunctional sendFunctional(prefetch_pkt); @@ -1270,10 +1543,40 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) pkt->req->getPaddr()); } else { DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n", + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " + "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index, + pkt->req->getPaddr()); + } +} + +const char* +ComputeUnit::ScalarDataPort::MemReqEvent::description() const +{ + return "ComputeUnit scalar memory request event"; +} + +void +ComputeUnit::ScalarDataPort::MemReqEvent::process() +{ + SenderState *sender_state = safe_cast(pkt->senderState); + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort->computeUnit; + + if (!(scalarDataPort->sendTimingReq(pkt))) { + scalarDataPort->retries.push_back(pkt); + + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, index, + gpuDynInst->wfSlotId, scalarDataPort->index, pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " + "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), + scalarDataPort->index, pkt->req->getPaddr()); } } @@ -1314,6 +1617,66 @@ ComputeUnit::DTLBPort::recvReqRetry() } } +bool +ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt) +{ + assert(pkt->senderState); + + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast(pkt->senderState); + + // Page faults are not allowed + fatal_if(!translation_state->tlbEntry, + "Translation of vaddr %#x failed\n", pkt->req->getVaddr()); + + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + + pkt->senderState = translation_state->saved; + delete translation_state; + + ScalarDTLBPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + delete pkt->senderState; + + Wavefront *w M5_VAR_USED = gpuDynInst->wavefront(); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received " + "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId, + w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr()); + + MemCmd mem_cmd; + + if (pkt->cmd == MemCmd::ReadResp) { + mem_cmd = MemCmd::ReadReq; + } else if (pkt->cmd == MemCmd::WriteResp) { + mem_cmd = MemCmd::WriteReq; + } else { + fatal("Scalar DTLB receieved unexpected MemCmd response %s\n", + pkt->cmd.toString()); + } + + PacketPtr req_pkt = new Packet(pkt->req, mem_cmd); + req_pkt->dataStatic(pkt->getPtr()); + delete pkt; + + req_pkt->senderState = + new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst); + + if (!computeUnit->scalarDataPort->sendTimingReq(req_pkt)) { + computeUnit->scalarDataPort->retries.push_back(req_pkt); + DPRINTF(GPUMem, "send scalar req failed for: %s\n", + gpuDynInst->disassemble()); + } else { + DPRINTF(GPUMem, "send scalar req for: %s\n", + gpuDynInst->disassemble()); + } + + return true; +} + bool ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) { @@ -1324,8 +1687,8 @@ ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) assert(pkt->senderState); // pop off the TLB translation state - TheISA::GpuTLB::TranslationState *translation_state = - safe_cast(pkt->senderState); + TheISA::GpuTLB::TranslationState *translation_state + = 
safe_cast(pkt->senderState); bool success = translation_state->tlbEntry != nullptr; delete translation_state->tlbEntry; @@ -1510,6 +1873,152 @@ ComputeUnit::regStats() scalarMemWritesPerWF = scalarMemWrites / completedWfs; scalarMemReadsPerWF = scalarMemReads / completedWfs; + vectorMemReadsPerKiloInst + .name(name() + ".vector_mem_reads_per_kilo_inst") + .desc("Number of vector mem reads per kilo-instruction") + ; + vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000; + vectorMemWritesPerKiloInst + .name(name() + ".vector_mem_writes_per_kilo_inst") + .desc("Number of vector mem writes per kilo-instruction") + ; + vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000; + vectorMemInstsPerKiloInst + .name(name() + ".vector_mem_insts_per_kilo_inst") + .desc("Number of vector mem insts per kilo-instruction") + ; + vectorMemInstsPerKiloInst = + ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000; + scalarMemReadsPerKiloInst + .name(name() + ".scalar_mem_reads_per_kilo_inst") + .desc("Number of scalar mem reads per kilo-instruction") + ; + scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000; + scalarMemWritesPerKiloInst + .name(name() + ".scalar_mem_writes_per_kilo_inst") + .desc("Number of scalar mem writes per kilo-instruction") + ; + scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000; + scalarMemInstsPerKiloInst + .name(name() + ".scalar_mem_insts_per_kilo_inst") + .desc("Number of scalar mem insts per kilo-instruction") + ; + scalarMemInstsPerKiloInst = + ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000; + + instCyclesVMemPerSimd + .init(numVectorALUs) + .name(name() + ".inst_cycles_vector_memory") + .desc("Number of cycles to send address, command, data from VRF to " + "vector memory unit, per SIMD") + ; + + instCyclesScMemPerSimd + .init(numVectorALUs) + .name(name() + ".inst_cycles_scalar_memory") + .desc("Number of cycles to send address, command, data from SRF to " + "scalar memory unit, per SIMD") + ; + + instCyclesLdsPerSimd + .init(numVectorALUs) + .name(name() + ".inst_cycles_lds") + .desc("Number of cycles to send address, command, data from VRF to " + "LDS unit, per SIMD") + ; + + globalReads + .name(name() + ".global_mem_reads") + .desc("Number of reads to the global segment") + ; + globalWrites + .name(name() + ".global_mem_writes") + .desc("Number of writes to the global segment") + ; + globalMemInsts + .name(name() + ".global_mem_insts") + .desc("Number of memory instructions sent to the global segment") + ; + globalMemInsts = globalReads + globalWrites; + argReads + .name(name() + ".arg_reads") + .desc("Number of reads to the arg segment") + ; + argWrites + .name(name() + ".arg_writes") + .desc("NUmber of writes to the arg segment") + ; + argMemInsts + .name(name() + ".arg_mem_insts") + .desc("Number of memory instructions sent to the arg segment") + ; + argMemInsts = argReads + argWrites; + spillReads + .name(name() + ".spill_reads") + .desc("Number of reads to the spill segment") + ; + spillWrites + .name(name() + ".spill_writes") + .desc("Number of writes to the spill segment") + ; + spillMemInsts + .name(name() + ".spill_mem_insts") + .desc("Number of memory instructions sent to the spill segment") + ; + spillMemInsts = spillReads + spillWrites; + groupReads + .name(name() + ".group_reads") + .desc("Number of reads to the group segment") + ; + groupWrites + .name(name() + ".group_writes") + .desc("Number of writes to the group segment") + ; + groupMemInsts + 
.name(name() + ".group_mem_insts") + .desc("Number of memory instructions sent to the group segment") + ; + groupMemInsts = groupReads + groupWrites; + privReads + .name(name() + ".private_reads") + .desc("Number of reads to the private segment") + ; + privWrites + .name(name() + ".private_writes") + .desc("Number of writes to the private segment") + ; + privMemInsts + .name(name() + ".private_mem_insts") + .desc("Number of memory instructions sent to the private segment") + ; + privMemInsts = privReads + privWrites; + readonlyReads + .name(name() + ".readonly_reads") + .desc("Number of reads to the readonly segment") + ; + readonlyWrites + .name(name() + ".readonly_writes") + .desc("Number of memory instructions sent to the readonly segment") + ; + readonlyMemInsts + .name(name() + ".readonly_mem_insts") + .desc("Number of memory instructions sent to the readonly segment") + ; + readonlyMemInsts = readonlyReads + readonlyWrites; + kernargReads + .name(name() + ".kernarg_reads") + .desc("Number of reads sent to the kernarg segment") + ; + kernargWrites + .name(name() + ".kernarg_writes") + .desc("Number of memory instructions sent to the kernarg segment") + ; + kernargMemInsts + .name(name() + ".kernarg_mem_insts") + .desc("Number of memory instructions sent to the kernarg segment") + ; + kernargMemInsts = kernargReads + kernargWrites; + tlbCycles .name(name() + ".tlb_cycles") .desc("total number of cycles for all uncoalesced requests") @@ -1596,6 +2105,71 @@ ComputeUnit::regStats() .desc("number of vec ops executed (e.g. WF size/inst)") ; + numVecOpsExecutedF16 + .name(name() + ".num_vec_ops_f16_executed") + .desc("number of f16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedF32 + .name(name() + ".num_vec_ops_f32_executed") + .desc("number of f32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedF64 + .name(name() + ".num_vec_ops_f64_executed") + .desc("number of f64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedFMA16 + .name(name() + ".num_vec_ops_fma16_executed") + .desc("number of fma16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedFMA32 + .name(name() + ".num_vec_ops_fma32_executed") + .desc("number of fma32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedFMA64 + .name(name() + ".num_vec_ops_fma64_executed") + .desc("number of fma64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAD16 + .name(name() + ".num_vec_ops_mad16_executed") + .desc("number of mad16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAD32 + .name(name() + ".num_vec_ops_mad32_executed") + .desc("number of mad32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAD64 + .name(name() + ".num_vec_ops_mad64_executed") + .desc("number of mad64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAC16 + .name(name() + ".num_vec_ops_mac16_executed") + .desc("number of mac16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAC32 + .name(name() + ".num_vec_ops_mac32_executed") + .desc("number of mac32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAC64 + .name(name() + ".num_vec_ops_mac64_executed") + .desc("number of mac64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedTwoOpFP + .name(name() + ".num_vec_ops_two_op_fp_executed") + .desc("number of two op FP vec ops executed (e.g. 
WF size/inst)") + ; + totalCycles .name(name() + ".num_total_cycles") .desc("number of cycles the CU ran for") @@ -1611,6 +2185,21 @@ ComputeUnit::regStats() .desc("Vector Operations per cycle (this CU only)") ; + vpc_f16 + .name(name() + ".vpc_f16") + .desc("F16 Vector Operations per cycle (this CU only)") + ; + + vpc_f32 + .name(name() + ".vpc_f32") + .desc("F32 Vector Operations per cycle (this CU only)") + ; + + vpc_f64 + .name(name() + ".vpc_f64") + .desc("F64 Vector Operations per cycle (this CU only)") + ; + numALUInstsExecuted .name(name() + ".num_alu_insts_executed") .desc("Number of dynamic non-GM memory insts executed") @@ -1623,15 +2212,30 @@ ComputeUnit::regStats() ipc = numInstrExecuted / totalCycles; vpc = numVecOpsExecuted / totalCycles; + vpc_f16 = numVecOpsExecutedF16 / totalCycles; + vpc_f32 = numVecOpsExecutedF32 / totalCycles; + vpc_f64 = numVecOpsExecutedF64 / totalCycles; numTimesWgBlockedDueVgprAlloc .name(name() + ".times_wg_blocked_due_vgpr_alloc") - .desc("Number of times WGs are blocked due to VGPR allocation per SIMD") + .desc("Number of times WGs are blocked due to VGPR allocation per " + "SIMD") + ; + + numTimesWgBlockedDueSgprAlloc + .name(name() + ".times_wg_blocked_due_sgpr_alloc") + .desc("Number of times WGs are blocked due to SGPR allocation per " + "SIMD") ; dynamicGMemInstrCnt .name(name() + ".global_mem_instr_cnt") - .desc("dynamic global memory instructions count") + .desc("dynamic non-flat global memory instruction count") + ; + + dynamicFlatMemInstrCnt + .name(name() + ".flat_global_mem_instr_cnt") + .desc("dynamic flat global memory instruction count") ; dynamicLMemInstrCnt @@ -1647,6 +2251,11 @@ ComputeUnit::regStats() .desc("number of completed wavefronts") ; + completedWGs + .name(name() + ".num_completed_wgs") + .desc("number of completed workgroups") + ; + numCASOps .name(name() + ".num_CAS_ops") .desc("number of compare and swap operations") @@ -1657,15 +2266,37 @@ ComputeUnit::regStats() .desc("number of compare and swap operations that failed") ; + headTailLatency + .init(0, 1000000, 10000) + .name(name() + ".head_tail_latency") + .desc("ticks between first and last cache block arrival at coalescer") + .flags(Stats::pdf | Stats::oneline) + ; + + waveLevelParallelism + .init(0, shader->n_wf * numVectorALUs, 1) + .name(name() + ".wlp") + .desc("wave level parallelism: count of active waves at wave launch") + ; + + instInterleave + .init(numVectorALUs, 0, 20, 1) + .name(name() + ".interleaving") + .desc("Measure of instruction interleaving per SIMD") + ; + // register stats of pipeline stages fetchStage.regStats(); scoreboardCheckStage.regStats(); scheduleStage.regStats(); execStage.regStats(); - // register stats of memory pipeline + // register stats of memory pipelines globalMemoryPipe.regStats(); localMemoryPipe.regStats(); + scalarMemoryPipe.regStats(); + + registerManager->regStats(); } void @@ -1682,6 +2313,10 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } } else { if (gpuDynInst->isALU()) { + shader->total_valu_insts++; + if (shader->total_valu_insts == shader->max_valu_insts) { + exitSimLoop("max vALU insts"); + } vALUInsts++; instCyclesVALU++; threadCyclesVALU += gpuDynInst->wavefront()->execMask().count(); @@ -1698,6 +2333,74 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } else if (gpuDynInst->isStore()) { vectorMemWrites++; } + + if (gpuDynInst->isLoad()) { + switch (gpuDynInst->executedAs()) { + case Enums::SC_SPILL: + spillReads++; + break; + case Enums::SC_GLOBAL: + globalReads++; + break; + case 
Enums::SC_GROUP: + groupReads++; + break; + case Enums::SC_PRIVATE: + privReads++; + break; + case Enums::SC_READONLY: + readonlyReads++; + break; + case Enums::SC_KERNARG: + kernargReads++; + break; + case Enums::SC_ARG: + argReads++; + break; + case Enums::SC_NONE: + /** + * this case can occur for flat mem insts + * who execute with EXEC = 0 + */ + break; + default: + fatal("%s has no valid segment\n", gpuDynInst->disassemble()); + break; + } + } else if (gpuDynInst->isStore()) { + switch (gpuDynInst->executedAs()) { + case Enums::SC_SPILL: + spillWrites++; + break; + case Enums::SC_GLOBAL: + globalWrites++; + break; + case Enums::SC_GROUP: + groupWrites++; + break; + case Enums::SC_PRIVATE: + privWrites++; + break; + case Enums::SC_READONLY: + readonlyWrites++; + break; + case Enums::SC_KERNARG: + kernargWrites++; + break; + case Enums::SC_ARG: + argWrites++; + break; + case Enums::SC_NONE: + /** + * this case can occur for flat mem insts + * who execute with EXEC = 0 + */ + break; + default: + fatal("%s has no valid segment\n", gpuDynInst->disassemble()); + break; + } + } } } @@ -1728,31 +2431,32 @@ ComputeUnit::CUExitCallback::process() *page_stat_file << std::dec << iter.second.second << std::endl; } } - } +} bool ComputeUnit::isDone() const { - for (int i = 0; i < numSIMDs; ++i) { - if (!isSimdDone(i)) { + for (int i = 0; i < numVectorALUs; ++i) { + if (!isVectorAluIdle(i)) { return false; } } - bool glbMemBusRdy = true; - for (int j = 0; j < numGlbMemUnits; ++j) { - glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy(); + // TODO: FIXME if more than 1 of any memory pipe supported + if (!srfToScalarMemPipeBus.rdy()) { + return false; + } + if (!vrfToGlobalMemPipeBus.rdy()) { + return false; } - bool locMemBusRdy = true; - for (int j = 0; j < numLocMemUnits; ++j) { - locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy(); + if (!vrfToLocalMemPipeBus.rdy()) { + return false; } - if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() || - !globalMemoryPipe.isGMStRespFIFOWrRdy() || - !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() + if (!globalMemoryPipe.isGMReqFIFOWrRdy() + || !localMemoryPipe.isLMReqFIFOWrRdy() || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || - !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) { + !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) { return false; } @@ -1760,30 +2464,19 @@ ComputeUnit::isDone() const } int32_t -ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const +ComputeUnit::getRefCounter(const uint32_t dispatchId, + const uint32_t wgId) const { return lds.getRefCounter(dispatchId, wgId); } bool -ComputeUnit::isSimdDone(uint32_t simdId) const +ComputeUnit::isVectorAluIdle(uint32_t simdId) const { - assert(simdId < numSIMDs); - - for (int i=0; i < numGlbMemUnits; ++i) { - if (!vrfToGlobalMemPipeBus[i].rdy()) - return false; - } - for (int i=0; i < numLocMemUnits; ++i) { - if (!vrfToLocalMemPipeBus[i].rdy()) - return false; - } - if (!aluPipe[simdId].rdy()) { - return false; - } + assert(simdId < numVectorALUs); for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ - if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) { + if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) { return false; } } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 49713e936..187cbc9d5 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -36,28 +36,30 @@ #include #include -#include +#include #include #include "base/callback.hh" #include 
"base/statistics.hh" #include "base/types.hh" +#include "config/the_gpu_isa.hh" #include "enums/PrefetchType.hh" #include "gpu-compute/exec_stage.hh" #include "gpu-compute/fetch_stage.hh" #include "gpu-compute/global_memory_pipeline.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/local_memory_pipeline.hh" -#include "gpu-compute/qstruct.hh" +#include "gpu-compute/register_manager.hh" +#include "gpu-compute/scalar_memory_pipeline.hh" #include "gpu-compute/schedule_stage.hh" #include "gpu-compute/scoreboard_check_stage.hh" #include "mem/port.hh" #include "mem/token_port.hh" #include "sim/clocked_object.hh" -static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; -static const int MAX_WIDTH_FOR_MEM_INST = 32; - -class NDRange; +class HSAQueueEntry; +class LdsChunk; +class ScalarRegisterFile; class Shader; class VectorRegisterFile; @@ -69,18 +71,6 @@ enum EXEC_POLICY RR }; -// List of execution units -enum EXEC_UNIT -{ - SIMD0 = 0, - SIMD1, - SIMD2, - SIMD3, - GLBMEM_PIPE, - LDSMEM_PIPE, - NUM_UNITS -}; - enum TLB_CACHE { TLB_MISS_CACHE_MISS = 0, @@ -92,32 +82,100 @@ enum TLB_CACHE class ComputeUnit : public ClockedObject { public: - FetchStage fetchStage; - ScoreboardCheckStage scoreboardCheckStage; - ScheduleStage scheduleStage; - ExecStage execStage; - GlobalMemPipeline globalMemoryPipe; - LocalMemPipeline localMemoryPipe; + + + // Execution resources + // + // The ordering of units is: + // Vector ALUs + // Scalar ALUs + // GM Pipe + // LM Pipe + // Scalar Mem Pipe + // + // Note: the ordering of units is important and the code assumes the + // above ordering. However, there may be more than one resource of + // each type (e.g., 4 VALUs or 2 SALUs) + + int numVectorGlobalMemUnits; + // Resource control for global memory to VRF data/address bus + WaitClass glbMemToVrfBus; + // Resource control for Vector Register File->Global Memory pipe buses + WaitClass vrfToGlobalMemPipeBus; + // Resource control for Vector Global Memory execution unit + WaitClass vectorGlobalMemUnit; + + int numVectorSharedMemUnits; + // Resource control for local memory to VRF data/address bus + WaitClass locMemToVrfBus; + // Resource control for Vector Register File->Local Memory pipe buses + WaitClass vrfToLocalMemPipeBus; + // Resource control for Vector Shared/Local Memory execution unit + WaitClass vectorSharedMemUnit; + + int numScalarMemUnits; + // Resource control for scalar memory to SRF data/address bus + WaitClass scalarMemToSrfBus; + // Resource control for Scalar Register File->Scalar Memory pipe buses + WaitClass srfToScalarMemPipeBus; + // Resource control for Scalar Memory execution unit + WaitClass scalarMemUnit; + + // vector ALU execution resources + int numVectorALUs; + std::vector vectorALUs; + + // scalar ALU execution resources + int numScalarALUs; + std::vector scalarALUs; + + // Return total number of execution units on this CU + int numExeUnits() const; + // index into readyList of the first memory unit + int firstMemUnit() const; + // index into readyList of the last memory unit + int lastMemUnit() const; + // index into scalarALUs vector of SALU used by the wavefront + int mapWaveToScalarAlu(Wavefront *w) const; + // index into readyList of SALU used by wavefront + int mapWaveToScalarAluGlobalIdx(Wavefront *w) const; + // index into readyList of Global Memory unit used by wavefront + int mapWaveToGlobalMem(Wavefront *w) const; + // index into readyList of Local Memory unit used by wavefront + int mapWaveToLocalMem(Wavefront *w) const; + // index into readyList of Scalar 
Memory unit used by wavefront + int mapWaveToScalarMem(Wavefront *w) const; + + int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes + int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes + int numCyclesPerStoreTransfer; // number of cycles per vector store + int numCyclesPerLoadTransfer; // number of cycles per vector load // Buffers used to communicate between various pipeline stages + // At a high level, the following intra-/inter-stage communication occurs: + // SCB to SCH: readyList provides per exec resource list of waves that + // passed dependency and readiness checks. If selected by + // scheduler, attempt to add wave to schList conditional on + // RF support. + // SCH: schList holds waves that are gathering operands or waiting + // for execution resource availability. Once ready, waves are + // placed on the dispatchList as candidates for execution. A wave + // may spend multiple cycles in SCH stage, on the schList due to + // RF access conflicts or execution resource contention. + // SCH to EX: dispatchList holds waves that are ready to be executed. + // LM/FLAT arbitration may remove an LM wave and place it + // back on the schList. RF model may also force a wave back + // to the schList if using the detailed model. + // List of waves which are ready to be scheduled. // Each execution resource has a ready list. readyList is // used to communicate between scoreboardCheck stage and // schedule stage - // TODO: make enum to index readyList std::vector> readyList; - // Stores the status of waves. A READY implies the - // wave is ready to be scheduled this cycle and - // is already present in the readyList. waveStatusList is - // used to communicate between scoreboardCheck stage and - // schedule stage - // TODO: convert std::pair to a class to increase readability - std::vector>> waveStatusList; - // List of waves which will be dispatched to - // each execution resource. A FILLED implies + // each execution resource. An EXREADY implies // dispatch list is non-empty and // execution unit has something to execute // this cycle. Currently, the dispatch list of @@ -127,32 +185,67 @@ class ComputeUnit : public ClockedObject // and exec stage // TODO: convert std::pair to a class to increase readability std::vector> dispatchList; + // track presence of dynamic instructions in the Schedule pipeline + // stage. This is used to check the readiness of the oldest, + // non-dispatched instruction of every WF in the Scoreboard stage. 
+ std::unordered_set pipeMap; + + RegisterManager* registerManager; + + FetchStage fetchStage; + ScoreboardCheckStage scoreboardCheckStage; + ScheduleStage scheduleStage; + ExecStage execStage; + GlobalMemPipeline globalMemoryPipe; + LocalMemPipeline localMemoryPipe; + ScalarMemPipeline scalarMemoryPipe; + + EventFunctionWrapper tickEvent; - int rrNextMemID; // used by RR WF exec policy to cycle through WF's - int rrNextALUWp; typedef ComputeUnitParams Params; std::vector> wfList; int cu_id; // array of vector register files, one per SIMD std::vector vrf; - // Number of vector ALU units (SIMDs) in CU - int numSIMDs; + // array of scalar register files, one per SIMD + std::vector srf; + + // Width per VALU/SIMD unit: number of work items that can be executed + // on the vector ALU simultaneously in a SIMD unit + int simdWidth; // number of pipe stages for bypassing data to next dependent single // precision vector instruction inside the vector ALU pipeline int spBypassPipeLength; // number of pipe stages for bypassing data to next dependent double // precision vector instruction inside the vector ALU pipeline int dpBypassPipeLength; - // number of cycles per issue period - int issuePeriod; + // number of pipe stages for scalar ALU + int scalarPipeStages; + // number of pipe stages for operand collection & distribution network + int operandNetworkLength; + // number of cycles per instruction issue period + Cycles issuePeriod; + + // VRF to GM Bus latency + Cycles vrf_gm_bus_latency; + // SRF to Scalar Mem Bus latency + Cycles srf_scm_bus_latency; + // VRF to LM Bus latency + Cycles vrf_lm_bus_latency; - // Number of global and local memory execution resources in CU - int numGlbMemUnits; - int numLocMemUnits; // tracks the last cycle a vector instruction was executed on a SIMD std::vector lastExecCycle; + // Track the amount of interleaving between wavefronts on each SIMD. + // This stat is sampled using instExecPerSimd to compute the number of + // instructions that have been executed on a SIMD between a WF executing + // two successive instructions. + Stats::VectorDistribution instInterleave; + + // tracks the number of dyn inst executed per SIMD + std::vector instExecPerSimd; + // true if we allow a separate TLB per lane bool perLaneTLB; // if 0, TLB prefetching is off. 
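
The ordering comment above fixes how the CU's execution resources map onto a
single flat index space (vector ALUs first, then scalar ALUs, then the global,
local and scalar memory pipes), which is what numExeUnits(), firstMemUnit(),
lastMemUnit() and the mapWaveTo*() helpers index into, and what readyList is
sized against in init(). The sketch below lays that space out explicitly; the
arithmetic is an assumption made for illustration and may not match the exact
helper implementations.

    struct ExeUnitLayout
    {
        // counts mirror the members declared in this header; the current
        // model checks (via fatal_if) that each memory pipe count is 1
        int numVectorALUs;
        int numScalarALUs;
        int numVectorGlobalMemUnits;
        int numVectorSharedMemUnits;
        int numScalarMemUnits;

        // total number of execution units, i.e. readyList entries
        int numExeUnits() const
        {
            return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits
                + numVectorSharedMemUnits + numScalarMemUnits;
        }

        // first flat index belonging to a memory pipe
        int firstMemUnit() const { return numVectorALUs + numScalarALUs; }

        // last flat index (the scalar memory pipe)
        int lastMemUnit() const { return numExeUnits() - 1; }
    };
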
@@ -166,8 +259,10 @@ class ComputeUnit : public ClockedObject Enums::PrefetchType prefetchType; EXEC_POLICY exec_policy; - bool xact_cas_mode; bool debugSegFault; + // Idle CU timeout in ticks + Tick idleCUTimeout; + int idleWfs; bool functionalTLB; bool localMemBarrier; @@ -183,91 +278,67 @@ class ComputeUnit : public ClockedObject Shader *shader; uint32_t barrier_id; - // vector of Vector ALU (MACC) pipelines - std::vector aluPipe; - // minimum issue period per SIMD unit (in cycles) - std::vector wfWait; - - // Resource control for Vector Register File->Global Memory pipe buses - std::vector vrfToGlobalMemPipeBus; - // Resource control for Vector Register File->Local Memory pipe buses - std::vector vrfToLocalMemPipeBus; - int nextGlbMemBus; - int nextLocMemBus; - // Resource control for global memory to VRF data/address bus - WaitClass glbMemToVrfBus; - // Resource control for local memory to VRF data/address bus - WaitClass locMemToVrfBus; - - uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes - uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes - uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store - uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load Tick req_tick_latency; Tick resp_tick_latency; - // number of vector registers being reserved for each SIMD unit + /** + * Number of WFs to schedule to each SIMD. This vector is populated + * by hasDispResources(), and consumed by the subsequent call to + * dispWorkgroup(), to schedule the specified number of WFs to the + * SIMD units. Entry I provides the number of WFs to schedule to SIMD I. + */ + std::vector numWfsToSched; + + // number of currently reserved vector registers per SIMD unit std::vector vectorRegsReserved; + // number of currently reserved scalar registers per SIMD unit + std::vector scalarRegsReserved; // number of vector registers per SIMD unit - uint32_t numVecRegsPerSimd; - // Support for scheduling VGPR status update events - std::vector > regIdxVec; - std::vector timestampVec; - std::vector statusVec; + int numVecRegsPerSimd; + // number of available scalar registers per SIMD unit + int numScalarRegsPerSimd; - void - registerEvent(uint32_t simdId, - uint32_t regIdx, - uint32_t operandSize, - uint64_t when, - uint8_t newStatus) { - regIdxVec.push_back(std::make_pair(simdId, regIdx)); - timestampVec.push_back(when); - statusVec.push_back(newStatus); - if (operandSize > 4) { - regIdxVec.push_back(std::make_pair(simdId, - ((regIdx + 1) % - numVecRegsPerSimd))); - timestampVec.push_back(when); - statusVec.push_back(newStatus); - } - } - - void updateEvents(); + void updateReadyList(int unitId); // this hash map will keep track of page divergence // per memory instruction per wavefront. The hash map // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 
std::map pagesTouched; + void insertInPipeMap(Wavefront *w); + void deleteFromPipeMap(Wavefront *w); + ComputeUnit(const Params *p); ~ComputeUnit(); - int spBypassLength() { return spBypassPipeLength; }; - int dpBypassLength() { return dpBypassPipeLength; }; - int storeBusLength() { return numCyclesPerStoreTransfer; }; - int loadBusLength() { return numCyclesPerLoadTransfer; }; - int wfSize() const { return wavefrontSize; }; - void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + // Timing Functions + int oprNetPipeLength() const { return operandNetworkLength; } + int simdUnitWidth() const { return simdWidth; } + int spBypassLength() const { return spBypassPipeLength; } + int dpBypassLength() const { return dpBypassPipeLength; } + int scalarPipeLength() const { return scalarPipeStages; } + int storeBusLength() const { return numCyclesPerStoreTransfer; } + int loadBusLength() const { return numCyclesPerLoadTransfer; } + int wfSize() const { return wavefrontSize; } + void exec(); void initiateFetch(Wavefront *wavefront); void fetch(PacketPtr pkt, Wavefront *wavefront); - void fillKernelState(Wavefront *w, NDRange *ndr); + void fillKernelState(Wavefront *w, HSAQueueEntry *task); void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, - NDRange *ndr); - - void StartWorkgroup(NDRange *ndr); - int ReadyWorkgroup(NDRange *ndr); - - bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } - bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } - bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } - int GlbMemUnitId() { return GLBMEM_PIPE; } - int ShrMemUnitId() { return LDSMEM_PIPE; } - int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } - int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } + HSAQueueEntry *task, bool fetchContext=false); + + void doInvalidate(RequestPtr req, int kernId); + void doFlush(GPUDynInstPtr gpuDynInst); + + void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false); + bool hasDispResources(HSAQueueEntry *task); + + int cacheLineSize() const { return _cacheLineSize; } + int getCacheLineBits() const { return cacheLineBits; } + /* This function cycles through all the wavefronts in all the phases to see * if all of the wavefronts which should be associated with one barrier * (denoted with _barrier_id), are all at the same barrier in the program @@ -275,14 +346,15 @@ class ComputeUnit : public ClockedObject * return true. 
*/ int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); - bool cedeSIMD(int simdId, int wfSlotId); - template void doSmReturn(GPUDynInstPtr gpuDynInst); + template + void doSmReturn(GPUDynInstPtr gpuDynInst); + virtual void init() override; void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); - void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt); void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, - bool kernelLaunch=true, + bool kernelMemSync, RequestPtr req=nullptr); void handleMemPacket(PacketPtr pkt, int memport_index); bool processTimingPacket(PacketPtr pkt); @@ -292,7 +364,7 @@ class ComputeUnit : public ClockedObject MasterID masterId() { return _masterId; } bool isDone() const; - bool isSimdDone(uint32_t) const; + bool isVectorAluIdle(uint32_t simdId) const; protected: MasterID _masterId; @@ -323,6 +395,44 @@ class ComputeUnit : public ClockedObject Stats::Scalar scalarMemReads; Stats::Formula scalarMemReadsPerWF; + Stats::Formula vectorMemReadsPerKiloInst; + Stats::Formula vectorMemWritesPerKiloInst; + Stats::Formula vectorMemInstsPerKiloInst; + Stats::Formula scalarMemReadsPerKiloInst; + Stats::Formula scalarMemWritesPerKiloInst; + Stats::Formula scalarMemInstsPerKiloInst; + + // Cycles required to send register source (addr and data) from + // register files to memory pipeline, per SIMD. + Stats::Vector instCyclesVMemPerSimd; + Stats::Vector instCyclesScMemPerSimd; + Stats::Vector instCyclesLdsPerSimd; + + Stats::Scalar globalReads; + Stats::Scalar globalWrites; + Stats::Formula globalMemInsts; + Stats::Scalar argReads; + Stats::Scalar argWrites; + Stats::Formula argMemInsts; + Stats::Scalar spillReads; + Stats::Scalar spillWrites; + Stats::Formula spillMemInsts; + Stats::Scalar groupReads; + Stats::Scalar groupWrites; + Stats::Formula groupMemInsts; + Stats::Scalar privReads; + Stats::Scalar privWrites; + Stats::Formula privMemInsts; + Stats::Scalar readonlyReads; + Stats::Scalar readonlyWrites; + Stats::Formula readonlyMemInsts; + Stats::Scalar kernargReads; + Stats::Scalar kernargWrites; + Stats::Formula kernargMemInsts; + + int activeWaves; + Stats::Distribution waveLevelParallelism; + void updateInstStats(GPUDynInstPtr gpuDynInst); // the following stats compute the avg. TLB accesslatency per @@ -339,21 +449,48 @@ class ComputeUnit : public ClockedObject // over all memory instructions executed over all wavefronts // how many touched 0-4 pages, 4-8, ..., 60-64 pages Stats::Distribution pageDivergenceDist; + // count of non-flat global memory vector instructions executed Stats::Scalar dynamicGMemInstrCnt; + // count of flat global memory vector instructions executed + Stats::Scalar dynamicFlatMemInstrCnt; Stats::Scalar dynamicLMemInstrCnt; Stats::Scalar wgBlockedDueLdsAllocation; - // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active - // when the instruction is committed, this number is still incremented by 1 + // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are + // active when the instruction is committed, this number is still + // incremented by 1 Stats::Scalar numInstrExecuted; // Number of cycles among successive instruction executions across all // wavefronts of the same CU Stats::Distribution execRateDist; // number of individual vector operations executed Stats::Scalar numVecOpsExecuted; + // number of individual f16 vector operations executed + Stats::Scalar numVecOpsExecutedF16; + // number of individual f32 vector operations executed + Stats::Scalar numVecOpsExecutedF32; + // number of individual f64 vector operations executed + Stats::Scalar numVecOpsExecutedF64; + // number of individual FMA 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedFMA16; + Stats::Scalar numVecOpsExecutedFMA32; + Stats::Scalar numVecOpsExecutedFMA64; + // number of individual MAC 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedMAC16; + Stats::Scalar numVecOpsExecutedMAC32; + Stats::Scalar numVecOpsExecutedMAC64; + // number of individual MAD 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedMAD16; + Stats::Scalar numVecOpsExecutedMAD32; + Stats::Scalar numVecOpsExecutedMAD64; + // total number of two op FP vector operations executed + Stats::Scalar numVecOpsExecutedTwoOpFP; // Total cycles that something is running on the GPU Stats::Scalar totalCycles; Stats::Formula vpc; // vector ops per cycle + Stats::Formula vpc_f16; // vector ops per cycle + Stats::Formula vpc_f32; // vector ops per cycle + Stats::Formula vpc_f64; // vector ops per cycle Stats::Formula ipc; // vector instructions per cycle Stats::Distribution controlFlowDivergenceDist; Stats::Distribution activeLanesPerGMemInstrDist; @@ -362,20 +499,16 @@ class ComputeUnit : public ClockedObject Stats::Formula numALUInstsExecuted; // number of times a WG can not start due to lack of free VGPRs in SIMDs Stats::Scalar numTimesWgBlockedDueVgprAlloc; + // number of times a WG can not start due to lack of free SGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueSgprAlloc; Stats::Scalar numCASOps; Stats::Scalar numFailedCASOps; Stats::Scalar completedWfs; - // flag per vector SIMD unit that is set when there is at least one - // WV that has a vector ALU instruction as the oldest in its - // Instruction Buffer: Defined in the Scoreboard stage, consumed - // by the Execute stage. 
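For reference, the vpc*/ipc formulas declared above are simple ratios of the counters to totalCycles ("vector ops per cycle" and "vector instructions per cycle" per the comments). A standalone computation with invented counter values shows how the per-type variants relate to the overall rate:

// Standalone illustration of the vpc/ipc ratios implied by the counters
// above. The numbers are invented purely for the example.
#include <cstdint>
#include <cstdio>

int main()
{
    std::uint64_t numVecOpsExecuted    = 640000;  // individual lane ops
    std::uint64_t numVecOpsExecutedF32 = 480000;  // f32 subset of the above
    std::uint64_t numInstrExecuted     = 10000;   // wave-level instructions
    std::uint64_t totalCycles          = 50000;   // cycles the CU was active

    double vpc     = double(numVecOpsExecuted)    / totalCycles;
    double vpc_f32 = double(numVecOpsExecutedF32) / totalCycles;
    double ipc     = double(numInstrExecuted)     / totalCycles;

    std::printf("vpc=%.2f vpc_f32=%.2f ipc=%.2f\n", vpc, vpc_f32, ipc);
    return 0;
}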
- std::vector vectorAluInstAvail; - // number of available (oldest) LDS instructions that could have - // been issued to the LDS at a specific issue slot - int shrMemInstAvail; - // number of available Global memory instructions that could have - // been issued to TCP at a specific issue slot - int glbMemInstAvail; + Stats::Scalar completedWGs; + + // distrubtion in latency difference between first and last cache block + // arrival ticks + Stats::Distribution headTailLatency; void regStats() override; @@ -389,8 +522,6 @@ class ComputeUnit : public ClockedObject int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; - int cacheLineSize() const { return _cacheLineSize; } - bool sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); @@ -486,6 +617,56 @@ class ComputeUnit : public ClockedObject }; + // Scalar data cache access port + class ScalarDataPort : public MasterPort + { + public: + ScalarDataPort(const std::string &_name, ComputeUnit *_cu, + PortID _index) + : MasterPort(_name, _cu, _index), computeUnit(_cu), index(_index) + { + (void)index; + } + + bool recvTimingResp(PacketPtr pkt) override; + void recvReqRetry() override; + + struct SenderState : public Packet::SenderState + { + SenderState(GPUDynInstPtr gpuDynInst, + Packet::SenderState *sender_state=nullptr) + : _gpuDynInst(gpuDynInst), saved(sender_state) + { + } + + GPUDynInstPtr _gpuDynInst; + Packet::SenderState *saved; + }; + + class MemReqEvent : public Event + { + private: + ScalarDataPort *scalarDataPort; + PacketPtr pkt; + + public: + MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt) + : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + std::deque retries; + + private: + ComputeUnit *computeUnit; + PortID index; + }; + // Instruction cache access port class SQCPort : public MasterPort { @@ -500,10 +681,13 @@ class ComputeUnit : public ClockedObject { Wavefront *wavefront; Packet::SenderState *saved; + // kernel id to be used in handling I-Cache invalidate response + int kernId; SenderState(Wavefront *_wavefront, Packet::SenderState - *sender_state=nullptr) - : wavefront(_wavefront), saved(sender_state) { } + *sender_state=nullptr, int _kernId=-1) + : wavefront(_wavefront), saved(sender_state), + kernId(_kernId){ } }; std::deque> retries; @@ -575,6 +759,34 @@ class ComputeUnit : public ClockedObject virtual void recvReqRetry(); }; + class ScalarDTLBPort : public MasterPort + { + public: + ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu) + : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) + { + } + + struct SenderState : public Packet::SenderState + { + SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { } + GPUDynInstPtr _gpuDynInst; + }; + + bool recvTimingResp(PacketPtr pkt) override; + void recvReqRetry() override { assert(false); } + + bool isStalled() const { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + std::deque retries; + + private: + ComputeUnit *computeUnit; + bool stalled; + }; + class ITLBPort : public MasterPort { public: @@ -710,6 +922,10 @@ class ComputeUnit : public ClockedObject std::vector memPort; // port to the TLB hierarchy (i.e., the L1 TLB) std::vector tlbPort; + // port to the scalar data cache + ScalarDataPort *scalarDataPort; + // port to the scalar data TLB + ScalarDTLBPort *scalarDTLBPort; // port to the SQC (i.e. 
the I-cache) SQCPort *sqcPort; // port to the SQC TLB (there's a separate TLB for each I-cache) @@ -726,6 +942,14 @@ class ComputeUnit : public ClockedObject tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), this, idx); return *tlbPort[idx]; + } else if (if_name == "scalar_port") { + scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(), + idx), this, idx); + return *scalarDataPort; + } else if (if_name == "scalar_tlb_port") { + scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()), + this); + return *scalarDTLBPort; } else if (if_name == "sqc_port") { sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), this, idx); @@ -746,32 +970,18 @@ class ComputeUnit : public ClockedObject } } - // xact_cas_load() - class waveIdentifier - { - public: - waveIdentifier() { } - waveIdentifier(int _simdId, int _wfSlotId) - : simdId(_simdId), wfSlotId(_wfSlotId) { } - - int simdId; - int wfSlotId; - }; - - class waveQueue - { - public: - std::list waveIDQueue; - }; - std::map xactCasLoadMap; - - uint64_t getAndIncSeqNum() { return globalSeqNum++; } + InstSeqNum getAndIncSeqNum() { return globalSeqNum++; } private: const int _cacheLineSize; - uint64_t globalSeqNum; + int cacheLineBits; + InstSeqNum globalSeqNum; int wavefrontSize; - GPUStaticInst *kernelLaunchInst; + + // hold the time of the arrival of the first cache block related to + // a particular GPUDynInst. This is used to calculate the difference + // between the first and last chace block arrival times. + std::map headTailMap; }; #endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index 99bffbd40..51f5e97fe 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -34,66 +34,76 @@ #include "gpu-compute/dispatcher.hh" -#include "cpu/base.hh" #include "debug/GPUDisp.hh" -#include "gpu-compute/cl_driver.hh" -#include "gpu-compute/cl_event.hh" +#include "debug/GPUKernelInfo.hh" +#include "debug/GPUWgLatency.hh" +#include "gpu-compute/gpu_command_processor.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/wavefront.hh" -#include "mem/packet_access.hh" - -GpuDispatcher *GpuDispatcher::instance = nullptr; - -GpuDispatcher::GpuDispatcher(const Params *p) - : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")), - pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), - dispatchCount(0), dispatchActive(false), cpu(p->cpu), - shader(p->shader_pointer), driver(p->cl_driver), - tickEvent([this]{ exec(); }, "GPU Dispatcher tick", - false, Event::CPU_Tick_Pri) +#include "sim/syscall_emul_buf.hh" +#include "sim/system.hh" + +GPUDispatcher::GPUDispatcher(const Params *p) + : SimObject(p), shader(nullptr), gpuCmdProc(nullptr), + tickEvent([this]{ exec(); }, + "GPU Dispatcher tick", false, Event::CPU_Tick_Pri), + dispatchActive(false) { - shader->handshake(this); - driver->handshake(this); - - ndRange.wg_disp_rem = false; - ndRange.globalWgId = 0; - schedule(&tickEvent, 0); +} - // translation port for the dispatcher - tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); +GPUDispatcher::~GPUDispatcher() +{ +} - num_kernelLaunched +void +GPUDispatcher::regStats() +{ + numKernelLaunched .name(name() + ".num_kernel_launched") .desc("number of kernel launched") ; + + cyclesWaitingForDispatch + .name(name() + ".cycles_wait_dispatch") + .desc("number of cycles with outstanding wavefronts " + "that are waiting to be dispatched") + ; +} + +HSAQueueEntry* +GPUDispatcher::hsaTask(int 
disp_id) +{ + assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end()); + return hsaQueueEntries[disp_id]; } -GpuDispatcher *GpuDispatcherParams::create() +void +GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc) { - GpuDispatcher *dispatcher = new GpuDispatcher(this); - GpuDispatcher::setInstance(dispatcher); + gpuCmdProc = gpu_cmd_proc; +} - return GpuDispatcher::getInstance(); +void +GPUDispatcher::setShader(Shader *new_shader) +{ + shader = new_shader; } void -GpuDispatcher::serialize(CheckpointOut &cp) const +GPUDispatcher::serialize(CheckpointOut &cp) const { Tick event_tick = 0; - if (ndRange.wg_disp_rem) - fatal("Checkpointing not supported during active workgroup execution"); - if (tickEvent.scheduled()) event_tick = tickEvent.when(); SERIALIZE_SCALAR(event_tick); - } void -GpuDispatcher::unserialize(CheckpointIn &cp) +GPUDispatcher::unserialize(CheckpointIn &cp) { Tick event_tick; @@ -102,288 +112,256 @@ GpuDispatcher::unserialize(CheckpointIn &cp) UNSERIALIZE_SCALAR(event_tick); - if (event_tick) + if (event_tick) { schedule(&tickEvent, event_tick); + } } -AddrRangeList -GpuDispatcher::getAddrRanges() const -{ - AddrRangeList ranges; - - DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", - pioAddr, pioSize); - - ranges.push_back(RangeSize(pioAddr, pioSize)); - - return ranges; -} - -Tick -GpuDispatcher::read(PacketPtr pkt) +/** + * After all relevant HSA data structures have been traversed/extracted + * from memory by the CP, dispatch() is called on the dispatcher. This will + * schedule a dispatch event that, when triggered, will attempt to dispatch + * the WGs associated with the given task to the CUs. + */ +void +GPUDispatcher::dispatch(HSAQueueEntry *task) { - assert(pkt->getAddr() >= pioAddr); - assert(pkt->getAddr() < pioAddr + pioSize); - - int offset = pkt->getAddr() - pioAddr; - pkt->allocate(); + ++numKernelLaunched; - DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); + DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n", + task->kernelName(), task->dispatchId()); - if (offset < 8) { - assert(!offset); - assert(pkt->getSize() == 8); + execIds.push(task->dispatchId()); + dispatchActive = true; + hsaQueueEntries.emplace(task->dispatchId(), task); - uint64_t retval = dispatchActive; - pkt->setLE(retval); - } else { - offset -= 8; - assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); - char *curTaskPtr = (char*)&curTask; - - memcpy(pkt->getPtr(), curTaskPtr + offset, pkt->getSize()); + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); } - - pkt->makeAtomicResponse(); - - return pioDelay; } -Tick -GpuDispatcher::write(PacketPtr pkt) +void +GPUDispatcher::exec() { - assert(pkt->getAddr() >= pioAddr); - assert(pkt->getAddr() < pioAddr + pioSize); - - int offset = pkt->getAddr() - pioAddr; - -#if TRACING_ON - uint64_t data_val = 0; - - switch (pkt->getSize()) { - case 1: - data_val = pkt->getLE(); - break; - case 2: - data_val = pkt->getLE(); - break; - case 4: - data_val = pkt->getLE(); - break; - case 8: - data_val = pkt->getLE(); - break; - default: - DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); - } + int fail_count(0); - DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, - pkt->getSize()); -#endif - if (!offset) { - static int nextId = 0; - - // The depends field of the qstruct, which was previously unused, is - // used to communicate with simulated application. 
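The comment above summarizes the hand-off: the command processor extracts the kernel's HSA structures, then dispatch() records the task and schedules the dispatch event if one is not already pending. Below is a compressed standalone model of that queueing step; it is not gem5 code, MiniTask and MiniDispatcher are invented names, and the event is reduced to a boolean flag.

// Standalone model of the dispatch() hand-off described above.
// Not gem5 code; the "event" is just a scheduled/not-scheduled flag.
#include <cstdio>
#include <queue>
#include <unordered_map>

struct MiniTask { int dispatchId; };

class MiniDispatcher
{
  public:
    void
    dispatch(MiniTask *task)
    {
        execIds.push(task->dispatchId);
        tasks.emplace(task->dispatchId, task);
        dispatchActive = true;
        if (!tickScheduled) {
            tickScheduled = true;     // real code schedules tickEvent here
            std::printf("dispatch event scheduled for kernel %d\n",
                        task->dispatchId);
        }
    }

  private:
    std::queue<int> execIds;                    // kernels waiting to launch
    std::unordered_map<int, MiniTask*> tasks;   // dispatch id -> task
    bool dispatchActive = false;
    bool tickScheduled = false;
};

int main()
{
    MiniDispatcher disp;
    MiniTask t0{0}, t1{1};
    disp.dispatch(&t0);
    disp.dispatch(&t1);   // second call finds the event already "scheduled"
    return 0;
}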
- if (curTask.depends) { - HostState hs; - shader->ReadMem((uint64_t)(curTask.depends), &hs, - sizeof(HostState), 0); + /** + * There are potentially multiple outstanding kernel launches. + * It is possible that the workgroups in a different kernel + * can fit on the GPU even if another kernel's workgroups cannot + */ + DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); - // update event start time (in nano-seconds) - uint64_t start = curTick() / 1000; + if (execIds.size() > 0) { + ++cyclesWaitingForDispatch; + } - shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), - &start, sizeof(uint64_t), 0); + /** + * dispatch work cannot start until the kernel's invalidate is + * completely finished; hence, kernel will always initiates + * invalidate first and keeps waiting until inv done + */ + while (execIds.size() > fail_count) { + int exec_id = execIds.front(); + auto task = hsaQueueEntries[exec_id]; + bool launched(false); + + // invalidate is needed before starting dispatch + if (shader->impl_kern_boundary_sync) { + // try to invalidate cache + shader->prepareInvalidate(task); + } else { + // kern boundary sync is not set, skip invalidate + task->markInvDone(); } - // launch kernel - ++num_kernelLaunched; - - NDRange *ndr = &(ndRangeMap[nextId]); - // copy dispatch info - ndr->q = curTask; - - // update the numDispTask polled by the runtime - accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); + /** + * invalidate is still ongoing, put the kernel on the queue to + * retry later + */ + if (!task->isInvDone()){ + execIds.push(exec_id); + ++fail_count; - ndr->numWgTotal = 1; + DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending" + " invalidate requests\n", exec_id, task->outstandingInvs()); - for (int i = 0; i < 3; ++i) { - ndr->wgId[i] = 0; - ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); - ndr->numWgTotal *= ndr->numWg[i]; + // try the next kernel_id + execIds.pop(); + continue; } - ndr->numWgCompleted = 0; - ndr->globalWgId = 0; - ndr->wg_disp_rem = true; - ndr->execDone = false; - ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; - ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; - ndr->dispatchId = nextId; - ndr->curCid = pkt->req->contextId(); - DPRINTF(GPUDisp, "launching kernel %d\n",nextId); - execIds.push(nextId); - ++nextId; - - dispatchActive = true; - - if (!tickEvent.scheduled()) { - schedule(&tickEvent, curTick() + shader->ticks(1)); - } - } else { - // populate current task struct - // first 64 bits are launch reg - offset -= 8; - assert(offset < sizeof(HsaQueueEntry)); - char *curTaskPtr = (char*)&curTask; - memcpy(curTaskPtr + offset, pkt->getPtr(), pkt->getSize()); - } - - pkt->makeAtomicResponse(); - - return pioDelay; -} - - -Port & -GpuDispatcher::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "translation_port") { - return *tlbPort; - } - - return DmaDevice::getPort(if_name, idx); -} - -void -GpuDispatcher::exec() -{ - int fail_count = 0; - - // There are potentially multiple outstanding kernel launches. 
- // It is possible that the workgroups in a different kernel - // can fit on the GPU even if another kernel's workgroups cannot - DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); - - while (execIds.size() > fail_count) { - int execId = execIds.front(); - - while (ndRangeMap[execId].wg_disp_rem) { - //update the thread context - shader->updateContext(ndRangeMap[execId].curCid); - - // attempt to dispatch_workgroup - if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { - // if we failed try the next kernel, - // it may have smaller workgroups. - // put it on the queue to rety latter - DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); - execIds.push(execId); + // kernel invalidate is done, start workgroup dispatch + while (!task->dispComplete()) { + // update the thread context + shader->updateContext(task->contextId()); + + // attempt to dispatch workgroup + DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n", + curTick(), exec_id); + + if (!shader->dispatchWorkgroups(task)) { + /** + * if we failed try the next kernel, + * it may have smaller workgroups. + * put it on the queue to rety latter + */ + DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id); + execIds.push(exec_id); ++fail_count; break; + } else if (!launched) { + launched = true; + DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id); } } - // let's try the next kernel_id + + // try the next kernel_id execIds.pop(); } DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); - if (doneIds.size() && cpu) { - shader->hostWakeUp(cpu); - } - while (doneIds.size()) { - // wakeup the CPU if any Kernels completed this cycle - DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); + DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front()); doneIds.pop(); } } -void -GpuDispatcher::notifyWgCompl(Wavefront *w) +bool +GPUDispatcher::isReachingKernelEnd(Wavefront *wf) { - int kern_id = w->kernId; - DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); - assert(ndRangeMap[kern_id].dispatchId == kern_id); - ndRangeMap[kern_id].numWgCompleted++; - - if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { - ndRangeMap[kern_id].execDone = true; - doneIds.push(kern_id); - - if (ndRangeMap[kern_id].addrToNotify) { - accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, - 0); - } - - accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); + int kern_id = wf->kernId; + assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end()); + auto task = hsaQueueEntries[kern_id]; + assert(task->dispatchId() == kern_id); + + /** + * whether the next workgroup is the final one in the kernel, + * +1 as we check first before taking action + */ + return (task->numWgCompleted() + 1 == task->numWgTotal()); +} - // update event end time (in nano-seconds) - if (ndRangeMap[kern_id].q.depends) { - HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; - uint64_t event; - shader->ReadMem((uint64_t)(&host_state->event), &event, - sizeof(uint64_t), 0); +/** + * update the counter of oustanding inv requests for the kernel + * kern_id: kernel id + * val: +1/-1, increment or decrement the counter (default: -1) + */ +void +GPUDispatcher::updateInvCounter(int kern_id, int val) { + assert(val == -1 || val == 1); - uint64_t end = curTick() / 1000; + auto task = hsaQueueEntries[kern_id]; + task->updateOutstandingInvs(val); - shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, - sizeof(uint64_t), 0); - } + // kernel invalidate is done, schedule dispatch work 
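updateInvCounter() above tracks outstanding cache invalidates for a kernel: the comment specifies val as +1 or -1 (default -1), presumably +1 as invalidate requests are issued and -1 as their responses return, and the code just below re-arms the dispatch event once the count reaches zero. A standalone sketch of that bookkeeping, with invented call sites:

// Standalone sketch of outstanding-invalidate bookkeeping: +1 per request
// sent, -1 per response, dispatch re-armed when the count reaches zero.
// Hypothetical names; only the +1/-1 contract comes from the code above.
#include <cassert>
#include <cstdio>

class MiniInvTracker
{
  public:
    void
    updateInvCounter(int val)
    {
        assert(val == -1 || val == 1);
        outstandingInvs += val;
        if (outstandingInvs == 0) {
            std::printf("invalidate done, scheduling dispatch\n");
        }
    }

  private:
    int outstandingInvs = 0;
};

int main()
{
    MiniInvTracker t;
    for (int i = 0; i < 4; ++i) t.updateInvCounter(1);   // requests sent
    for (int i = 0; i < 4; ++i) t.updateInvCounter(-1);  // responses arrive
    return 0;
}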
+ if (task->isInvDone() && !tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); } +} - if (!tickEvent.scheduled()) { - schedule(&tickEvent, curTick() + shader->ticks(1)); - } +/** + * update the counter of oustanding wb requests for the kernel + * kern_id: kernel id + * val: +1/-1, increment or decrement the counter (default: -1) + * + * return true if all wbs are done for the kernel + */ +bool +GPUDispatcher::updateWbCounter(int kern_id, int val) { + assert(val == -1 || val == 1); + + auto task = hsaQueueEntries[kern_id]; + task->updateOutstandingWbs(val); + + // true: WB is done, false: WB is still ongoing + return (task->outstandingWbs() == 0); } -void -GpuDispatcher::scheduleDispatch() -{ - if (!tickEvent.scheduled()) - schedule(&tickEvent, curTick() + shader->ticks(1)); +/** + * get kernel's outstanding cache writeback requests + */ +int +GPUDispatcher::getOutstandingWbs(int kernId) { + auto task = hsaQueueEntries[kernId]; + + return task->outstandingWbs(); } +/** + * When an end program instruction detects that the last WF in + * a WG has completed it will call this method on the dispatcher. + * If we detect that this is the last WG for the given task, then + * we ring the completion signal, which is used by the CPU to + * synchronize with the GPU. The HSAPP is also notified that the + * task has completed so it can be removed from its task queues. + */ void -GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) +GPUDispatcher::notifyWgCompl(Wavefront *wf) { - if (cpu) { - if (off) { - shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, - true); - val += off; + int kern_id = wf->kernId; + DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId); + auto task = hsaQueueEntries[kern_id]; + assert(task->dispatchId() == kern_id); + task->notifyWgCompleted(); + + DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n", + curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id); + + if (task->numWgCompleted() == task->numWgTotal()) { + // Notify the HSA PP that this kernel is complete + gpuCmdProc->hsaPacketProc() + .finishPkt(task->dispPktPtr(), task->queueId()); + if (task->completionSignal()) { + // The signal value is aligned 8 bytes from + // the actual handle in the runtime + Addr signal_addr = task->completionSignal() + sizeof(Addr); + DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering " + "completion signal: %x!\n", signal_addr); + + /** + * HACK: The semantics of the HSA signal is to decrement + * the current signal value. We cheat here and read out + * he value from main memory using functional access and + * then just DMA the decremented value. This is because + * the DMA controller does not currently support GPU + * atomics. + */ + auto *tc = gpuCmdProc->system()->threads[0]; + auto &virt_proxy = tc->getVirtProxy(); + TypedBufferArg prev_signal(signal_addr); + prev_signal.copyIn(virt_proxy); + + Addr *new_signal = new Addr; + *new_signal = (Addr)*prev_signal - 1; + + gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr, + new_signal, 0); + } else { + DPRINTF(GPUDisp, "HSA AQL Kernel Complete! 
No completion " + "signal\n"); } - shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); - } else { - panic("Cannot find host"); + DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n", + curTick(), kern_id); + DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id); } -} - -// helper functions for driver to retrieve GPU attributes -int -GpuDispatcher::getNumCUs() -{ - return shader->cuList.size(); -} -int -GpuDispatcher::wfSize() const -{ - return shader->cuList[0]->wfSize(); + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); + } } void -GpuDispatcher::setFuncargsSize(int funcargs_size) +GPUDispatcher::scheduleDispatch() { - shader->funcargs_size = funcargs_size; + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); + } } -uint32_t -GpuDispatcher::getStaticContextSize() const +GPUDispatcher *GPUDispatcherParams::create() { - return shader->cuList[0]->wfList[0][0]->getStaticContextSize(); + return new GPUDispatcher(this); } diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh index 1ffe81c10..cd282b9cb 100644 --- a/src/gpu-compute/dispatcher.hh +++ b/src/gpu-compute/dispatcher.hh @@ -31,125 +31,69 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __GPU_DISPATCHER_HH__ -#define __GPU_DISPATCHER_HH__ +/** + * @file + * The GPUDispatcher is the component of the shader that is responsible + * for creating and dispatching WGs to the compute units. If all WGs in + * a kernel cannot be dispatched simultaneously, then the dispatcher will + * keep track of all pending WGs and dispatch them as resources become + * available. + */ + +#ifndef __GPU_COMPUTE_DISPATCHER_HH__ +#define __GPU_COMPUTE_DISPATCHER_HH__ #include +#include #include #include "base/statistics.hh" -#include "dev/dma_device.hh" -#include "gpu-compute/compute_unit.hh" -#include "gpu-compute/ndrange.hh" -#include "gpu-compute/qstruct.hh" -#include "mem/port.hh" -#include "params/GpuDispatcher.hh" +#include "dev/hsa/hsa_packet.hh" +#include "params/GPUDispatcher.hh" +#include "sim/sim_object.hh" -class BaseCPU; +class GPUCommandProcessor; +class HSAQueueEntry; class Shader; +class Wavefront; -class GpuDispatcher : public DmaDevice +class GPUDispatcher : public SimObject { - public: - typedef GpuDispatcherParams Params; - - MasterID masterId() { return _masterId; } - - protected: - MasterID _masterId; - - // Base and length of PIO register space - Addr pioAddr; - Addr pioSize; - Tick pioDelay; - - HsaQueueEntry curTask; - - std::unordered_map ndRangeMap; - NDRange ndRange; - - // list of kernel_ids to launch - std::queue execIds; - // list of kernel_ids that have finished - std::queue doneIds; - - uint64_t dispatchCount; - // is there a kernel in execution? - bool dispatchActive; - - BaseCPU *cpu; - Shader *shader; - ClDriver *driver; - EventFunctionWrapper tickEvent; - - - static GpuDispatcher *instance; - - // sycall emulation mode can have only 1 application running(?) 
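Tying together the completion path above: isReachingKernelEnd() compares numWgCompleted() + 1 against numWgTotal() because it is consulted before the current workgroup is retired, while notifyWgCompl() does the actual retiring and, on the last workgroup, rings the completion signal. A standalone sketch of that ordering (class and method names are invented stand-ins):

// Standalone sketch of the workgroup completion bookkeeping described
// above: the "reaching end" test runs before the WG is counted, hence +1.
#include <cassert>
#include <cstdio>

class MiniKernelTask
{
  public:
    explicit MiniKernelTask(int num_wg) : wgTotal(num_wg) { }

    // Called while the last WF of a WG finishes, before notifyWgCompl().
    bool isReachingKernelEnd() const { return wgCompleted + 1 == wgTotal; }

    void
    notifyWgCompl()
    {
        ++wgCompleted;
        if (wgCompleted == wgTotal) {
            std::printf("kernel complete, ring completion signal\n");
        }
    }

  private:
    int wgCompleted = 0;
    int wgTotal;
};

int main()
{
    MiniKernelTask task(2);
    assert(!task.isReachingKernelEnd());
    task.notifyWgCompl();
    assert(task.isReachingKernelEnd());  // the next WG is the last one
    task.notifyWgCompl();
    return 0;
}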
- // else we have to do some pid based tagging - // unused - typedef std::unordered_map TranslationBuffer; - TranslationBuffer tlb; - - public: - /*statistics*/ - Stats::Scalar num_kernelLaunched; - GpuDispatcher(const Params *p); - - ~GpuDispatcher() { } - - void exec(); - virtual void serialize(CheckpointOut &cp) const override; - virtual void unserialize(CheckpointIn &cp) override; - void notifyWgCompl(Wavefront *w); - void scheduleDispatch(); - void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off); - - // using singleton so that glue code can pass pointer locations - // to the dispatcher. when there are multiple dispatchers, we can - // call something like getInstance(index) - static void - setInstance(GpuDispatcher *_instance) - { - instance = _instance; - } - - static GpuDispatcher* getInstance() { return instance; } - - class TLBPort : public MasterPort - { - public: - - TLBPort(const std::string &_name, GpuDispatcher *_dispatcher) - : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { } - - protected: - GpuDispatcher *dispatcher; - - virtual bool recvTimingResp(PacketPtr pkt) { return true; } - virtual Tick recvAtomic(PacketPtr pkt) { return 0; } - virtual void recvFunctional(PacketPtr pkt) { } - virtual void recvRangeChange() { } - virtual void recvReqRetry() { } - - }; - - TLBPort *tlbPort; - - Port &getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - AddrRangeList getAddrRanges() const override; - Tick read(PacketPtr pkt) override; - Tick write(PacketPtr pkt) override; - - // helper functions to retrieve/set GPU attributes - int getNumCUs(); - int wfSize() const; - void setFuncargsSize(int funcargs_size); - - /** Returns the size of the static hardware context of a wavefront */ - uint32_t getStaticContextSize() const; + public: + typedef GPUDispatcherParams Params; + + GPUDispatcher(const Params *p); + ~GPUDispatcher(); + + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; + void regStats() override; + void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc); + void setShader(Shader *new_shader); + void exec(); + bool isReachingKernelEnd(Wavefront *wf); + void updateInvCounter(int kern_id, int val=-1); + bool updateWbCounter(int kern_id, int val=-1); + int getOutstandingWbs(int kern_id); + void notifyWgCompl(Wavefront *wf); + void scheduleDispatch(); + void dispatch(HSAQueueEntry *task); + HSAQueueEntry* hsaTask(int disp_id); + + private: + Shader *shader; + GPUCommandProcessor *gpuCmdProc; + EventFunctionWrapper tickEvent; + std::unordered_map hsaQueueEntries; + // list of kernel_ids to launch + std::queue execIds; + // list of kernel_ids that have finished + std::queue doneIds; + // is there a kernel in execution? 
+ bool dispatchActive; + /*statistics*/ + Stats::Scalar numKernelLaunched; + Stats::Scalar cyclesWaitingForDispatch; }; -#endif // __GPU_DISPATCHER_HH__ +#endif // __GPU_COMPUTE_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index 0640083f8..2dece180b 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -33,13 +33,15 @@ #include "gpu-compute/exec_stage.hh" +#include + +#include "base/trace.hh" +#include "debug/GPUSched.hh" #include "gpu-compute/compute_unit.hh" +#include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs), - numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), - vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr), - shrMemInstAvail(nullptr), lastTimeInstExecuted(false), +ExecStage::ExecStage(const ComputeUnitParams *p) : lastTimeInstExecuted(false), thisTimeInstExecuted(false), instrExecuted (false), executionResourcesUsed(0) { @@ -53,37 +55,18 @@ ExecStage::init(ComputeUnit *cu) computeUnit = cu; _name = computeUnit->name() + ".ExecStage"; dispatchList = &computeUnit->dispatchList; - vectorAluInstAvail = &(computeUnit->vectorAluInstAvail); - glbMemInstAvail= &(computeUnit->glbMemInstAvail); - shrMemInstAvail= &(computeUnit->shrMemInstAvail); idle_dur = 0; } void ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { if (stage == IdleExec) { - // count cycles of no vector ALU instruction executed - // even if one was the oldest in a WV of that vector SIMD unit - if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) { - numCyclesWithNoInstrTypeIssued[unitId]++; - } - - // count cycles of no global memory (vector) instruction executed - // even if one was the oldest in a WV of that vector SIMD unit - if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) { - numCyclesWithNoInstrTypeIssued[unitId]++; - (*glbMemInstAvail)--; - } - - // count cycles of no shared memory (vector) instruction executed - // even if one was the oldest in a WV of that vector SIMD unit - if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) { - numCyclesWithNoInstrTypeIssued[unitId]++; - (*shrMemInstAvail)--; - } + // count cycles when no instruction to a specific execution resource + // is executed + numCyclesWithNoInstrTypeIssued[unitId]++; } else if (stage == BusyExec) { - // count the number of cycles an instruction to a specific unit - // was issued + // count the number of cycles an instruction to a specific execution + // resource type was issued numCyclesWithInstrTypeIssued[unitId]++; thisTimeInstExecuted = true; instrExecuted = true; @@ -102,14 +85,13 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { } lastTimeInstExecuted = thisTimeInstExecuted; - // track the number of cycles we either issued one vector instruction - // or issued no instructions at all + // track the number of cycles we either issued at least + // instruction or issued no instructions at all if (instrExecuted) { numCyclesWithInstrIssued++; } else { numCyclesWithNoIssue++; } - spc.sample(executionResourcesUsed); } } @@ -122,25 +104,86 @@ ExecStage::initStatistics() thisTimeInstExecuted = false; } +std::string +ExecStage::dispStatusToStr(int i) +{ + std::string s("INVALID"); + switch (i) { + case EMPTY: + s = "EMPTY"; + break; + case SKIP: + s = "SKIP"; + break; + case EXREADY: + s = "EXREADY"; + break; + } + return s; +} + +void +ExecStage::dumpDispList() +{ + std::stringstream ss; + bool 
empty = true; + for (int i = 0; i < computeUnit->numExeUnits(); i++) { + DISPATCH_STATUS s = dispatchList->at(i).second; + ss << i << ": " << dispStatusToStr(s); + if (s != EMPTY) { + empty = false; + Wavefront *w = dispatchList->at(i).first; + ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: "; + ss << (w->instructionBuffer.front())->seqNum() << ": "; + ss << (w->instructionBuffer.front())->disassemble(); + } + ss << "\n"; + } + if (!empty) { + DPRINTF(GPUSched, "Dispatch List:\n%s", ss.str()); + } +} + void ExecStage::exec() { initStatistics(); - - for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) { - // if dispatch list for this execution resource is empty, - // skip this execution resource this cycle - if (dispatchList->at(unitId).second == EMPTY) { - collectStatistics(IdleExec, unitId); - continue; - } - - collectStatistics(BusyExec, unitId); - // execute an instruction for the WF - dispatchList->at(unitId).first->exec(); - // clear the dispatch list entry - dispatchList->at(unitId).second = EMPTY; - dispatchList->at(unitId).first = (Wavefront*)nullptr; + if (Debug::GPUSched) { + dumpDispList(); + } + for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { + DISPATCH_STATUS s = dispatchList->at(unitId).second; + switch (s) { + case EMPTY: + // Do not execute if empty, waiting for VRF reads, + // or LM tied to GM waiting for VRF reads + collectStatistics(IdleExec, unitId); + break; + case EXREADY: + { + collectStatistics(BusyExec, unitId); + Wavefront *w = dispatchList->at(unitId).first; + DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n", + unitId, w->simdId, w->wfDynId, + (w->instructionBuffer.front())->disassemble()); + DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId); + dispatchList->at(unitId).first->exec(); + (computeUnit->scheduleStage).deleteFromSch(w); + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first->freeResources(); + dispatchList->at(unitId).first = nullptr; + break; + } + case SKIP: + collectStatistics(BusyExec, unitId); + DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId); + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first->freeResources(); + dispatchList->at(unitId).first = nullptr; + break; + default: + panic("Unknown dispatch status in exec()\n"); + } } collectStatistics(PostExec, 0); @@ -165,7 +208,7 @@ ExecStage::regStats() ; spc - .init(0, numSIMDs + numMemUnits, 1) + .init(0, computeUnit->numExeUnits(), 1) .name(name() + ".spc") .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") ; @@ -177,25 +220,36 @@ ExecStage::regStats() ; numCyclesWithInstrTypeIssued - .init(numSIMDs + numMemUnits) - .name(name() + ".num_cycles_with_instrtype_issue") - .desc("Number of cycles at least one instruction of specific type " - "issued") + .init(computeUnit->numExeUnits()) + .name(name() + ".num_cycles_issue_exec_rsrc") + .desc("Number of cycles at least one instruction issued to " + "execution resource type") ; numCyclesWithNoInstrTypeIssued - .init(numSIMDs + numMemUnits) - .name(name() + ".num_cycles_with_instr_type_no_issue") - .desc("Number of cycles no instruction of specific type issued") + .init(computeUnit->numExeUnits()) + .name(name() + ".num_cycles_no_issue_exec_rsrc") + .desc("Number of clks no instructions issued to execution " + "resource type") ; - for (int i = 0; i < numSIMDs; ++i) { - numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i)); - numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + int c = 0; + for (int i 
= 0; i < computeUnit->numVectorALUs; i++,c++) { + std::string s = "VectorALU" + std::to_string(i); + numCyclesWithNoInstrTypeIssued.subname(c, s); + numCyclesWithInstrTypeIssued.subname(c, s); + } + for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) { + std::string s = "ScalarALU" + std::to_string(i); + numCyclesWithNoInstrTypeIssued.subname(c, s); + numCyclesWithInstrTypeIssued.subname(c, s); } + numCyclesWithNoInstrTypeIssued.subname(c, "VectorMemPipe"); + numCyclesWithInstrTypeIssued.subname(c++, "VectorMemPipe"); + + numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe"); + numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe"); - numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM")); - numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM")); - numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); - numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); + numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe"); + numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe"); } diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh index 2e14a542e..670252cde 100644 --- a/src/gpu-compute/exec_stage.hh +++ b/src/gpu-compute/exec_stage.hh @@ -35,6 +35,7 @@ #define __EXEC_STAGE_HH__ #include +#include #include #include @@ -53,8 +54,9 @@ enum STAT_STATUS enum DISPATCH_STATUS { - EMPTY = 0, - FILLED + EMPTY = 0, // no wave present in dispatchList slot + EXREADY, // wave ready for execution + SKIP, // extra memory resource needed, Shared Mem. only }; // Execution stage. @@ -72,18 +74,21 @@ class ExecStage void init(ComputeUnit *cu); void exec(); + std::string dispStatusToStr(int j); + void dumpDispList(); + std::string name() { return _name; } void regStats(); // number of idle cycles Stats::Scalar numCyclesWithNoIssue; // number of busy cycles Stats::Scalar numCyclesWithInstrIssued; - // number of cycles (per execution unit) during which at least one - // instruction was issued to that unit + // number of cycles during which at least one + // instruction was issued to an execution resource type Stats::Vector numCyclesWithInstrTypeIssued; - // number of idle cycles (per execution unit) during which the unit issued - // no instruction targeting that unit, even though there is at least one - // Wavefront with such an instruction as the oldest + // number of idle cycles during which the scheduler + // issued no instructions targeting a specific + // execution resource type Stats::Vector numCyclesWithNoInstrTypeIssued; // SIMDs active per cycle Stats::Distribution spc; @@ -92,11 +97,6 @@ class ExecStage void collectStatistics(enum STAT_STATUS stage, int unitId); void initStatistics(); ComputeUnit *computeUnit; - uint32_t numSIMDs; - - // Number of memory execution resources; - // both global and local memory execution resources in CU - uint32_t numMemUnits; // List of waves which will be dispatched to // each execution resource. 
A FILLED implies @@ -108,18 +108,12 @@ class ExecStage // dispatchList is used to communicate between schedule // and exec stage std::vector> *dispatchList; - // flag per vector SIMD unit that is set when there is at least one - // WV that has a vector ALU instruction as the oldest in its - // Instruction Buffer - std::vector *vectorAluInstAvail; - int *glbMemInstAvail; - int *shrMemInstAvail; bool lastTimeInstExecuted; bool thisTimeInstExecuted; bool instrExecuted; Stats::Scalar numTransActiveIdle; Stats::Distribution idleDur; - uint32_t executionResourcesUsed; + int executionResourcesUsed; uint64_t idle_dur; std::string _name; }; diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc index 4a2d4233f..cf0b39e70 100644 --- a/src/gpu-compute/fetch_stage.cc +++ b/src/gpu-compute/fetch_stage.cc @@ -36,18 +36,18 @@ #include "gpu-compute/compute_unit.hh" #include "gpu-compute/wavefront.hh" -FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs), - computeUnit(nullptr) +FetchStage::FetchStage(const ComputeUnitParams* p) : + numVectorALUs(p->num_SIMDs), computeUnit(nullptr) { - for (int j = 0; j < numSIMDs; ++j) { + for (int j = 0; j < numVectorALUs; ++j) { FetchUnit newFetchUnit(p); - fetchUnit.push_back(newFetchUnit); + _fetchUnit.push_back(newFetchUnit); } } FetchStage::~FetchStage() { - fetchUnit.clear(); + _fetchUnit.clear(); } void @@ -56,17 +56,17 @@ FetchStage::init(ComputeUnit *cu) computeUnit = cu; _name = computeUnit->name() + ".FetchStage"; - for (int j = 0; j < numSIMDs; ++j) { - fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); - fetchUnit[j].init(computeUnit); + for (int j = 0; j < numVectorALUs; ++j) { + _fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + _fetchUnit[j].init(computeUnit); } } void FetchStage::exec() { - for (int j = 0; j < numSIMDs; ++j) { - fetchUnit[j].exec(); + for (int j = 0; j < numVectorALUs; ++j) { + _fetchUnit[j].exec(); } } @@ -83,13 +83,13 @@ FetchStage::processFetchReturn(PacketPtr pkt) instFetchInstReturned.sample(num_instructions); uint32_t simdId = wavefront->simdId; - fetchUnit[simdId].processFetchReturn(pkt); + _fetchUnit[simdId].processFetchReturn(pkt); } void FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) { - fetchUnit[wavefront->simdId].fetch(pkt, wavefront); + _fetchUnit[wavefront->simdId].fetch(pkt, wavefront); } void diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh index 310ce6f60..afaf81b5a 100644 --- a/src/gpu-compute/fetch_stage.hh +++ b/src/gpu-compute/fetch_stage.hh @@ -62,14 +62,15 @@ class FetchStage std::string name() { return _name; } void regStats(); Stats::Distribution instFetchInstReturned; + FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); } private: - uint32_t numSIMDs; + int numVectorALUs; ComputeUnit *computeUnit; // List of fetch units. 
A fetch unit is - // instantiated per SIMD - std::vector fetchUnit; + // instantiated per VALU/SIMD + std::vector _fetchUnit; std::string _name; }; diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index c567b71db..fb04cd27e 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -45,11 +45,9 @@ uint32_t FetchUnit::globalFetchUnitID; -FetchUnit::FetchUnit(const ComputeUnitParams* params) : - timingSim(true), - computeUnit(nullptr), - fetchScheduler(params), - waveList(nullptr) +FetchUnit::FetchUnit(const ComputeUnitParams* params) + : timingSim(true), computeUnit(nullptr), fetchScheduler(params), + waveList(nullptr), fetchDepth(params->fetch_depth) { } @@ -66,9 +64,14 @@ FetchUnit::init(ComputeUnit *cu) timingSim = computeUnit->shader->timingSim; fetchQueue.clear(); fetchStatusQueue.resize(computeUnit->shader->n_wf); - - for (int j = 0; j < computeUnit->shader->n_wf; ++j) { - fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); + fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc()); + + for (int i = 0; i < computeUnit->shader->n_wf; ++i) { + Wavefront *wf = waveList->at(i); + assert(wf->wfSlotId == i); + fetchStatusQueue[i] = std::make_pair(wf, false); + fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf); + fetchBuf[i].decoder(&decoder); } fetchScheduler.bindList(&fetchQueue); @@ -77,6 +80,23 @@ FetchUnit::init(ComputeUnit *cu) void FetchUnit::exec() { + /** + * now we check if any of the fetch buffers have + * buffered instruction data that can be decoded + * and sent to its wavefront's instruction buffer. + * then we check if any of the fetch buffer entries + * can be released. we only check if we can + * release a buffer + */ + for (auto &fetch_buf : fetchBuf) { + if (!fetch_buf.hasFreeSpace()) { + fetch_buf.checkWaveReleaseBuf(); + } + if (fetch_buf.hasFetchDataToProcess()) { + fetch_buf.decodeInsts(); + } + } + // re-evaluate waves which are marked as not ready for fetch for (int j = 0; j < computeUnit->shader->n_wf; ++j) { // Following code assumes 64-bit opertaion and all insts are @@ -88,9 +108,10 @@ FetchUnit::exec() // 4 or less instructions and it can not have any branches to // prevent speculative instruction fetches if (!fetchStatusQueue[j].second) { - if (curWave->status == Wavefront::S_RUNNING && - curWave->instructionBuffer.size() <= 4 && - !curWave->instructionBufferHasBranch() && + if ((curWave->getStatus() == Wavefront::S_RUNNING || + curWave->getStatus() == Wavefront::S_WAITCNT) && + fetchBuf[j].hasFreeSpace() && + !curWave->stopFetch() && !curWave->pendingFetch) { fetchQueue.push_back(curWave); fetchStatusQueue[j].second = true; @@ -111,45 +132,38 @@ FetchUnit::exec() void FetchUnit::initiateFetch(Wavefront *wavefront) { - // calculate the virtual address to fetch from the SQC - Addr vaddr = wavefront->pc(); + assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace()); /** - * the instruction buffer holds one instruction per entry, regardless - * of the underlying instruction's size. the PC, however, addresses - * instrutions on a 32b granularity so we must account for that here. - */ - for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) { - vaddr += - wavefront->instructionBuffer.at(i)->staticInstruction()->instSize(); - } - vaddr = wavefront->basePtr + vaddr; + * calculate the virtual address to fetch from the SQC. the fetch + * buffer holds a configurable number of cache lines. 
we start + * fetching at the address of the cache line immediately following + * the buffered line(s). + */ + Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr(); - DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", - computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); + // this should already be aligned to a cache line + assert(vaddr == makeLineAddress(vaddr, + computeUnit->getCacheLineBits())); - // Since this is an instruction prefetch, if you're split then just finish - // out the current line. - int block_size = computeUnit->cacheLineSize(); - // check for split accesses - Addr split_addr = roundDown(vaddr + block_size - 1, block_size); - int size = block_size; + // shouldn't be fetching a line that is already buffered + assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr)); - if (split_addr > vaddr) { - // misaligned access, just grab the rest of the line - size = split_addr - vaddr; - } + fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr); + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch " + "from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); // set up virtual request RequestPtr req = std::make_shared( - vaddr, size, Request::INST_FETCH, + vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH, computeUnit->masterId(), 0, 0, nullptr); PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - // This fetchBlock is kind of faux right now - because the translations so - // far don't actually return Data - uint64_t fetchBlock; - pkt->dataStatic(&fetchBlock); if (timingSim) { // SenderState needed on Return @@ -210,19 +224,23 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr()); - // this is necessary because the GPU TLB receives packets instead of - // requests. when the translation is complete, all relevent fields in the - // request will be populated, but not in the packet. here we create the - // new packet so we can set the size, addr, and proper flags. + /** + * this is necessary because the GPU TLB receives packets instead of + * requests. when the translation is complete, all relevent fields in + * the request will be populated, but not in the packet. here we create + * the new packet so we can set the size, addr, and proper flags. + */ PacketPtr oldPkt = pkt; pkt = new Packet(oldPkt->req, oldPkt->cmd); delete oldPkt; - TheGpuISA::RawMachInst *data = - new TheGpuISA::RawMachInst[pkt->req->getSize() / - sizeof(TheGpuISA::RawMachInst)]; - - pkt->dataDynamic(data); + /** + * we should have reserved an entry in the fetch buffer + * for this cache line. here we get the pointer to the + * entry used to buffer this request's line data. 
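Because every fetch above is for a whole, line-aligned block, the address arithmetic reduces to masking off the low cache-line-offset bits; allocateBuf() further down requires the line size to be a power of two and derives cacheLineBits with floorLog2 for exactly this reason. A standalone illustration of the alignment and of the "next line after the last one fetched" rule (the 64-byte line size is an assumption for the example):

// Standalone illustration of the line alignment used by the fetch buffer:
// with a power-of-two line size, aligning an address is a mask of the low
// floorLog2(lineSize) bits, and the next line to fetch is one line later.
#include <cassert>
#include <cstdint>

using Addr = std::uint64_t;

static Addr
lineAddress(Addr addr, unsigned line_bits)
{
    return addr & ~((Addr(1) << line_bits) - 1);
}

int main()
{
    const unsigned lineBits = 6;                  // 64-byte lines (assumed)
    const Addr lineSize = Addr(1) << lineBits;

    Addr pc = 0x1fac;                             // arbitrary wave PC
    Addr firstLine = lineAddress(pc, lineBits);   // line holding the PC
    Addr nextLine = firstLine + lineSize;         // line fetched after it

    assert(firstLine == 0x1f80);
    assert(nextLine == 0x1fc0);
    assert(lineAddress(nextLine, lineBits) == nextLine);  // already aligned
    return 0;
}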
+ */ + pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId) + .reservedBuf(pkt->req->getVaddr())); // New SenderState for the memory access pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); @@ -257,47 +275,15 @@ FetchUnit::processFetchReturn(PacketPtr pkt) Wavefront *wavefront = sender_state->wavefront; DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " - "%d bytes, %d instructions!\n", computeUnit->cu_id, - wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), - pkt->req->getSize(), pkt->req->getSize() / - sizeof(TheGpuISA::RawMachInst)); + "%d bytes!\n", computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize()); if (wavefront->dropFetch) { assert(wavefront->instructionBuffer.empty()); + assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess()); wavefront->dropFetch = false; } else { - TheGpuISA::RawMachInst *inst_index_ptr = - (TheGpuISA::RawMachInst*)pkt->getPtr(); - - assert(wavefront->instructionBuffer.size() <= 4); - - for (int i = 0; i < pkt->req->getSize() / - sizeof(TheGpuISA::RawMachInst); ++i) { - GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); - - assert(inst_ptr); - - if (inst_ptr->instSize() == 8) { - /** - * this instruction occupies 2 consecutive - * entries in the instruction array, the - * second of which contains a nullptr. so if - * this inst is 8 bytes we advance two entries - * instead of 1 - */ - ++i; - } - - DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", - computeUnit->cu_id, wavefront->simdId, - wavefront->wfSlotId, inst_ptr->disassemble()); - - GPUDynInstPtr gpuDynInst = - std::make_shared(computeUnit, wavefront, inst_ptr, - computeUnit->getAndIncSeqNum()); - - wavefront->instructionBuffer.push_back(gpuDynInst); - } + fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr()); } wavefront->pendingFetch = false; @@ -306,8 +292,337 @@ FetchUnit::processFetchReturn(PacketPtr pkt) delete pkt; } +void +FetchUnit::flushBuf(int wfSlotId) +{ + fetchBuf.at(wfSlotId).flushBuf(); +} + void FetchUnit::bindWaveList(std::vector *wave_list) { waveList = wave_list; } + +/** FetchBufDesc */ +void +FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size, + Wavefront *wf) +{ + wavefront = wf; + fetchDepth = fetch_depth; + maxIbSize = wavefront->maxIbSize; + cacheLineSize = cache_line_size; + maxFbSize = cacheLineSize * fetchDepth; + + // Calculate the number of bits to address a cache line + panic_if(!isPowerOf2(cacheLineSize), + "Cache line size should be a power of two."); + cacheLineBits = floorLog2(cacheLineSize); + + bufStart = new uint8_t[maxFbSize]; + readPtr = bufStart; + bufEnd = bufStart + maxFbSize; + + for (int i = 0; i < fetchDepth; ++i) { + freeList.emplace_back(readPtr + i * cacheLineSize); + } +} + +void +FetchUnit::FetchBufDesc::flushBuf() +{ + restartFromBranch = true; + /** + * free list may have some entries + * so we clear it here to avoid duplicates + */ + freeList.clear(); + bufferedPCs.clear(); + reservedPCs.clear(); + readPtr = bufStart; + + for (int i = 0; i < fetchDepth; ++i) { + freeList.push_back(bufStart + i * cacheLineSize); + } + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch " + "buffer\n", wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId); +} + +Addr +FetchUnit::FetchBufDesc::nextFetchAddr() +{ + Addr next_line = 0; + + if (bufferedAndReservedLines()) { + Addr last_line_fetched = 0; + if (!reservedLines()) { + /** + * get the PC of the most recently fetched cache line, + * then return the 
address of the next line. + */ + last_line_fetched = bufferedPCs.rbegin()->first; + } else { + last_line_fetched = reservedPCs.rbegin()->first; + } + + next_line = last_line_fetched + cacheLineSize; + + /** + * should not be trying to fetch a line that has already + * been fetched. + */ + assert(bufferedPCs.find(next_line) == bufferedPCs.end()); + assert(reservedPCs.find(next_line) == reservedPCs.end()); + } else { + /** + * we do not have any buffered cache lines yet, so we + * assume this is the initial fetch, or the first fetch + * after a branch, and get the PC directly from the WF. + * in the case of a branch, we may not start at the + * beginning of a cache line, so we adjust the readPtr by + * the current PC's offset from the start of the line. + */ + next_line = makeLineAddress(wavefront->pc(), cacheLineBits); + readPtr = bufStart; + + /** + * if we are here we have no buffered lines. in the case we flushed + * the buffer due to a branch, we may need to start fetching from + * some offset from the start of the fetch buffer, so we adjust for + * that here. + */ + if (restartFromBranch) { + restartFromBranch = false; + int byte_offset + = wavefront->pc() - makeLineAddress(wavefront->pc(), + cacheLineBits); + readPtr += byte_offset; + } + } + + return next_line; +} + +void +FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr) +{ + // we should have free buffer space, and the line + // at vaddr should not already be cached. + assert(hasFreeSpace()); + assert(bufferedPCs.find(vaddr) == bufferedPCs.end()); + assert(reservedPCs.find(vaddr) == reservedPCs.end()); + assert(bufferedAndReservedLines() < fetchDepth); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry " + "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId, vaddr); + + /** + * we reserve buffer space, by moving it out of the + * free list, however we do not mark the buffered + * line as valid until the fetch unit for this buffer + * has receieved the response from the memory system. + */ + uint8_t *inst_buf = freeList.front(); + reservedPCs.emplace(vaddr, inst_buf); + freeList.pop_front(); +} + +void +FetchUnit::FetchBufDesc::fetchDone(Addr vaddr) +{ + assert(bufferedPCs.find(vaddr) == bufferedPCs.end()); + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n", + wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId, vaddr); + + /** + * this address should have an entry reserved in the + * fetch buffer already, however it should be invalid + * until the fetch completes. 
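reserveBuf() above and the fetchDone() routine that follows move a fetch buffer entry through three states: on the free list, reserved while the fetch is outstanding, and buffered once the line data is valid. The standalone model below mirrors the container choices (a free list plus two address-keyed maps) but everything else, including the fixed 64-byte entries, is simplified for illustration:

// Standalone model of the fetch buffer entry lifecycle sketched above:
// free -> reserved (request outstanding) -> buffered (line data valid).
#include <cassert>
#include <cstdint>
#include <list>
#include <map>

using Addr = std::uint64_t;

class MiniFetchBuf
{
  public:
    explicit MiniFetchBuf(int depth)
    {
        for (int i = 0; i < depth; ++i)
            freeList.push_back(new std::uint8_t[64]());
    }

    ~MiniFetchBuf()
    {
        for (auto *buf : freeList) delete[] buf;
        for (auto &p : reservedPCs) delete[] p.second;
        for (auto &p : bufferedPCs) delete[] p.second;
    }

    bool hasFreeSpace() const { return !freeList.empty(); }

    // Claim an entry for the line at vaddr before sending the fetch.
    void
    reserveBuf(Addr vaddr)
    {
        assert(hasFreeSpace());
        reservedPCs.emplace(vaddr, freeList.front());
        freeList.pop_front();
    }

    // The response for vaddr arrived; its entry becomes valid.
    void
    fetchDone(Addr vaddr)
    {
        auto it = reservedPCs.find(vaddr);
        assert(it != reservedPCs.end());
        bufferedPCs.emplace(vaddr, it->second);
        reservedPCs.erase(it);
    }

  private:
    std::list<std::uint8_t*> freeList;
    std::map<Addr, std::uint8_t*> reservedPCs;   // lines still in flight
    std::map<Addr, std::uint8_t*> bufferedPCs;   // valid, decodable lines
};

int main()
{
    MiniFetchBuf buf(2);
    buf.reserveBuf(0x1000);
    buf.fetchDone(0x1000);
    assert(buf.hasFreeSpace());
    return 0;
}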
+ */ + auto reserved_pc = reservedPCs.find(vaddr); + assert(reserved_pc != reservedPCs.end()); + bufferedPCs.emplace(vaddr, reserved_pc->second); + + if (readPtr == bufEnd) { + readPtr = bufStart; + } + + reserved_pc->second = nullptr; + reservedPCs.erase(reserved_pc); +} + +bool +FetchUnit::FetchBufDesc::hasFetchDataToProcess() const +{ + return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst); +} + +void +FetchUnit::FetchBufDesc::checkWaveReleaseBuf() +{ + Addr cur_wave_pc = roundDown(wavefront->pc(), + wavefront->computeUnit->cacheLineSize()); + if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) { + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still " + "being fetched.\n", wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId, cur_wave_pc); + + // should be reserved, but not buffered yet + assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end()); + + return; + } + + auto current_buffered_pc = bufferedPCs.find(cur_wave_pc); + auto oldest_buffered_pc = bufferedPCs.begin(); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x" + "(PC = %#x) can be released.\n", wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc, + wavefront->pc()); + +#ifdef DEBUG + int idx = 0; + for (const auto &buf_pc : bufferedPCs) { + DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first); + ++idx; + } +#endif + + // if we haven't buffered data for this PC, we shouldn't + // be fetching from it. + assert(current_buffered_pc != bufferedPCs.end()); + + /** + * we're using a std::map so the addresses are sorted. if this + * PC is not the oldest one in the map, we must be fetching from + * a newer block, and we can release the oldest PC's fetch buffer + * entry back to the free list. + */ + if (current_buffered_pc != oldest_buffered_pc) { + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, " + "removing it from the fetch buffer.\n", wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, + oldest_buffered_pc->first); + + freeList.emplace_back(oldest_buffered_pc->second); + oldest_buffered_pc->second = nullptr; + bufferedPCs.erase(oldest_buffered_pc); + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n", + wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId, + bufferedLines()); + } +} + +void +FetchUnit::FetchBufDesc::decodeInsts() +{ + assert(readPtr); + + if (splitDecode()) { + decodeSplitInst(); + } + + while (wavefront->instructionBuffer.size() < maxIbSize + && hasFetchDataToProcess()) { + if (splitDecode()) { + decodeSplitInst(); + } else { + TheGpuISA::MachInst mach_inst + = reinterpret_cast(readPtr); + GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst); + readPtr += gpu_static_inst->instSize(); + + assert(readPtr <= bufEnd); + + GPUDynInstPtr gpu_dyn_inst + = std::make_shared(wavefront->computeUnit, + wavefront, gpu_static_inst, + wavefront->computeUnit-> + getAndIncSeqNum()); + wavefront->instructionBuffer.push_back(gpu_dyn_inst); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). 
" + "%d bytes remain.\n", wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, + gpu_static_inst->disassemble(), + gpu_static_inst->instSize(), + fetchBytesRemaining()); + } + } +} + +void +FetchUnit::FetchBufDesc::decodeSplitInst() +{ + TheGpuISA::RawMachInst split_inst = 0; + int dword_size = sizeof(uint32_t); + int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size; + + for (int i = 0; i < num_dwords; ++i) { + ((uint32_t*)(&split_inst))[i] = *reinterpret_cast(readPtr); + if (readPtr + dword_size >= bufEnd) { + readPtr = bufStart; + } + } + + assert(readPtr == bufStart); + + TheGpuISA::MachInst mach_inst + = reinterpret_cast(&split_inst); + GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst); + readPtr += (gpu_static_inst->instSize() - dword_size); + assert(readPtr < bufEnd); + + GPUDynInstPtr gpu_dyn_inst + = std::make_shared(wavefront->computeUnit, + wavefront, gpu_static_inst, + wavefront->computeUnit-> + getAndIncSeqNum()); + wavefront->instructionBuffer.push_back(gpu_dyn_inst); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) " + "(%d bytes). %d bytes remain in %d buffered lines.\n", + wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId, + gpu_static_inst->disassemble(), split_inst, + gpu_static_inst->instSize(), fetchBytesRemaining(), + bufferedLines()); +} + +bool +FetchUnit::FetchBufDesc::splitDecode() const +{ + /** + * if a read of a raw instruction would go beyond the end + * of the fetch buffer, then we must perform a split decode. + */ + bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd; + + return is_split; +} + +int +FetchUnit::FetchBufDesc::fetchBytesRemaining() const +{ + int bytes_remaining = 0; + + if (bufferedLines() && readPtr != bufEnd) { + auto last_buf_pc = bufferedPCs.rbegin(); + uint8_t *end_ptr = last_buf_pc->second + cacheLineSize; + int byte_diff = end_ptr - readPtr; + + if (end_ptr > readPtr) { + bytes_remaining = byte_diff; + } else if (end_ptr < readPtr) { + bytes_remaining = bufferedBytes() + byte_diff; + } + } + + assert(bytes_remaining <= bufferedBytes()); + return bytes_remaining; +} diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh index 48ffdc110..2cfe3f0fe 100644 --- a/src/gpu-compute/fetch_unit.hh +++ b/src/gpu-compute/fetch_unit.hh @@ -36,7 +36,6 @@ #include #include -#include #include "arch/gpu_decoder.hh" #include "base/statistics.hh" @@ -58,9 +57,170 @@ class FetchUnit void initiateFetch(Wavefront *wavefront); void fetch(PacketPtr pkt, Wavefront *wavefront); void processFetchReturn(PacketPtr pkt); + void flushBuf(int wfSlotId); static uint32_t globalFetchUnitID; private: + /** + * fetch buffer descriptor. holds buffered + * instruction data in the fetch unit. + */ + class FetchBufDesc + { + public: + FetchBufDesc() : bufStart(nullptr), bufEnd(nullptr), + readPtr(nullptr), fetchDepth(0), maxIbSize(0), maxFbSize(0), + cacheLineSize(0), restartFromBranch(false), wavefront(nullptr), + _decoder(nullptr) + { + } + + ~FetchBufDesc() + { + delete[] bufStart; + } + + /** + * allocate the fetch buffer space, and set the fetch depth + * (number of lines that may be buffered), fetch size + * (cache line size), and parent WF for this fetch buffer. 
+ */ + void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf); + + int + bufferedAndReservedLines() const + { + return bufferedLines() + reservedLines(); + } + + int bufferedLines() const { return bufferedPCs.size(); } + int bufferedBytes() const { return bufferedLines() * cacheLineSize; } + int reservedLines() const { return reservedPCs.size(); } + bool hasFreeSpace() const { return !freeList.empty(); } + void flushBuf(); + Addr nextFetchAddr(); + + /** + * reserve an entry in the fetch buffer for PC = vaddr, + */ + void reserveBuf(Addr vaddr); + + /** + * return a pointer to the raw fetch buffer data. + * this allows the fetch pkt to use this data directly + * to avoid unnecessary memcpy and malloc/new. + */ + uint8_t* + reservedBuf(Addr vaddr) const + { + auto reserved_pc = reservedPCs.find(vaddr); + assert(reserved_pc != reservedPCs.end()); + assert(reserved_pc == reservedPCs.begin()); + + return reserved_pc->second; + } + + void fetchDone(Addr vaddr); + + /** + * checks if the buffer contains valid data. this essentially + * tells fetch when there is data remaining that needs to be + * decoded into the WF's IB. + */ + bool hasFetchDataToProcess() const; + + /** + * each time the fetch stage is ticked, we check if there + * are any data in the fetch buffer that may be decoded and + * sent to the IB. because we are modeling the fetch buffer + * as a circular buffer, it is possible that an instruction + * can straddle the end/beginning of the fetch buffer, so + * decodeSplitInsts() handles that case. + */ + void decodeInsts(); + + /** + * checks if the wavefront can release any of its fetch + * buffer entries. this will occur when the WF's PC goes + * beyond any of the currently buffered cache lines. + */ + void checkWaveReleaseBuf(); + + void + decoder(TheGpuISA::Decoder *dec) + { + _decoder = dec; + } + + bool + pcBuffered(Addr pc) const + { + bool buffered = bufferedPCs.find(pc) != bufferedPCs.end() + && reservedPCs.find(pc) != reservedPCs.end(); + + return buffered; + } + + /** + * calculates the number of fetched bytes that have yet + * to be decoded. + */ + int fetchBytesRemaining() const; + + private: + void decodeSplitInst(); + + /** + * check if the next instruction to be processed out of + * the fetch buffer is split across the end/beginning of + * the fetch buffer. + */ + bool splitDecode() const; + + /** + * the set of PCs (fetch addresses) that are currently + * buffered. bufferedPCs are valid, reservedPCs are + * waiting for their buffers to be filled with valid + * fetch data. + */ + std::map bufferedPCs; + std::map reservedPCs; + + /** + * represents the fetch buffer free list. holds buffer space + * that is currently free. each pointer in this array must + * have enough space to hold a cache line. in reality we + * have one actual fetch buffer: 'bufStart', these pointers + * point to addresses within bufStart that are aligned to the + * cache line size. + */ + std::deque freeList; + + /** + * raw instruction buffer. holds cache line data associated with + * the set of PCs (fetch addresses) that are buffered here. + */ + uint8_t *bufStart; + uint8_t *bufEnd; + /** + * pointer that points to the next chunk of inst data to be + * decoded. 
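// A much-simplified standalone analogue of the reserve/fill/release flow
// managed by this class (all names below are illustrative, not part of
// gem5): line-sized slots come off a free list when a fetch is issued,
// become valid when the response returns, and are recycled once the
// wave's PC has moved past them.

#include <cassert>
#include <cstdint>
#include <deque>
#include <map>

struct ToyFetchBuf
{
    std::deque<uint8_t*> freeList;             // unused line-sized slots
    std::map<uint64_t, uint8_t*> reservedPCs;  // fetches in flight
    std::map<uint64_t, uint8_t*> bufferedPCs;  // valid lines, sorted by PC

    void reserve(uint64_t line_addr)
    {
        assert(!freeList.empty());
        reservedPCs.emplace(line_addr, freeList.front());
        freeList.pop_front();
    }

    void fetchDone(uint64_t line_addr)
    {
        auto it = reservedPCs.find(line_addr);
        assert(it != reservedPCs.end());
        bufferedPCs.emplace(line_addr, it->second);
        reservedPCs.erase(it);
    }

    void releaseOldest()
    {
        assert(!bufferedPCs.empty());
        freeList.push_back(bufferedPCs.begin()->second);
        bufferedPCs.erase(bufferedPCs.begin());
    }
};

// Usage idea: reserve(addr) when a fetch is sent, fetchDone(addr) when it
// returns, releaseOldest() once the wave's PC has passed the oldest line.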
+ */ + uint8_t *readPtr; + // how many lines the fetch unit may buffer + int fetchDepth; + // maximum size (in number of insts) of the WF's IB + int maxIbSize; + // maximum size (in bytes) of this fetch buffer + int maxFbSize; + int cacheLineSize; + int cacheLineBits; + bool restartFromBranch; + // wavefront whose IB is serviced by this fetch buffer + Wavefront *wavefront; + TheGpuISA::Decoder *_decoder; + }; + bool timingSim; ComputeUnit *computeUnit; TheGpuISA::Decoder decoder; @@ -82,6 +242,15 @@ class FetchUnit // Pointer to list of waves dispatched on to this SIMD unit std::vector *waveList; + // holds the fetch buffers. each wave has 1 entry. + std::vector fetchBuf; + /** + * number of cache lines we can fetch and buffer. + * this includes the currently fetched line (i.e., the + * line that corresponds to the WF's current PC), as + * well as any lines that may be prefetched. + */ + int fetchDepth; }; #endif // __FETCH_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 64778f011..0bbacc44c 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -31,12 +31,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "gpu-compute/global_memory_pipeline.hh" - +#define __STDC_FORMAT_MACROS +#include #include "debug/GPUCoalescer.hh" #include "debug/GPUMem.hh" #include "debug/GPUReg.hh" #include "gpu-compute/compute_unit.hh" +#include "gpu-compute/global_memory_pipeline.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/vector_register_file.hh" @@ -44,7 +45,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), - outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0), + maxWaveRequests(p->max_wave_requests), inflightStores(0), inflightLoads(0) { } @@ -76,6 +77,31 @@ GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const return true; } +void +GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp) +{ + // We require one token from the coalescer's uncoalesced table to + // proceed + int token_count = 1; + + DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count); + assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count)); + mp->computeUnit()->getTokenManager()->acquireTokens(token_count); +} + +bool +GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const +{ + // Ensure we haven't exceeded the maximum number of vmem requests + // for this wavefront + if ((mp->wavefront()->outstandingReqsRdGm + + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) { + return false; + } + + return true; +} + void GlobalMemPipeline::exec() { @@ -87,42 +113,60 @@ GlobalMemPipeline::exec() // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible - if ((m) && (m->isLoad() || m->isAtomicRet())) { + if (m && (m->isLoad() || m->isAtomicRet())) { w = m->wavefront(); - accessVrf = - w->computeUnit->vrf[w->simdId]-> - vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE); + accessVrf = w->computeUnit->vrf[w->simdId]-> + canScheduleWriteOperandsFromLoad(w, m); + } if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && - accessVrf && m->statusBitVector == VectorMask(0) && - (computeUnit->shader->coissue_return || - computeUnit->wfWait.at(m->pipeId).rdy())) { + accessVrf && (computeUnit->shader->coissue_return || + computeUnit->vectorGlobalMemUnit.rdy())) { w = m->wavefront(); + 
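        // The block below retires the oldest ready response in program
        // order: it completes the access, schedules the VRF writes for
        // loads and returning atomics, pops the entry from the ordered
        // response buffer, decrements the wave's outstanding request
        // counters, and samples the round-trip latency stats.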
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n", + m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m); + if (m->isLoad() || m->isAtomicRet()) { + w->computeUnit->vrf[w->simdId]-> + scheduleWriteOperandsFromLoad(w, m); + } + completeRequest(m); - // Decrement outstanding register count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + Tick accessTime = curTick() - m->getAccessTime(); - if (m->isStore() || m->isAtomic()) { + // Decrement outstanding requests count + computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + if (m->isStore() || m->isAtomic() || m->isMemSync()) { + computeUnit->shader->sampleStore(accessTime); computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, -1); } - if (m->isLoad() || m->isAtomic()) { + if (m->isLoad() || m->isAtomic() || m->isMemSync()) { + computeUnit->shader->sampleLoad(accessTime); computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, -1); } + w->validateRequestCounters(); + + // Generate stats for round-trip time for vectory memory insts + // going all the way to memory and stats for individual cache + // blocks generated by the instruction. + m->profileRoundTripTime(curTick(), InstMemoryHop::Complete); + computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime()); + computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime()); + // Mark write bus busy for appropriate amount of time computeUnit->glbMemToVrfBus.set(m->time); if (!computeUnit->shader->coissue_return) - w->computeUnit->wfWait.at(m->pipeId).set(m->time); + w->computeUnit->vectorGlobalMemUnit.set(m->time); } // If pipeline has executed a global memory instruction @@ -148,13 +192,13 @@ GlobalMemPipeline::exec() mp->disassemble(), mp->seqNum()); // Memfences will not return tokens and must be issued so we should // not request one as this will deplete the token count until deadlock - if (!mp->isMemFence()) { + if (!mp->isMemSync()) { assert(mp->computeUnit()->getTokenManager()->haveTokens(1)); mp->computeUnit()->getTokenManager()->acquireTokens(1); } mp->initiateAcc(mp); - if (!outOfOrderDataDelivery && !mp->isMemFence()) { + if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) { /** * if we are not in out-of-order data delivery mode * then we keep the responses sorted in program order. 
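// Simplified sketch of the issue-side gating shown in this hunk and the one
// above (illustrative names and values; the real checks use the wavefront's
// outstanding-request counters and the CU's token manager): a wave may not
// exceed max_wave_requests in-flight vector memory requests, and each
// non-sync request consumes one coalescer token.

#include <cassert>

struct ToyIssueGate
{
    int maxWaveRequests;   // per-wave cap on outstanding vmem requests
    int tokens;            // free slots in the coalescer's uncoalesced table

    bool canIssue(int outstanding_rd, int outstanding_wr, bool is_mem_sync) const
    {
        if (outstanding_rd + outstanding_wr >= maxWaveRequests)
            return false;
        return is_mem_sync || tokens > 0;   // syncs do not take a token
    }

    void issue(bool is_mem_sync)
    {
        if (!is_mem_sync) {
            assert(tokens > 0);
            --tokens;
        }
    }
};

// e.g. ToyIssueGate gate{10, 4}; gate.canIssue(3, 2, false) yields true.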
@@ -178,19 +222,11 @@ GlobalMemPipeline::exec() GPUDynInstPtr GlobalMemPipeline::getNextReadyResp() { - if (outOfOrderDataDelivery) { - if (!gmReturnedLoads.empty()) { - return gmReturnedLoads.front(); - } else if (!gmReturnedStores.empty()) { - return gmReturnedStores.front(); - } - } else { - if (!gmOrderedRespBuffer.empty()) { - auto mem_req = gmOrderedRespBuffer.begin(); + if (!gmOrderedRespBuffer.empty()) { + auto mem_req = gmOrderedRespBuffer.begin(); - if (mem_req->second.second) { - return mem_req->second.first; - } + if (mem_req->second.second) { + return mem_req->second.first; } } @@ -208,51 +244,33 @@ GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst) --inflightStores; } - if (outOfOrderDataDelivery) { - if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { - assert(!gmReturnedLoads.empty()); - gmReturnedLoads.pop(); - } else if (gpuDynInst->isStore()) { - assert(!gmReturnedStores.empty()); - gmReturnedStores.pop(); - } - } else { - // we should only pop the oldest requst, and it - // should be marked as done if we are here - assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum()); - assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst); - assert(gmOrderedRespBuffer.begin()->second.second); - // remove this instruction from the buffer by its - // unique seq ID - gmOrderedRespBuffer.erase(gpuDynInst->seqNum()); - } + // we should only pop the oldest requst, and it + // should be marked as done if we are here + assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum()); + assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst); + assert(gmOrderedRespBuffer.begin()->second.second); + // remove this instruction from the buffer by its + // unique seq ID + gmOrderedRespBuffer.erase(gpuDynInst->seqNum()); } void GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) { + gpuDynInst->setAccessTime(curTick()); + gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate); gmIssuedRequests.push(gpuDynInst); } void GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst) { - if (outOfOrderDataDelivery) { - if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { - assert(isGMLdRespFIFOWrRdy()); - gmReturnedLoads.push(gpuDynInst); - } else { - assert(isGMStRespFIFOWrRdy()); - gmReturnedStores.push(gpuDynInst); - } - } else { - auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum()); - // if we are getting a response for this mem request, - // then it ought to already be in the ordered response - // buffer - assert(mem_req != gmOrderedRespBuffer.end()); - mem_req->second.second = true; - } + auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum()); + // if we are getting a response for this mem request, + // then it ought to already be in the ordered response + // buffer + assert(mem_req != gmOrderedRespBuffer.end()); + mem_req->second.second = true; } void diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh index 2f83185a9..6fb1db7b4 100644 --- a/src/gpu-compute/global_memory_pipeline.hh +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -60,52 +60,34 @@ class GlobalMemPipeline void init(ComputeUnit *cu); void exec(); - std::queue &getGMStRespFIFO() { return gmReturnedStores; } - std::queue &getGMLdRespFIFO() { return gmReturnedLoads; } - /** - * find the next ready response to service. for OoO mode we - * simply pop the oldest (based on when the response was - * received) response in the response FIFOs. 
for in-order mode - * we pop the oldest (in program order) response, and only if - * it is marked as done. + * Find the next ready response to service. In order to ensure + * that no waitcnts are violated, we pop the oldest (in program order) + * response, and only if it is marked as done. This is because waitcnt + * values expect memory operations to complete and decrement their + * counter values in program order. */ GPUDynInstPtr getNextReadyResp(); /** * once a memory request is finished we remove it from the - * buffer. this method determines which response buffer - * we're using based on the mode (in-order vs. OoO). + * buffer. */ void completeRequest(GPUDynInstPtr gpuDynInst); /** - * issues a request to the pipeline - i.e., enqueue it - * in the request buffer. + * Issues a request to the pipeline (i.e., enqueue it + * in the request buffer). */ void issueRequest(GPUDynInstPtr gpuDynInst); /** - * this method handles responses sent to this GM pipeline by the - * CU. in the case of in-order delivery it simply marks the reqeust - * as done in the ordered buffer to indicate that the requst is - * finished. for out-of-order data delivery, the requests are enqueued - * (in the order in which they are received) in the response FIFOs. + * This method handles responses sent to this GM pipeline by the + * CU. Simply marks the reqeust as done in the ordered buffer to + * indicate that the requst is finished. */ void handleResponse(GPUDynInstPtr gpuDynInst); - bool - isGMLdRespFIFOWrRdy() const - { - return gmReturnedLoads.size() < gmQueueSize; - } - - bool - isGMStRespFIFOWrRdy() const - { - return gmReturnedStores.size() < gmQueueSize; - } - bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const { @@ -114,7 +96,6 @@ class GlobalMemPipeline const std::string &name() const { return _name; } void regStats(); - void incLoadVRFBankConflictCycles(int num_cycles) { @@ -122,12 +103,15 @@ class GlobalMemPipeline } bool coalescerReady(GPUDynInstPtr mp) const; + bool outstandingReqsCheck(GPUDynInstPtr mp) const; + + void acqCoalescerToken(GPUDynInstPtr mp); private: ComputeUnit *computeUnit; std::string _name; int gmQueueSize; - bool outOfOrderDataDelivery; + int maxWaveRequests; // number of cycles of delaying the update of a VGPR that is the // target of a load instruction (or the load component of an atomic) @@ -143,12 +127,11 @@ class GlobalMemPipeline int globalMemSize; /* - * this buffer holds the memory responses when in-order data - * deilvery is used - the responses are ordered by their unique - * sequence number, which is monotonically increasing. when a - * memory request returns its "done" flag is set to true. during - * each tick the the GM pipeline will check if the oldest request - * is finished, and if so it will be removed from the queue. + * This buffer holds the memory responses in order data - the responses + * are ordered by their unique sequence number, which is monotonically + * increasing. When a memory request returns its "done" flag is set to + * true. During each tick the the GM pipeline will check if the oldest + * request is finished, and if so it will be removed from the queue. 
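// A minimal standalone illustration of this in-order bookkeeping (the
// container below mirrors the idea, not the actual gem5 types): entries are
// keyed by a monotonically increasing sequence number, responses may be
// marked done out of order, but only the oldest entry is ever popped.

#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

int main()
{
    // seqNum -> (instruction id, done flag)
    std::map<uint64_t, std::pair<int, bool>> ordered_resp;

    ordered_resp.emplace(1, std::make_pair(101, false));
    ordered_resp.emplace(2, std::make_pair(102, false));

    ordered_resp[2].second = true;                 // younger request returns first
    assert(!ordered_resp.begin()->second.second);  // oldest not done, nothing retires

    ordered_resp[1].second = true;                 // oldest request returns
    assert(ordered_resp.begin()->second.second);
    ordered_resp.erase(ordered_resp.begin());      // retire in program order
    return 0;
}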
* * key: memory instruction's sequence ID * @@ -161,14 +144,6 @@ class GlobalMemPipeline // Global Memory Request FIFO: all global memory requests // are issued to this FIFO from the memory pipelines std::queue gmIssuedRequests; - - // Globa Store Response FIFO: all responses of global memory - // stores are sent to this FIFO from TCP - std::queue gmReturnedStores; - - // Global Load Response FIFO: all responses of global memory - // loads are sent to this FIFO from TCP - std::queue gmReturnedLoads; }; #endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc new file mode 100644 index 000000000..b6205ac13 --- /dev/null +++ b/src/gpu-compute/gpu_command_processor.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_command_processor.hh" + +#include "debug/GPUCommandProc.hh" +#include "debug/GPUKernelInfo.hh" +#include "gpu-compute/dispatcher.hh" +#include "params/GPUCommandProcessor.hh" + +GPUCommandProcessor::GPUCommandProcessor(const Params *p) + : HSADevice(p), dispatcher(*p->dispatcher) +{ + dispatcher.setCommandProcessor(this); +} + +/** + * submitDispatchPkt() is the entry point into the CP from the HSAPP + * and is only meant to be used with AQL kernel dispatch packets. + * After the HSAPP receives and extracts an AQL packet, it sends + * it to the CP, which is responsible for gathering all relevant + * information about a task, initializing CU state, and sending + * it to the dispatcher for WG creation and dispatch. + * + * First we need capture all information from the the AQL pkt and + * the code object, then store it in an HSAQueueEntry. Once the + * packet and code are extracted, we extract information from the + * queue descriptor that the CP needs to perform state initialization + * on the CU. 
Finally we call dispatch() to send the task to the + * dispatcher. When the task completely finishes, we call finishPkt() + * on the HSA packet processor in order to remove the packet from the + * queue, and notify the runtime that the task has completed. + */ +void +GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + static int dynamic_task_id = 0; + _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; + + /** + * we need to read a pointer in the application's address + * space to pull out the kernel code descriptor. + */ + auto *tc = sys->threads[0]; + auto &virt_proxy = tc->getVirtProxy(); + + /** + * The kernel_object is a pointer to the machine code, whose entry + * point is an 'amd_kernel_code_t' type, which is included in the + * kernel binary, and describes various aspects of the kernel. The + * desired entry is the 'kernel_code_entry_byte_offset' field, + * which provides the byte offset (positive or negative) from the + * address of the amd_kernel_code_t to the start of the machine + * instructions. + */ + AMDKernelCode akc; + virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc, + sizeof(AMDKernelCode)); + + DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " + "kernel object\n", akc.kernel_code_entry_byte_offset); + + Addr machine_code_addr = (Addr)disp_pkt->kernel_object + + akc.kernel_code_entry_byte_offset; + + DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n", + machine_code_addr); + + Addr kern_name_addr(0); + virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10, + (uint8_t*)&kern_name_addr, 0x8); + + std::string kernel_name; + virt_proxy.readString(kernel_name, kern_name_addr); + + DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str()); + + HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id, + dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr); + + DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), " + "grid size (%dx%dx%d) kernarg addr: %#x, completion " + "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x, + disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z, + disp_pkt->grid_size_x, disp_pkt->grid_size_y, + disp_pkt->grid_size_z, disp_pkt->kernarg_address, + disp_pkt->completion_signal); + + DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, " + "num scalar regs: %d, code addr: %#x, kernarg size: %d, " + "LDS size: %d)\n", kernel_name, task->numVectorRegs(), + task->numScalarRegs(), task->codeAddr(), 0, 0); + + initABI(task); + ++dynamic_task_id; +} + +/** + * submitVendorPkt() is for accepting vendor-specific packets from + * the HSAPP. Vendor-specific packets may be used by the runtime to + * send commands to the HSA device that are specific to a particular + * vendor. The vendor-specific packets should be defined by the vendor + * in the runtime. + */ + +/** + * TODO: For now we simply tell the HSAPP to finish the packet, + * however a future patch will update this method to provide + * the proper handling of any required vendor-specific packets. + * In the version of ROCm that is currently supported (1.6) + * the runtime will send packets that direct the CP to + * invalidate the GPUs caches. We do this automatically on + * each kernel launch in the CU, so this is safe for now. 
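// Worked example of the entry-point calculation performed in
// submitDispatchPkt() above (the numbers are hypothetical): the machine
// code address is the AQL packet's kernel_object pointer plus the signed
// kernel_code_entry_byte_offset read from the AMD kernel code object.

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t kernel_object = 0x7f0000001000ULL;  // example disp_pkt->kernel_object
    int64_t entry_byte_offset = 0x100;           // example kernel_code_entry_byte_offset

    uint64_t machine_code_addr = kernel_object + entry_byte_offset;
    assert(machine_code_addr == 0x7f0000001100ULL);
    return 0;
}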
+ */ +void +GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + hsaPP->finishPkt(raw_pkt, queue_id); +} + +/** + * Once the CP has finished extracting all relevant information about + * a task and has initialized the ABI state, we send a description of + * the task to the dispatcher. The dispatcher will create and dispatch + * WGs to the CUs. + */ +void +GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task) +{ + dispatcher.dispatch(task); +} + +/** + * The CP is responsible for traversing all HSA-ABI-related data + * structures from memory and initializing the ABI state. + * Information provided by the MQD, AQL packet, and code object + * metadata will be used to initialze register file state. + */ +void +GPUCommandProcessor::initABI(HSAQueueEntry *task) +{ + auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task); + + Addr hostReadIdxPtr + = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr; + + dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr), + sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent, + &readDispIdOffEvent->readDispIdOffset); +} + +System* +GPUCommandProcessor::system() +{ + return sys; +} + +AddrRangeList +GPUCommandProcessor::getAddrRanges() const +{ + AddrRangeList ranges; + return ranges; +} + +void +GPUCommandProcessor::setShader(Shader *shader) +{ + _shader = shader; +} + +Shader* +GPUCommandProcessor::shader() +{ + return _shader; +} + +GPUCommandProcessor* +GPUCommandProcessorParams::create() +{ + return new GPUCommandProcessor(this); +} diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh new file mode 100644 index 000000000..7253dd421 --- /dev/null +++ b/src/gpu-compute/gpu_command_processor.hh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Anthony Gutierrez + */ + +/** + * @file + * The GPUCommandProcessor (CP) is responsible for accepting commands, in + * the form of HSA AQL packets, from the HSA packet processor (HSAPP). The CP + * works with several components, including the HSAPP and the dispatcher. + * When the HSAPP sends a ready task to the CP, it will perform the necessary + * operations to extract relevant data structures from memory, such as the + * AQL queue descriptor and AQL packet, and initializes register state for the + * task's wavefronts. + */ + +#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__ +#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__ + +#include "dev/hsa/hsa_device.hh" +#include "gpu-compute/hsa_queue_entry.hh" + +struct GPUCommandProcessorParams; +class GPUDispatcher; +class Shader; + +class GPUCommandProcessor : public HSADevice +{ + public: + typedef GPUCommandProcessorParams Params; + + GPUCommandProcessor() = delete; + GPUCommandProcessor(const Params *p); + + void setShader(Shader *shader); + Shader* shader(); + + void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) override; + void submitVendorPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) override; + void dispatchPkt(HSAQueueEntry *task); + + Tick write(PacketPtr pkt) override { return 0; } + Tick read(PacketPtr pkt) override { return 0; } + AddrRangeList getAddrRanges() const override; + System *system(); + + private: + Shader *_shader; + GPUDispatcher &dispatcher; + + void initABI(HSAQueueEntry *task); + + /** + * Perform a DMA read of the read_dispatch_id_field_base_byte_offset + * field, which follows directly after the read_dispatch_id (the read + * pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor + * (MQD)), to find the base address of the MQD. The MQD is the runtime's + * soft representation of a HW queue descriptor (HQD). + * + * Any fields below the read dispatch ID in the amd_hsa_queue_t should + * not change according to the HSA standard, therefore we should be able + * to get them based on their known relative position to the read dispatch + * ID. + */ + class ReadDispIdOffsetDmaEvent : public DmaCallback + { + public: + ReadDispIdOffsetDmaEvent(GPUCommandProcessor &gpu_cmd_proc, + HSAQueueEntry *task) + : DmaCallback(), readDispIdOffset(0), gpuCmdProc(gpu_cmd_proc), + _task(task) + { + } + + void + process() override + { + /** + * Now that the read pointer's offset from the base of + * the MQD is known, we can use that to calculate the + * the address of the MQD itself, the dispatcher will + * DMA that into the HSAQueueEntry when a kernel is + * launched. + */ + _task->hostAMDQueueAddr + = gpuCmdProc.hsaPP->getQueueDesc(_task->queueId()) + ->hostReadIndexPtr - readDispIdOffset; + + /** + * DMA a copy of the MQD into the task. Some fields of + * the MQD will be used to initialize register state. + */ + auto *mqdDmaEvent = new MQDDmaEvent(gpuCmdProc, _task); + gpuCmdProc.dmaReadVirt(_task->hostAMDQueueAddr, + sizeof(_amd_queue_t), mqdDmaEvent, + &_task->amdQueue); + } + + uint32_t readDispIdOffset; + + private: + GPUCommandProcessor &gpuCmdProc; + HSAQueueEntry *_task; + }; + + /** + * Perform a DMA read of the MQD that corresponds to a hardware + * queue descriptor (HQD). We store a copy of the MQD in the + * HSAQueueEntry object so we can send a copy of it along with + * a dispatch packet, which is needed to initialize register + * state. 
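// Illustrative sketch (hypothetical values) of the address recovery done in
// ReadDispIdOffsetDmaEvent::process() above: the DMA returns the byte
// offset of the read-dispatch-id field within the MQD, so subtracting that
// offset from the queue's hostReadIndexPtr yields the MQD base address.

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t host_read_index_ptr = 0x20001040ULL;  // example address of the read index
    uint32_t read_disp_id_offset = 0x40;           // example offset of that field in the MQD

    uint64_t mqd_base = host_read_index_ptr - read_disp_id_offset;
    assert(mqd_base == 0x20001000ULL);
    return 0;
}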
+ */ + class MQDDmaEvent : public DmaCallback + { + public: + MQDDmaEvent(GPUCommandProcessor &gpu_cmd_proc, HSAQueueEntry *task) + : DmaCallback(), gpuCmdProc(gpu_cmd_proc), _task(task) + { + } + + void + process() override + { + gpuCmdProc.dispatchPkt(_task); + } + + private: + GPUCommandProcessor &gpuCmdProc; + HSAQueueEntry *_task; + }; +}; + +#endif // __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__ diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc new file mode 100644 index 000000000..287c2a19a --- /dev/null +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2015-2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Sooraj Puthoor + * Anthony Gutierrez + */ + +#include "gpu-compute/gpu_compute_driver.hh" + +#include "cpu/thread_context.hh" +#include "debug/GPUDriver.hh" +#include "dev/hsa/hsa_device.hh" +#include "dev/hsa/hsa_packet_processor.hh" +#include "dev/hsa/kfd_ioctl.h" +#include "params/GPUComputeDriver.hh" +#include "sim/syscall_emul_buf.hh" + +GPUComputeDriver::GPUComputeDriver(Params *p) + : HSADriver(p) +{ + DPRINTF(GPUDriver, "Constructing KFD: device\n"); +} + +int +GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) +{ + auto &virt_proxy = tc->getVirtProxy(); + + switch (req) { + case AMDKFD_IOC_GET_VERSION: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n"); + + TypedBufferArg args(ioc_buf); + args->major_version = 1; + args->minor_version = 0; + + args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_CREATE_QUEUE: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n"); + + allocateQueue(virt_proxy, ioc_buf); + + DPRINTF(GPUDriver, "Creating queue %d\n", queueId); + } + break; + case AMDKFD_IOC_DESTROY_QUEUE: + { + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \ + "queue offset %d\n", args->queue_id); + device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id); + } + break; + case AMDKFD_IOC_SET_MEMORY_POLICY: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n"); + } + break; + case AMDKFD_IOC_GET_CLOCK_COUNTERS: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n"); + + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + + // Set nanosecond resolution + args->system_clock_freq = 1000000000; + + /** + * Derive all clock counters based on the tick. All + * device clocks are identical and perfectly in sync. + */ + uint64_t elapsed_nsec = curTick() / SimClock::Int::ns; + args->gpu_clock_counter = elapsed_nsec; + args->cpu_clock_counter = elapsed_nsec; + args->system_clock_counter = elapsed_nsec; + + args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_GET_PROCESS_APERTURES: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n"); + + TypedBufferArg args(ioc_buf); + args->num_of_nodes = 1; + + /** + * Set the GPUVM/LDS/Scratch APEs exactly as they + * are in the real driver, see the KFD driver + * in the ROCm Linux kernel source: + * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c + */ + for (int i = 0; i < args->num_of_nodes; ++i) { + /** + * While the GPU node numbers start at 0, we add 1 + * to force the count to start at 1. This is to + * ensure that the base/limit addresses are + * calculated correctly. 
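// Worked example of the aperture arithmetic implemented by the
// gpuVmApeBase()/gpuVmApeLimit() helpers defined later in this file,
// evaluated for GPU node 1 (the +1 noted above):

#include <cassert>
#include <cstdint>

int main()
{
    int gpu_num = 1;   // node numbering starts at 1, as described above

    uint64_t vm_base = ((uint64_t)gpu_num << 61) + 0x1000000000000ULL;
    uint64_t vm_limit = (vm_base & 0xFFFFFF0000000000ULL) | 0xFFFFFFFFFFULL;

    assert(vm_base == 0x2001000000000000ULL);
    assert(vm_limit == 0x200100FFFFFFFFFFULL);

    // bits [63:47] of the base are 0x4002, i.e. neither 0 nor 0x1ffff, so
    // the aperture stays out of the CPU's usable address ranges, which is
    // exactly what the asserts in this ioctl handler check.
    assert((vm_base >> 47) == 0x4002ULL);
    return 0;
}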
+ */ + args->process_apertures[i].scratch_base + = scratchApeBase(i + 1); + args->process_apertures[i].scratch_limit = + scratchApeLimit(args->process_apertures[i].scratch_base); + + args->process_apertures[i].lds_base = ldsApeBase(i + 1); + args->process_apertures[i].lds_limit = + ldsApeLimit(args->process_apertures[i].lds_base); + + args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1); + args->process_apertures[i].gpuvm_limit = + gpuVmApeLimit(args->process_apertures[i].gpuvm_base); + + // NOTE: Must match ID populated by hsaTopology.py + args->process_apertures[i].gpu_id = 2765; + + DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i, + args->process_apertures[i].gpuvm_base); + DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i, + args->process_apertures[i].gpuvm_limit); + + DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i, + args->process_apertures[i].lds_base); + DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i, + args->process_apertures[i].lds_limit); + + DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i, + args->process_apertures[i].scratch_base); + DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i, + args->process_apertures[i].scratch_limit); + + /** + * The CPU's 64b address space can only use the + * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0, + * therefore we must ensure that the apertures do not + * fall in the CPU's address space. + */ + assert(bits(args->process_apertures[i].scratch_base, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].scratch_base, 63, + 47) != 0); + assert(bits(args->process_apertures[i].scratch_limit, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].scratch_limit, 63, + 47) != 0); + assert(bits(args->process_apertures[i].lds_base, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].lds_base, 63, + 47) != 0); + assert(bits(args->process_apertures[i].lds_limit, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].lds_limit, 63, + 47) != 0); + assert(bits(args->process_apertures[i].gpuvm_base, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].gpuvm_base, 63, + 47) != 0); + assert(bits(args->process_apertures[i].gpuvm_limit, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].gpuvm_limit, 63, + 47) != 0); + } + + args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_UPDATE_QUEUE: + { + warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n"); + } + break; + case AMDKFD_IOC_CREATE_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n"); + } + break; + case AMDKFD_IOC_DESTROY_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n"); + } + break; + case AMDKFD_IOC_SET_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n"); + } + break; + case AMDKFD_IOC_RESET_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n"); + } + break; + case AMDKFD_IOC_WAIT_EVENTS: + { + warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n"); + } + break; + case AMDKFD_IOC_DBG_REGISTER: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n"); + } + break; + case AMDKFD_IOC_DBG_UNREGISTER: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n"); + } + break; + case AMDKFD_IOC_DBG_ADDRESS_WATCH: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n"); + } + break; + case AMDKFD_IOC_DBG_WAVE_CONTROL: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n"); + } + break; + case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n"); + } + 
break; + case AMDKFD_IOC_FREE_MEMORY_OF_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n"); + } + break; + case AMDKFD_IOC_MAP_MEMORY_TO_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n"); + } + break; + case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n"); + } + case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH: + { + warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n"); + } + break; + case AMDKFD_IOC_SET_CU_MASK: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n"); + } + break; + case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE" + "\n"); + } + break; + case AMDKFD_IOC_SET_TRAP_HANDLER: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n"); + } + break; + case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW: + { + DPRINTF(GPUDriver, + "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n"); + + TypedBufferArg + ioc_args(ioc_buf); + + ioc_args.copyIn(virt_proxy); + ioc_args->num_of_nodes = 1; + + for (int i = 0; i < ioc_args->num_of_nodes; ++i) { + TypedBufferArg ape_args + (ioc_args->kfd_process_device_apertures_ptr); + + ape_args->scratch_base = scratchApeBase(i + 1); + ape_args->scratch_limit = + scratchApeLimit(ape_args->scratch_base); + ape_args->lds_base = ldsApeBase(i + 1); + ape_args->lds_limit = ldsApeLimit(ape_args->lds_base); + ape_args->gpuvm_base = gpuVmApeBase(i + 1); + ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base); + + ape_args->gpu_id = 2765; + + assert(bits(ape_args->scratch_base, 63, 47) != 0x1ffff); + assert(bits(ape_args->scratch_base, 63, 47) != 0); + assert(bits(ape_args->scratch_limit, 63, 47) != 0x1ffff); + assert(bits(ape_args->scratch_limit, 63, 47) != 0); + assert(bits(ape_args->lds_base, 63, 47) != 0x1ffff); + assert(bits(ape_args->lds_base, 63, 47) != 0); + assert(bits(ape_args->lds_limit, 63, 47) != 0x1ffff); + assert(bits(ape_args->lds_limit, 63, 47) != 0); + assert(bits(ape_args->gpuvm_base, 63, 47) != 0x1ffff); + assert(bits(ape_args->gpuvm_base, 63, 47) != 0); + assert(bits(ape_args->gpuvm_limit, 63, 47) != 0x1ffff); + assert(bits(ape_args->gpuvm_limit, 63, 47) != 0); + + ape_args.copyOut(virt_proxy); + } + + ioc_args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_GET_DMABUF_INFO: + { + warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n"); + } + break; + case AMDKFD_IOC_IMPORT_DMABUF: + { + warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n"); + } + break; + case AMDKFD_IOC_GET_TILE_CONFIG: + { + warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n"); + } + break; + case AMDKFD_IOC_IPC_IMPORT_HANDLE: + { + warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n"); + } + break; + case AMDKFD_IOC_IPC_EXPORT_HANDLE: + { + warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n"); + } + break; + case AMDKFD_IOC_CROSS_MEMORY_COPY: + { + warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n"); + } + break; + case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE: + { + warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n"); + } + break; + default: + fatal("%s: bad ioctl %d\n", req); + break; + } + return 0; +} + +Addr +GPUComputeDriver::gpuVmApeBase(int gpuNum) const +{ + return ((Addr)gpuNum << 61) + 0x1000000000000L; +} + +Addr +GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const +{ + return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL; +} + +Addr +GPUComputeDriver::scratchApeBase(int gpuNum) const +{ + return ((Addr)gpuNum << 61) + 0x100000000L; +} + +Addr 
+GPUComputeDriver::scratchApeLimit(Addr apeBase) const +{ + return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; +} + +Addr +GPUComputeDriver::ldsApeBase(int gpuNum) const +{ + return ((Addr)gpuNum << 61) + 0x0; +} + +Addr +GPUComputeDriver::ldsApeLimit(Addr apeBase) const +{ + return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; +} + +GPUComputeDriver* +GPUComputeDriverParams::create() +{ + return new GPUComputeDriver(this); +} diff --git a/src/gpu-compute/gpu_compute_driver.hh b/src/gpu-compute/gpu_compute_driver.hh new file mode 100644 index 000000000..b13531de4 --- /dev/null +++ b/src/gpu-compute/gpu_compute_driver.hh @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015-2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Sooraj Puthoor + * Anthony Gutierrez + */ + +/** + * @file + * The GPUComputeDriver implements an HSADriver for an HSA AMD GPU + * agent. Other GPU devices, or other HSA agents, should not derive + * from this class. Instead device-specific implementations of an + * HSADriver should be provided for each unique device. + */ + +#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__ +#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__ + +#include "dev/hsa/hsa_driver.hh" + +struct GPUComputeDriverParams; + +class GPUComputeDriver final : public HSADriver +{ + public: + typedef GPUComputeDriverParams Params; + GPUComputeDriver(Params *p); + int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override; + + private: + /** + * The aperture (APE) base/limit pairs are set + * statically at startup by the real KFD. AMD + * x86_64 CPUs only use the areas in the 64b + * address space where VA[63:47] == 0x1ffff or + * VA[63:47] = 0. 
These methods generate the APE + * base/limit pairs in exactly the same way as + * the real KFD does, which ensures these APEs do + * not fall into the CPU's address space + * + * see the macros in the KFD driver in the ROCm + * Linux kernel source: + * + * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c + */ + Addr gpuVmApeBase(int gpuNum) const; + Addr gpuVmApeLimit(Addr apeBase) const; + Addr scratchApeBase(int gpuNum) const; + Addr scratchApeLimit(Addr apeBase) const; + Addr ldsApeBase(int gpuNum) const; + Addr ldsApeLimit(Addr apeBase) const; +}; + +#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 0c729b784..74b963b73 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -35,26 +35,50 @@ #include "debug/GPUMem.hh" #include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/wavefront.hh" GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, - GPUStaticInst *static_inst, uint64_t instSeqNum) - : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), - n_reg(0), useContinuation(false), - statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum) + GPUStaticInst *static_inst, InstSeqNum instSeqNum) + : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(), + (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false), + _staticInst(static_inst), _seqNum(instSeqNum) { tlbHitLevel.assign(computeUnit()->wfSize(), -1); - d_data = new uint8_t[computeUnit()->wfSize() * 16]; + // vector instructions can have up to 4 source/destination operands + d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)]; a_data = new uint8_t[computeUnit()->wfSize() * 8]; x_data = new uint8_t[computeUnit()->wfSize() * 8]; + // scalar loads can read up to 16 Dwords of data (see publicly + // available GCN3 ISA manual) + scalar_data = new uint8_t[16 * sizeof(uint32_t)]; + for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) { + scalar_data[i] = 0; + } for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) { a_data[i] = 0; x_data[i] = 0; } - for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) { + for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) { d_data[i] = 0; } + time = 0; + + cu_id = _cu->cu_id; + if (_wf) { + simdId = _wf->simdId; + wfDynId = _wf->wfDynId; + kern_id = _wf->kernId; + wg_id = _wf->wgId; + wfSlotId = _wf->wfSlotId; + } else { + simdId = -1; + wfDynId = -1; + kern_id = -1; + wg_id = -1; + wfSlotId = -1; + } } GPUDynInst::~GPUDynInst() @@ -62,6 +86,8 @@ GPUDynInst::~GPUDynInst() delete[] d_data; delete[] a_data; delete[] x_data; + delete[] scalar_data; + delete _staticInst; } void @@ -82,6 +108,36 @@ GPUDynInst::numDstRegOperands() return _staticInst->numDstRegOperands(); } +int +GPUDynInst::numSrcVecOperands() +{ + return _staticInst->numSrcVecOperands(); +} + +int +GPUDynInst::numDstVecOperands() +{ + return _staticInst->numDstVecOperands(); +} + +int +GPUDynInst::numSrcVecDWORDs() +{ + return _staticInst->numSrcVecDWORDs(); +} + +int +GPUDynInst::numDstVecDWORDs() +{ + return _staticInst->numDstVecDWORDs(); +} + +int +GPUDynInst::numOpdDWORDs(int operandIdx) +{ + return _staticInst->numOpdDWORDs(operandIdx); +} + int GPUDynInst::getNumOperands() { @@ -100,12 +156,6 @@ GPUDynInst::isScalarRegister(int operandIdx) return _staticInst->isScalarRegister(operandIdx); } -bool -GPUDynInst::isCondRegister(int operandIdx) -{ - return 
_staticInst->isCondRegister(operandIdx); -} - int GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst) { @@ -130,13 +180,82 @@ GPUDynInst::isSrcOperand(int operandIdx) return _staticInst->isSrcOperand(operandIdx); } +bool +GPUDynInst::hasSourceSgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::hasSourceVgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::hasDestinationSgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::srcIsVgpr(int index) const +{ + assert(index >= 0 && index < _staticInst->getNumOperands()); + if (_staticInst->isVectorRegister(index) && + _staticInst->isSrcOperand(index)) { + return true; + } + return false; +} + +bool +GPUDynInst::hasDestinationVgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::isOpcode(const std::string& opcodeStr, + const std::string& extStr) const +{ + return _staticInst->opcode().find(opcodeStr) != std::string::npos && + _staticInst->opcode().find(extStr) != std::string::npos; +} + +bool +GPUDynInst::isOpcode(const std::string& opcodeStr) const +{ + return _staticInst->opcode().find(opcodeStr) != std::string::npos; +} + const std::string& GPUDynInst::disassemble() const { return _staticInst->disassemble(); } -uint64_t +InstSeqNum GPUDynInst::seqNum() const { return _seqNum; @@ -148,6 +267,40 @@ GPUDynInst::executedAs() return _staticInst->executed_as; } +bool +GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s) +{ + assert(s); + for (int i = 0; i < getNumOperands(); ++i) { + if (isVectorRegister(i) && isSrcOperand(i)) { + for (int j = 0; j < s->getNumOperands(); ++j) { + if (s->isVectorRegister(j) && s->isDstOperand(j)) { + if (i == j) + return true; + } + } + } + } + return false; +} + +bool +GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s) +{ + assert(s); + for (int i = 0; i < getNumOperands(); ++i) { + if (isScalarRegister(i) && isSrcOperand(i)) { + for (int j = 0; j < s->getNumOperands(); ++j) { + if (s->isScalarRegister(j) && s->isDstOperand(j)) { + if (i == j) + return true; + } + } + } + } + return false; +} + // Process a memory instruction and (if necessary) submit timing request void GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) @@ -156,12 +309,15 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) cu->cu_id, simdId, wfSlotId, exec_mask); _staticInst->initiateAcc(gpuDynInst); - time = 0; } void GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=" + "%#x\n complete", + cu->cu_id, simdId, wfSlotId, exec_mask); + _staticInst->completeAcc(gpuDynInst); } @@ -181,12 +337,42 @@ GPUDynInst::isBranch() const return _staticInst->isBranch(); } +bool +GPUDynInst::isCondBranch() const +{ + return _staticInst->isCondBranch(); +} + bool GPUDynInst::isNop() const { return _staticInst->isNop(); } +bool +GPUDynInst::isEndOfKernel() const +{ + return _staticInst->isEndOfKernel(); +} + +bool +GPUDynInst::isKernelLaunch() const +{ + return 
_staticInst->isKernelLaunch(); +} + +bool +GPUDynInst::isSDWAInst() const +{ + return _staticInst->isSDWAInst(); +} + +bool +GPUDynInst::isDPPInst() const +{ + return _staticInst->isDPPInst(); +} + bool GPUDynInst::isReturn() const { @@ -218,9 +404,9 @@ GPUDynInst::isBarrier() const } bool -GPUDynInst::isMemFence() const +GPUDynInst::isMemSync() const { - return _staticInst->isMemFence(); + return _staticInst->isMemSync(); } bool @@ -265,6 +451,12 @@ GPUDynInst::isAtomicRet() const return _staticInst->isAtomicRet(); } +bool +GPUDynInst::isVector() const +{ + return !_staticInst->isScalar(); +} + bool GPUDynInst::isScalar() const { @@ -295,6 +487,78 @@ GPUDynInst::writesVCC() const return _staticInst->writesVCC(); } +bool +GPUDynInst::readsMode() const +{ + return _staticInst->readsMode(); +} + +bool +GPUDynInst::writesMode() const +{ + return _staticInst->writesMode(); +} + +bool +GPUDynInst::readsEXEC() const +{ + return _staticInst->readsEXEC(); +} + +bool +GPUDynInst::writesEXEC() const +{ + return _staticInst->writesEXEC(); +} + +bool +GPUDynInst::ignoreExec() const +{ + return _staticInst->ignoreExec(); +} + +bool +GPUDynInst::writesExecMask() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + return _staticInst->isDstOperand(i) && + _staticInst->isExecMaskRegister(i); + } + return false; +} + +bool +GPUDynInst::readsExecMask() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + return _staticInst->isSrcOperand(i) && + _staticInst->isExecMaskRegister(i); + } + return false; +} + +bool +GPUDynInst::writesFlatScratch() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) { + return _staticInst->isFlatScratchRegister(i); + } + } + return false; +} + +bool +GPUDynInst::readsFlatScratch() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) { + return _staticInst->isFlatScratchRegister(i); + } + } + return false; +} + bool GPUDynInst::isAtomicAnd() const { @@ -421,81 +685,241 @@ GPUDynInst::isSpillSeg() const } bool -GPUDynInst::isWorkitemScope() const -{ - return _staticInst->isWorkitemScope(); -} - -bool -GPUDynInst::isWavefrontScope() const +GPUDynInst::isGloballyCoherent() const { - return _staticInst->isWavefrontScope(); + return _staticInst->isGloballyCoherent(); } bool -GPUDynInst::isWorkgroupScope() const +GPUDynInst::isSystemCoherent() const { - return _staticInst->isWorkgroupScope(); + return _staticInst->isSystemCoherent(); } bool -GPUDynInst::isDeviceScope() const +GPUDynInst::isF16() const { - return _staticInst->isDeviceScope(); + return _staticInst->isF16(); } bool -GPUDynInst::isSystemScope() const +GPUDynInst::isF32() const { - return _staticInst->isSystemScope(); + return _staticInst->isF32(); } bool -GPUDynInst::isNoScope() const +GPUDynInst::isF64() const { - return _staticInst->isNoScope(); + return _staticInst->isF64(); } bool -GPUDynInst::isRelaxedOrder() const +GPUDynInst::isFMA() const { - return _staticInst->isRelaxedOrder(); + return _staticInst->isFMA(); } bool -GPUDynInst::isAcquire() const +GPUDynInst::isMAC() const { - return _staticInst->isAcquire(); + return _staticInst->isMAC(); } bool -GPUDynInst::isRelease() const +GPUDynInst::isMAD() const { - return _staticInst->isRelease(); + return _staticInst->isMAD(); } -bool -GPUDynInst::isAcquireRelease() const -{ - return _staticInst->isAcquireRelease(); -} +void +GPUDynInst::doApertureCheck(const 
VectorMask &mask) +{ + assert(mask.any()); + // find the segment of the first active address, after + // that we check that all other active addresses also + // fall within the same APE + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + if (computeUnit()->shader->isLdsApe(addr[lane])) { + // group segment + staticInstruction()->executed_as = Enums::SC_GROUP; + break; + } else if (computeUnit()->shader->isScratchApe(addr[lane])) { + // private segment + staticInstruction()->executed_as = Enums::SC_PRIVATE; + break; + } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) { + // we won't support GPUVM + fatal("flat access is in GPUVM APE\n"); + } else if (bits(addr[lane], 63, 47) != 0x1FFFF && + bits(addr[lane], 63, 47)) { + // we are in the "hole", this is a memory violation + fatal("flat access at addr %#x has a memory violation\n", + addr[lane]); + } else { + // global memory segment + staticInstruction()->executed_as = Enums::SC_GLOBAL; + break; + } + } + } -bool -GPUDynInst::isNoOrder() const -{ - return _staticInst->isNoOrder(); + // we should have found the segment + assert(executedAs() != Enums::SC_NONE); + + // flat accesses should not straddle multiple APEs so we + // must check that all addresses fall within the same APE + if (executedAs() == Enums::SC_GROUP) { + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + // if the first valid addr we found above was LDS, + // all the rest should be + assert(computeUnit()->shader->isLdsApe(addr[lane])); + } + } + } else if (executedAs() == Enums::SC_PRIVATE) { + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + // if the first valid addr we found above was private, + // all the rest should be + assert(computeUnit()->shader->isScratchApe(addr[lane])); + } + } + } else { + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + // if the first valid addr we found above was global, + // all the rest should be. because we don't have an + // explicit range of the global segment, we just make + // sure that the address fall in no other APE and that + // it is not a memory violation + assert(!computeUnit()->shader->isLdsApe(addr[lane])); + assert(!computeUnit()->shader->isScratchApe(addr[lane])); + assert(!computeUnit()->shader->isGpuVmApe(addr[lane])); + assert(!(bits(addr[lane], 63, 47) != 0x1FFFF + && bits(addr[lane], 63, 47))); + } + } + } } -bool -GPUDynInst::isGloballyCoherent() const -{ - return _staticInst->isGloballyCoherent(); +void +GPUDynInst::resolveFlatSegment(const VectorMask &mask) +{ + doApertureCheck(mask); + + + // Now that we know the aperature, do the following: + // 1. Transform the flat address to its segmented equivalent. + // 2. Set the execUnitId based an the aperture check. + // 3. Decrement any extra resources that were reserved. Other + // resources are released as normal, below. + if (executedAs() == Enums::SC_GLOBAL) { + // no transormation for global segment + wavefront()->execUnitId = wavefront()->flatGmUnitId; + if (isLoad()) { + wavefront()->rdLmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrLmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } + } else if (executedAs() == Enums::SC_GROUP) { + for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { + if (mask[lane]) { + // flat address calculation goes here. 
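+                // (illustrative only: the segmented address would
+                // presumably be the flat address minus the LDS
+                // aperture base, i.e. an offset into this
+                // workgroup's LdsChunk, but no such conversion is
+                // implemented, hence the panic below)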
+ // addr[lane] = segmented address + panic("Flat group memory operation is unimplemented!\n"); + } + } + wavefront()->execUnitId = wavefront()->flatLmUnitId; + if (isLoad()) { + wavefront()->rdGmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrGmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->rdGmReqsInPipe--; + wavefront()->wrGmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } + } else if (executedAs() == Enums::SC_PRIVATE) { + /** + * Flat instructions may resolve to the private segment (scratch), + * which is backed by main memory and provides per-lane scratch + * memory. Flat addressing uses apertures - registers that specify + * the address range in the VA space where LDS/private memory is + * mapped. The value of which is set by the kernel mode driver. + * These apertures use addresses that are not used by x86 CPUs. + * When the address of a Flat operation falls into one of the + * apertures, the Flat operation is redirected to either LDS or + * to the private memory segment. + * + * For private memory the SW runtime will allocate some space in + * the VA space for each AQL queue. The base address of which is + * stored in scalar registers per the AMD GPU ABI. The amd_queue_t + * scratch_backing_memory_location provides the base address in + * memory for the queue's private segment. Various other fields + * loaded into register state during kernel launch specify per-WF + * and per-work-item offsets so that individual lanes may access + * their private segment allocation. + * + * For more details about flat addressing see: + * http://rocm-documentation.readthedocs.io/en/latest/ + * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch + * + * https://github.com/ROCm-Developer-Tools/ + * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md + * #flat-addressing + */ + + uint32_t numSgprs = wavefront()->maxSgprs; + uint32_t physSgprIdx = + wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), + numSgprs - 3); + uint32_t offset = + wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); + physSgprIdx = + wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), + numSgprs - 4); + uint32_t size = + wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); + for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { + if (mask[lane]) { + addr[lane] = addr[lane] + lane * size + offset + + wavefront()->computeUnit->shader->getHiddenPrivateBase() - + wavefront()->computeUnit->shader->getScratchBase(); + } + } + wavefront()->execUnitId = wavefront()->flatLmUnitId; + if (isLoad()) { + wavefront()->rdGmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrGmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->rdGmReqsInPipe--; + wavefront()->wrGmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } + } else { + for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { + if (mask[lane]) { + panic("flat addr %#llx maps to bad segment %d\n", + addr[lane], executedAs()); + } + } + } } -bool -GPUDynInst::isSystemCoherent() const +TheGpuISA::ScalarRegU32 +GPUDynInst::srcLiteral() const { - return _staticInst->isSystemCoherent(); + return _staticInst->srcLiteral(); } void @@ -504,6 +928,8 @@ GPUDynInst::updateStats() if (_staticInst->isLocalMem()) { // access to LDS (shared) memory cu->dynamicLMemInstrCnt++; + } else if (_staticInst->isFlat()) { + cu->dynamicFlatMemInstrCnt++; } else { // access to global memory @@ -536,3 +962,28 @@ GPUDynInst::updateStats() 
cu->dynamicGMemInstrCnt++; } } + +void +GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId) +{ + // Only take the first measurement in the case of coalescing + if (roundTripTime.size() > hopId) + return; + + roundTripTime.push_back(currentTime); +} + +void +GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId) +{ + if (lineAddressTime.count(addr)) { + if (lineAddressTime[addr].size() > hopId) { + return; + } + + lineAddressTime[addr].push_back(currentTime); + } else if (hopId == 0) { + auto addressTimeVec = std::vector { currentTime }; + lineAddressTime.insert(std::make_pair(addr, addressTimeVec)); + } +} diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index bee08e3df..392b57d12 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,7 +39,6 @@ #include "base/amo.hh" #include "base/logging.hh" -#include "enums/MemType.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -68,20 +67,10 @@ class AtomicOpCAS : public TypedAtomicOpFunctor } else { computeUnit->numFailedCASOps++; } - - if (computeUnit->xact_cas_mode) { - computeUnit->xactCasLoadMap.clear(); - } } AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); } }; -typedef enum -{ - VT_32, - VT_64, -} vgpr_type; - class GPUDynInst : public GPUExecContext { public: @@ -91,27 +80,51 @@ class GPUDynInst : public GPUExecContext void execute(GPUDynInstPtr gpuDynInst); int numSrcRegOperands(); int numDstRegOperands(); + int numDstVecOperands(); + int numSrcVecOperands(); + int numSrcVecDWORDs(); + int numDstVecDWORDs(); + int numOpdDWORDs(int operandIdx); int getNumOperands(); bool isVectorRegister(int operandIdx); bool isScalarRegister(int operandIdx); - bool isCondRegister(int operandIdx); int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst); int getOperandSize(int operandIdx); bool isDstOperand(int operandIdx); bool isSrcOperand(int operandIdx); + bool hasDestinationSgpr() const; + bool hasSourceSgpr() const; + bool hasDestinationVgpr() const; + bool hasSourceVgpr() const; + + bool hasSgprRawDependence(GPUDynInstPtr s); + bool hasVgprRawDependence(GPUDynInstPtr s); + + // returns true if the string "opcodeStr" is found in the + // opcode of the instruction + bool isOpcode(const std::string& opcodeStr) const; + bool isOpcode(const std::string& opcodeStr, + const std::string& extStr) const; + // returns true if source operand at "index" is a vector register + bool srcIsVgpr(int index) const; + const std::string &disassemble() const; - uint64_t seqNum() const; + InstSeqNum seqNum() const; Enums::StorageClassType executedAs(); - // The address of the memory operation + // virtual address for scalar memory operations + Addr scalarAddr; + // virtual addressies for vector memory operations std::vector addr; Addr pAddr; - // The data to get written + // vector data to get written uint8_t *d_data; + // scalar data to be transferred + uint8_t *scalar_data; // Additional data (for atomics) uint8_t *a_data; // Additional data (for atomics) @@ -119,19 +132,6 @@ class GPUDynInst : public GPUExecContext // The execution mask VectorMask exec_mask; - // The memory type (M_U32, M_S32, ...) 
- Enums::MemType m_type; - - // The equivalency class - int equiv; - // The return VGPR type (VT_32 or VT_64) - vgpr_type v_type; - // Number of VGPR's accessed (1, 2, or 4) - int n_reg; - // The return VGPR index - int dst_reg; - // There can be max 4 dest regs> - int dst_reg_vec[4]; // SIMD where the WF of the memory instruction has been mapped to int simdId; // unique id of the WF where the memory instruction belongs to @@ -140,21 +140,16 @@ class GPUDynInst : public GPUExecContext int kern_id; // The CU id of the requesting wf int cu_id; + // The workgroup id of the requesting wf + int wg_id; // HW slot id where the WF is mapped to inside a SIMD unit int wfSlotId; // execution pipeline id where the memory instruction has been scheduled - int pipeId; + int execUnitId; // The execution time of this operation Tick time; // The latency of this operation WaitClass latency; - // A list of bank conflicts for the 4 cycles. - uint32_t bc[4]; - - // A pointer to ROM - uint8_t *rom; - // The size of the READONLY segment - int sz_rom; // Initiate the specified memory operation, by creating a // memory request and sending it off to the memory system. @@ -168,16 +163,23 @@ class GPUDynInst : public GPUExecContext GPUStaticInst* staticInstruction() { return _staticInst; } + TheGpuISA::ScalarRegU32 srcLiteral() const; + bool isALU() const; bool isBranch() const; + bool isCondBranch() const; bool isNop() const; bool isReturn() const; + bool isEndOfKernel() const; + bool isKernelLaunch() const; + bool isSDWAInst() const; + bool isDPPInst() const; bool isUnconditionalJump() const; bool isSpecialOp() const; bool isWaitcnt() const; bool isBarrier() const; - bool isMemFence() const; + bool isMemSync() const; bool isMemRef() const; bool isFlat() const; bool isLoad() const; @@ -188,10 +190,20 @@ class GPUDynInst : public GPUExecContext bool isAtomicRet() const; bool isScalar() const; + bool isVector() const; bool readsSCC() const; bool writesSCC() const; bool readsVCC() const; bool writesVCC() const; + bool readsEXEC() const; + bool writesEXEC() const; + bool readsMode() const; + bool writesMode() const; + bool ignoreExec() const; + bool readsFlatScratch() const; + bool writesFlatScratch() const; + bool readsExecMask() const; + bool writesExecMask() const; bool isAtomicAnd() const; bool isAtomicOr() const; @@ -217,39 +229,25 @@ class GPUDynInst : public GPUExecContext bool isReadOnlySeg() const; bool isSpillSeg() const; - bool isWorkitemScope() const; - bool isWavefrontScope() const; - bool isWorkgroupScope() const; - bool isDeviceScope() const; - bool isSystemScope() const; - bool isNoScope() const; - - bool isRelaxedOrder() const; - bool isAcquire() const; - bool isRelease() const; - bool isAcquireRelease() const; - bool isNoOrder() const; - bool isGloballyCoherent() const; bool isSystemCoherent() const; - /* - * Loads/stores/atomics may have acquire/release semantics associated - * withthem. Some protocols want to see the acquire/release as separate - * requests from the load/store/atomic. We implement that separation - * using continuations (i.e., a function pointer with an object associated - * with it). When, for example, the front-end generates a store with - * release semantics, we will first issue a normal store and set the - * continuation in the GPUDynInst to a function that generate a - * release request. That continuation will be called when the normal - * store completes (in ComputeUnit::DataPort::recvTimingResponse). 
The - * continuation will be called in the context of the same GPUDynInst - * that generated the initial store. - */ - std::function execContinuation; - - // when true, call execContinuation when response arrives - bool useContinuation; + bool isF16() const; + bool isF32() const; + bool isF64() const; + + bool isFMA() const; + bool isMAC() const; + bool isMAD() const; + + // for FLAT memory ops. check the segment address + // against the APE registers to see if it falls + // within one of the APE ranges for LDS/SCRATCH/GPUVM. + // if it does not fall into one of the three APEs, it + // will be a regular global access. + void doApertureCheck(const VectorMask &mask); + // Function to resolve a flat accesses during execution stage. + void resolveFlatSegment(const VectorMask &mask); template AtomicOpFunctorPtr makeAtomicOpFunctor(c0 *reg0, c0 *reg1) @@ -282,62 +280,31 @@ class GPUDynInst : public GPUExecContext } void - setRequestFlags(RequestPtr req, bool setMemOrder=true) + setRequestFlags(RequestPtr req) const { - // currently these are the easy scopes to deduce - if (isPrivateSeg()) { - req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); - } else if (isSpillSeg()) { - req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); - } else if (isGlobalSeg()) { - req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); - } else if (isReadOnlySeg()) { - req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); - } else if (isGroupSeg()) { - req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); - } else if (isFlat()) { - panic("TODO: translate to correct scope"); - } else { - fatal("%s has bad segment type\n", disassemble()); + if (isGloballyCoherent()) { + req->setCacheCoherenceFlags(Request::GLC_BIT); } - if (isWavefrontScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::WAVEFRONT_SCOPE); - } else if (isWorkgroupScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::WORKGROUP_SCOPE); - } else if (isDeviceScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::DEVICE_SCOPE); - } else if (isSystemScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::SYSTEM_SCOPE); - } else if (!isNoScope() && !isWorkitemScope()) { - fatal("%s has bad scope type\n", disassemble()); + if (isSystemCoherent()) { + req->setCacheCoherenceFlags(Request::SLC_BIT); } - if (setMemOrder) { - // set acquire and release flags - if (isAcquire()) { - req->setFlags(Request::ACQUIRE); - } else if (isRelease()) { - req->setFlags(Request::RELEASE); - } else if (isAcquireRelease()) { - req->setFlags(Request::ACQUIRE | Request::RELEASE); - } else if (!isNoOrder()) { - fatal("%s has bad memory order\n", disassemble()); - } - } - - // set atomic type - // currently, the instruction genenerator only produces atomic return - // but a magic instruction can produce atomic no return if (isAtomicRet()) { req->setFlags(Request::ATOMIC_RETURN_OP); } else if (isAtomicNoRet()) { req->setFlags(Request::ATOMIC_NO_RETURN_OP); } + + if (isMemSync()) { + // the path for kernel launch and kernel end is different + // from non-kernel mem sync. 
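+            // i.e., only wavefront-issued syncs should reach this
+            // point; they request a cache invalidate via the ACQUIRE
+            // flag set below, whereas launch/end flushes are issued
+            // along their own path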
+ assert(!isKernelLaunch()); + assert(!isEndOfKernel()); + + // must be wbinv inst if not kernel launch/end + req->setCacheCoherenceFlags(Request::ACQUIRE); + } } // Map returned packets and the addresses they satisfy with which lane they @@ -348,12 +315,39 @@ class GPUDynInst : public GPUExecContext // Track the status of memory requests per lane, a bit per lane VectorMask statusBitVector; // for ld_v# or st_v# - std::vector statusVector; std::vector tlbHitLevel; + // for misaligned scalar ops we track the number + // of outstanding reqs here + int numScalarReqs; + + Tick getAccessTime() const { return accessTime; } + + void setAccessTime(Tick currentTime) { accessTime = currentTime; } + + void profileRoundTripTime(Tick currentTime, int hopId); + std::vector getRoundTripTime() const { return roundTripTime; } + + void profileLineAddressTime(Addr addr, Tick currentTime, int hopId); + const std::map>& getLineAddressTime() const + { return lineAddressTime; } + + // inst used to save/restore a wavefront context + bool isSaveRestore; private: GPUStaticInst *_staticInst; - uint64_t _seqNum; + const InstSeqNum _seqNum; + + // the time the request was started + Tick accessTime = -1; + + // hold the tick when the instruction arrives at certain hop points + // on it's way to main memory + std::vector roundTripTime; + + // hold each cache block address for the instruction and a vector + // to hold the tick when the block arrives at certain hop points + std::map> lineAddressTime; }; #endif // __GPU_DYN_INST_HH__ diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc index 154d2b8ed..2411e9e84 100644 --- a/src/gpu-compute/gpu_exec_context.cc +++ b/src/gpu-compute/gpu_exec_context.cc @@ -59,8 +59,8 @@ GPUExecContext::readMiscReg(int opIdx) const } void -GPUExecContext::writeMiscReg(int opIdx, RegVal operandVal) +GPUExecContext::writeMiscReg(int opIdx, RegVal val) { assert(gpuISA); - gpuISA->writeMiscReg(opIdx, operandVal); + gpuISA->writeMiscReg(opIdx, val); } diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 49c0315ba..9ab1580df 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -34,10 +34,10 @@ #include "gpu-compute/gpu_static_inst.hh" GPUStaticInst::GPUStaticInst(const std::string &opcode) - : executed_as(Enums::SC_NONE), opcode(opcode), - _instNum(0), _instAddr(0) + : executed_as(Enums::SC_NONE), _opcode(opcode), + _instNum(0), _instAddr(0), srcVecOperands(-1), dstVecOperands(-1), + srcVecDWORDs(-1), dstVecDWORDs(-1) { - setFlag(NoOrder); } const std::string& @@ -50,3 +50,80 @@ GPUStaticInst::disassemble() return disassembly; } + +int +GPUStaticInst::numSrcVecOperands() +{ + if (srcVecOperands > -1) + return srcVecOperands; + + srcVecOperands = 0; + if (!isScalar()) { + for (int k = 0; k < getNumOperands(); ++k) { + if (isVectorRegister(k) && isSrcOperand(k)) + srcVecOperands++; + } + } + return srcVecOperands; +} + +int +GPUStaticInst::numDstVecOperands() +{ + if (dstVecOperands > -1) + return dstVecOperands; + + dstVecOperands = 0; + if (!isScalar()) { + for (int k = 0; k < getNumOperands(); ++k) { + if (isVectorRegister(k) && isDstOperand(k)) + dstVecOperands++; + } + } + return dstVecOperands; +} + +int +GPUStaticInst::numSrcVecDWORDs() +{ + if (srcVecDWORDs > -1) { + return srcVecDWORDs; + } + + srcVecDWORDs = 0; + if (!isScalar()) { + for (int i = 0; i < getNumOperands(); i++) { + if (isVectorRegister(i) && isSrcOperand(i)) { + int dwords = numOpdDWORDs(i); + srcVecDWORDs += 
dwords; + } + } + } + return srcVecDWORDs; +} + +int +GPUStaticInst::numDstVecDWORDs() +{ + if (dstVecDWORDs > -1) { + return dstVecDWORDs; + } + + dstVecDWORDs = 0; + if (!isScalar()) { + for (int i = 0; i < getNumOperands(); i++) { + if (isVectorRegister(i) && isDstOperand(i)) { + int dwords = numOpdDWORDs(i); + dstVecDWORDs += dwords; + } + } + } + return dstVecDWORDs; +} + +int +GPUStaticInst::numOpdDWORDs(int operandIdx) +{ + return getOperandSize(operandIdx) <= 4 ? 1 + : getOperandSize(operandIdx) / 4; +} diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index ee5a98e77..88fd9f991 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -59,6 +59,7 @@ class GPUStaticInst : public GPUStaticInstFlags { public: GPUStaticInst(const std::string &opcode); + virtual ~GPUStaticInst() { } void instAddr(int inst_addr) { _instAddr = inst_addr; } int instAddr() const { return _instAddr; } int nextInstAddr() const { return _instAddr + instSize(); } @@ -71,15 +72,18 @@ class GPUStaticInst : public GPUStaticInstFlags int ipdInstNum() const { return _ipdInstNum; } + virtual TheGpuISA::ScalarRegU32 srcLiteral() const { return 0; } + virtual void execute(GPUDynInstPtr gpuDynInst) = 0; virtual void generateDisassembly() = 0; const std::string& disassemble(); virtual int getNumOperands() = 0; - virtual bool isCondRegister(int operandIndex) = 0; virtual bool isScalarRegister(int operandIndex) = 0; virtual bool isVectorRegister(int operandIndex) = 0; virtual bool isSrcOperand(int operandIndex) = 0; virtual bool isDstOperand(int operandIndex) = 0; + virtual bool isFlatScratchRegister(int opIdx) = 0; + virtual bool isExecMaskRegister(int opIdx) = 0; virtual int getOperandSize(int operandIndex) = 0; virtual int getRegisterIndex(int operandIndex, @@ -88,12 +92,24 @@ class GPUStaticInst : public GPUStaticInstFlags virtual int numDstRegOperands() = 0; virtual int numSrcRegOperands() = 0; - virtual bool isValid() const = 0; + virtual int coalescerTokenCount() const { return 0; } + + int numDstVecOperands(); + int numSrcVecOperands(); + int numDstVecDWORDs(); + int numSrcVecDWORDs(); + + int numOpdDWORDs(int operandIdx); bool isALU() const { return _flags[ALU]; } bool isBranch() const { return _flags[Branch]; } + bool isCondBranch() const { return _flags[CondBranch]; } bool isNop() const { return _flags[Nop]; } bool isReturn() const { return _flags[Return]; } + bool isEndOfKernel() const { return _flags[EndOfKernel]; } + bool isKernelLaunch() const { return _flags[KernelLaunch]; } + bool isSDWAInst() const { return _flags[IsSDWA]; } + bool isDPPInst() const { return _flags[IsDPP]; } bool isUnconditionalJump() const @@ -105,7 +121,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isWaitcnt() const { return _flags[Waitcnt]; } bool isBarrier() const { return _flags[MemBarrier]; } - bool isMemFence() const { return _flags[MemFence]; } + bool isMemSync() const { return _flags[MemSync]; } bool isMemRef() const { return _flags[MemoryRef]; } bool isFlat() const { return _flags[Flat]; } bool isLoad() const { return _flags[Load]; } @@ -125,6 +141,13 @@ class GPUStaticInst : public GPUStaticInstFlags bool writesSCC() const { return _flags[WritesSCC]; } bool readsVCC() const { return _flags[ReadsVCC]; } bool writesVCC() const { return _flags[WritesVCC]; } + // Identify instructions that implicitly read the Execute mask + // as a source operand but not to dictate which threads execute. 
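+    // (for example, the s_*_saveexec_b64 family reads EXEC as part
+    // of its semantics, as opposed to ordinary VALU ops that are
+    // merely predicated by it)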
+ bool readsEXEC() const { return _flags[ReadsEXEC]; } + bool writesEXEC() const { return _flags[WritesEXEC]; } + bool readsMode() const { return _flags[ReadsMode]; } + bool writesMode() const { return _flags[WritesMode]; } + bool ignoreExec() const { return _flags[IgnoreExec]; } bool isAtomicAnd() const { return _flags[AtomicAnd]; } bool isAtomicOr() const { return _flags[AtomicOr]; } @@ -166,34 +189,29 @@ class GPUStaticInst : public GPUStaticInstFlags bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; } bool isSpillSeg() const { return _flags[SpillSegment]; } - bool isWorkitemScope() const { return _flags[WorkitemScope]; } - bool isWavefrontScope() const { return _flags[WavefrontScope]; } - bool isWorkgroupScope() const { return _flags[WorkgroupScope]; } - bool isDeviceScope() const { return _flags[DeviceScope]; } - bool isSystemScope() const { return _flags[SystemScope]; } - bool isNoScope() const { return _flags[NoScope]; } - - bool isRelaxedOrder() const { return _flags[RelaxedOrder]; } - bool isAcquire() const { return _flags[Acquire]; } - bool isRelease() const { return _flags[Release]; } - bool isAcquireRelease() const { return _flags[AcquireRelease]; } - bool isNoOrder() const { return _flags[NoOrder]; } - /** - * Coherence domain of a memory instruction. Only valid for - * machine ISA. The coherence domain specifies where it is - * possible to perform memory synchronization, e.g., acquire - * or release, from the shader kernel. + * Coherence domain of a memory instruction. The coherence domain + * specifies where it is possible to perform memory synchronization + * (e.g., acquire or release) from the shader kernel. * - * isGloballyCoherent(): returns true if kernel is sharing memory - * with other work-items on the same device (GPU) + * isGloballyCoherent(): returns true if WIs share same device + * isSystemCoherent(): returns true if WIs or threads in different + * devices share memory * - * isSystemCoherent(): returns true if kernel is sharing memory - * with other work-items on a different device (GPU) or the host (CPU) */ bool isGloballyCoherent() const { return _flags[GloballyCoherent]; } bool isSystemCoherent() const { return _flags[SystemCoherent]; } + // Floating-point instructions + bool isF16() const { return _flags[F16]; } + bool isF32() const { return _flags[F32]; } + bool isF64() const { return _flags[F64]; } + + // FMA, MAC, MAD instructions + bool isFMA() const { return _flags[FMA]; } + bool isMAC() const { return _flags[MAC]; } + bool isMAD() const { return _flags[MAD]; } + virtual int instSize() const = 0; // only used for memory instructions @@ -217,37 +235,36 @@ class GPUStaticInst : public GPUStaticInstFlags // For flat memory accesses Enums::StorageClassType executed_as; - void setFlag(Flags flag) { _flags[flag] = true; } - - virtual void - execLdAcq(GPUDynInstPtr gpuDynInst) - { - fatal("calling execLdAcq() on a non-load instruction.\n"); - } - - virtual void - execSt(GPUDynInstPtr gpuDynInst) - { - fatal("calling execLdAcq() on a non-load instruction.\n"); - } - - virtual void - execAtomic(GPUDynInstPtr gpuDynInst) - { - fatal("calling execAtomic() on a non-atomic instruction.\n"); - } - - virtual void - execAtomicAcq(GPUDynInstPtr gpuDynInst) - { - fatal("calling execAtomicAcq() on a non-atomic instruction.\n"); + void setFlag(Flags flag) { + _flags[flag] = true; + + if (isGroupSeg()) { + executed_as = Enums::SC_GROUP; + } else if (isGlobalSeg()) { + executed_as = Enums::SC_GLOBAL; + } else if (isPrivateSeg()) { + executed_as = 
Enums::SC_PRIVATE; + } else if (isSpillSeg()) { + executed_as = Enums::SC_SPILL; + } else if (isReadOnlySeg()) { + executed_as = Enums::SC_READONLY; + } else if (isKernArgSeg()) { + executed_as = Enums::SC_KERNARG; + } else if (isArgSeg()) { + executed_as = Enums::SC_ARG; + } } + const std::string& opcode() const { return _opcode; } protected: - const std::string opcode; + const std::string _opcode; std::string disassembly; int _instNum; int _instAddr; + int srcVecOperands; + int dstVecOperands; + int srcVecDWORDs; + int dstVecDWORDs; /** * Identifier of the immediate post-dominator instruction. */ @@ -262,9 +279,9 @@ class KernelLaunchStaticInst : public GPUStaticInst KernelLaunchStaticInst() : GPUStaticInst("kernel_launch") { setFlag(Nop); + setFlag(KernelLaunch); + setFlag(MemSync); setFlag(Scalar); - setFlag(Acquire); - setFlag(SystemScope); setFlag(GlobalSegment); } @@ -277,11 +294,14 @@ class KernelLaunchStaticInst : public GPUStaticInst void generateDisassembly() override { - disassembly = opcode; + disassembly = _opcode; } int getNumOperands() override { return 0; } - bool isCondRegister(int operandIndex) override { return false; } + bool isFlatScratchRegister(int opIdx) override { return false; } + // return true if the Execute mask is explicitly used as a source + // register operand + bool isExecMaskRegister(int opIdx) override { return false; } bool isScalarRegister(int operandIndex) override { return false; } bool isVectorRegister(int operandIndex) override { return false; } bool isSrcOperand(int operandIndex) override { return false; } @@ -296,7 +316,6 @@ class KernelLaunchStaticInst : public GPUStaticInst int numDstRegOperands() override { return 0; } int numSrcRegOperands() override { return 0; } - bool isValid() const override { return true; } int instSize() const override { return 0; } }; diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc index 37a8b03a2..a37618d32 100644 --- a/src/gpu-compute/gpu_tlb.cc +++ b/src/gpu-compute/gpu_tlb.cc @@ -74,7 +74,6 @@ namespace X86ISA allocationPolicy = p->allocationPolicy; hasMemSidePort = false; accessDistance = p->accessDistance; - clock = p->clk_domain->clockPeriod(); tlb.assign(size, TlbEntry()); @@ -624,8 +623,8 @@ namespace X86ISA { bool delayedResponse; - return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false, - latency); + return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, + false, latency); } void @@ -803,13 +802,13 @@ namespace X86ISA } /* - * We now know the TLB lookup outcome (if it's a hit or a miss), as well - * as the TLB access latency. + * We now know the TLB lookup outcome (if it's a hit or a miss), as + * well as the TLB access latency. 
* * We create and schedule a new TLBEvent which will help us take the - * appropriate actions (e.g., update TLB on a hit, send request to lower - * level TLB on a miss, or start a page walk if this was the last-level - * TLB) + * appropriate actions (e.g., update TLB on a hit, send request to + * lower level TLB on a miss, or start a page walk if this was the + * last-level TLB) */ TLBEvent *tlb_event = new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); @@ -823,15 +822,15 @@ namespace X86ISA assert(tlb_event); DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", - curTick() + this->ticks(hitLatency)); + curTick() + cyclesToTicks(Cycles(hitLatency))); - schedule(tlb_event, curTick() + this->ticks(hitLatency)); + schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency))); } - GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome, - PacketPtr _pkt) - : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), - outcome(tlb_outcome), pkt(_pkt) + GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, + tlbOutcome tlb_outcome, PacketPtr _pkt) + : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), + outcome(tlb_outcome), pkt(_pkt) { } @@ -848,7 +847,8 @@ namespace X86ISA bool storeCheck = flags & (StoreCheck << FlagShift); // Do paging protection checks. - bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); + bool inUser + = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp)); @@ -874,10 +874,9 @@ namespace X86ISA * The latter calls handelHit with TLB miss as tlbOutcome. */ void - GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome, - PacketPtr pkt) + GpuTLB::handleTranslationReturn(Addr virt_page_addr, + tlbOutcome tlb_outcome, PacketPtr pkt) { - assert(pkt); Addr vaddr = pkt->req->getVaddr(); @@ -890,15 +889,18 @@ namespace X86ISA TlbEntry *local_entry, *new_entry; if (tlb_outcome == TLB_HIT) { - DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr); + DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", + vaddr); local_entry = sender_state->tlbEntry; } else { DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", vaddr); - // We are returning either from a page walk or from a hit at a lower - // TLB level. The senderState should be "carrying" a pointer to the - // correct TLBEntry. + /** + * We are returning either from a page walk or from a hit at a + * lower TLB level. The senderState should be "carrying" a pointer + * to the correct TLBEntry. + */ new_entry = sender_state->tlbEntry; assert(new_entry); local_entry = new_entry; @@ -1024,7 +1026,8 @@ namespace X86ISA TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; assert(tlb_event); tlb_event->updateOutcome(PAGE_WALK); - schedule(tlb_event, curTick() + ticks(missLatency2)); + schedule(tlb_event, + curTick() + cyclesToTicks(Cycles(missLatency2))); } } else if (outcome == PAGE_WALK) { if (update_stats) @@ -1095,7 +1098,7 @@ namespace X86ISA return virtPageAddr; } - /* + /** * recvTiming receives a coalesced timing request from a TLBCoalescer * and it calls issueTLBLookup() * It only rejects the packet if we have exceeded the max @@ -1145,9 +1148,11 @@ namespace X86ISA DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " "%#x\n", vaddr); - // We are returning either from a page walk or from a hit at a lower - // TLB level. 
The senderState should be "carrying" a pointer to the - // correct TLBEntry. + /** + * We are returning either from a page walk or from a hit at a + * lower TLB level. The senderState should be "carrying" a pointer + * to the correct TLBEntry. + */ new_entry = sender_state->tlbEntry; assert(new_entry); local_entry = new_entry; @@ -1267,8 +1272,8 @@ namespace X86ISA } else { // If this was a prefetch, then do the normal thing if it // was a successful translation. Otherwise, send an empty - // TLB entry back so that it can be figured out as empty and - // handled accordingly. + // TLB entry back so that it can be figured out as empty + // and handled accordingly. if (pte) { DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, pte->paddr); @@ -1343,7 +1348,7 @@ namespace X86ISA assert(virt_page_addr == tlb_event->getTLBEventVaddr()); tlb_event->updateOutcome(MISS_RETURN); - tlb->schedule(tlb_event, curTick()+tlb->ticks(1)); + tlb->schedule(tlb_event, curTick()+tlb->clockPeriod()); return true; } @@ -1393,8 +1398,8 @@ namespace X86ISA tmp_access_info.sumDistance = 0; tmp_access_info.meanDistance = 0; - ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr, - tmp_access_info)); + ret = TLBFootprint.insert( + AccessPatternTable::value_type(virt_page_addr, tmp_access_info)); bool first_page_access = ret.second; @@ -1428,74 +1433,74 @@ namespace X86ISA page_stat_file = simout.create(name().c_str())->stream(); // print header - *page_stat_file << "page,max_access_distance,mean_access_distance, " - << "stddev_distance" << std::endl; + *page_stat_file + << "page,max_access_distance,mean_access_distance, " + << "stddev_distance" << std::endl; } // update avg. reuse distance footprint - AccessPatternTable::iterator iter, iter_begin, iter_end; unsigned int sum_avg_reuse_distance_per_page = 0; // iterate through all pages seen by this TLB - for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) { - sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance / - iter->second.accessesPerPage; + for (auto &iter : TLBFootprint) { + sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance / + iter.second.accessesPerPage; if (accessDistance) { - unsigned int tmp = iter->second.localTLBAccesses[0]; + unsigned int tmp = iter.second.localTLBAccesses[0]; unsigned int prev = tmp; - for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) { if (i) { tmp = prev + 1; } - prev = iter->second.localTLBAccesses[i]; + prev = iter.second.localTLBAccesses[i]; // update the localTLBAccesses value // with the actual differece - iter->second.localTLBAccesses[i] -= tmp; + iter.second.localTLBAccesses[i] -= tmp; // compute the sum of AccessDistance per page // used later for mean - iter->second.sumDistance += - iter->second.localTLBAccesses[i]; + iter.second.sumDistance += + iter.second.localTLBAccesses[i]; } - iter->second.meanDistance = - iter->second.sumDistance / iter->second.accessesPerPage; + iter.second.meanDistance = + iter.second.sumDistance / iter.second.accessesPerPage; // compute std_dev and max (we need a second round because we // need to know the mean value unsigned int max_distance = 0; unsigned int stddev_distance = 0; - for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) { unsigned int tmp_access_distance = - iter->second.localTLBAccesses[i]; + iter.second.localTLBAccesses[i]; if (tmp_access_distance > max_distance) 
{ max_distance = tmp_access_distance; } unsigned int diff = - tmp_access_distance - iter->second.meanDistance; + tmp_access_distance - iter.second.meanDistance; stddev_distance += pow(diff, 2); } stddev_distance = - sqrt(stddev_distance/iter->second.accessesPerPage); + sqrt(stddev_distance/iter.second.accessesPerPage); if (page_stat_file) { - *page_stat_file << std::hex << iter->first << ","; + *page_stat_file << std::hex << iter.first << ","; *page_stat_file << std::dec << max_distance << ","; - *page_stat_file << std::dec << iter->second.meanDistance + *page_stat_file << std::dec << iter.second.meanDistance << ","; *page_stat_file << std::dec << stddev_distance; *page_stat_file << std::endl; } // erase the localTLBAccesses array - iter->second.localTLBAccesses.clear(); + iter.second.localTLBAccesses.clear(); } } diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh index dbd3a16f3..9186b33fe 100644 --- a/src/gpu-compute/gpu_tlb.hh +++ b/src/gpu-compute/gpu_tlb.hh @@ -69,26 +69,7 @@ namespace X86ISA uint32_t configAddress; - // TLB clock: will inherit clock from shader's clock period in terms - // of nuber of ticks of curTime (aka global simulation clock) - // The assignment of TLB clock from shader clock is done in the python - // config files. - int clock; - public: - // clock related functions ; maps to-and-from Simulation ticks and - // object clocks. - Tick frequency() const { return SimClock::Frequency / clock; } - - Tick - ticks(int numCycles) const - { - return (Tick)clock * numCycles; - } - - Tick curCycle() const { return curTick() / clock; } - Tick tickToCycles(Tick val) const { return val / clock;} - typedef X86GPUTLBParams Params; GpuTLB(const Params *p); ~GpuTLB(); diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh new file mode 100644 index 000000000..a6917db3e --- /dev/null +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2017-2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +/** + * @file + * HSAQueuEntry is the simulator's internal representation of an + * AQL queue entry (task). It encasulates all of the relevant info + * about a task, which is gathered from various runtime data + * structures including: the AQL MQD, the AQL packet, and the code + * object. + */ + +#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__ +#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__ + +#include +#include +#include +#include +#include + +#include "base/intmath.hh" +#include "base/types.hh" +#include "dev/hsa/hsa_packet.hh" +#include "dev/hsa/hsa_queue.hh" +#include "gpu-compute/kernel_code.hh" + +class HSAQueueEntry +{ + public: + HSAQueueEntry(std::string kernel_name, uint32_t queue_id, + int dispatch_id, void *disp_pkt, AMDKernelCode *akc, + Addr host_pkt_addr, Addr code_addr) + : kernName(kernel_name), + _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}}, + _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}}, + numVgprs(akc->workitem_vgpr_count), + numSgprs(akc->wavefront_sgpr_count), + _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt), + _hostDispPktAddr(host_pkt_addr), + _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt) + ->completion_signal), + codeAddress(code_addr), + kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address), + _outstandingInvs(-1), _outstandingWbs(0), + _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)-> + group_segment_size), + _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)-> + private_segment_size), + _contextId(0), _wgId{{ 0, 0, 0 }}, + _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0), + _globalWgId(0), dispatchComplete(false) + + { + initialVgprState.reset(); + initialSgprState.reset(); + + for (int i = 0; i < MAX_DIM; ++i) { + _numWg[i] = divCeil(_gridSize[i], _wgSize[i]); + _numWgTotal *= _numWg[i]; + } + + parseKernelCode(akc); + } + + const std::string& + kernelName() const + { + return kernName; + } + + int + wgSize(int dim) const + { + assert(dim < MAX_DIM); + return _wgSize[dim]; + } + + int + gridSize(int dim) const + { + assert(dim < MAX_DIM); + return _gridSize[dim]; + } + + int + numVectorRegs() const + { + return numVgprs; + } + + int + numScalarRegs() const + { + return numSgprs; + } + + uint32_t + queueId() const + { + return _queueId; + } + + int + dispatchId() const + { + return _dispatchId; + } + + void* + dispPktPtr() + { + return dispPkt; + } + + Addr + hostDispPktAddr() const + { + return _hostDispPktAddr; + } + + Addr + completionSignal() const + { + return _completionSignal; + } + + Addr + codeAddr() const + { + return codeAddress; + } + + Addr + kernargAddr() const + { + return kernargAddress; + } + + int + ldsSize() const + { + return _ldsSize; + } + + 
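+    // per-work-item private (scratch) segment size in bytes, taken
+    // from the AQL dispatch packet's private_segment_size field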
int privMemPerItem() const { return _privMemPerItem; } + + int + contextId() const + { + return _contextId; + } + + bool + dispComplete() const + { + return dispatchComplete; + } + + int + wgId(int dim) const + { + assert(dim < MAX_DIM); + return _wgId[dim]; + } + + void + wgId(int dim, int val) + { + assert(dim < MAX_DIM); + _wgId[dim] = val; + } + + int + globalWgId() const + { + return _globalWgId; + } + + void + globalWgId(int val) + { + _globalWgId = val; + } + + int + numWg(int dim) const + { + assert(dim < MAX_DIM); + return _numWg[dim]; + } + + void + notifyWgCompleted() + { + ++_numWgCompleted; + } + + int + numWgCompleted() const + { + return _numWgCompleted; + } + + int + numWgTotal() const + { + return _numWgTotal; + } + + void + markWgDispatch() + { + ++_wgId[0]; + ++_globalWgId; + + if (wgId(0) * wgSize(0) >= gridSize(0)) { + _wgId[0] = 0; + ++_wgId[1]; + + if (wgId(1) * wgSize(1) >= gridSize(1)) { + _wgId[1] = 0; + ++_wgId[2]; + + if (wgId(2) * wgSize(2) >= gridSize(2)) { + dispatchComplete = true; + } + } + } + } + + int + numWgAtBarrier() const + { + return numWgArrivedAtBarrier; + } + + bool vgprBitEnabled(int bit) const + { + return initialVgprState.test(bit); + } + + bool sgprBitEnabled(int bit) const + { + return initialSgprState.test(bit); + } + + /** + * Host-side addr of the amd_queue_t on which + * this task was queued. + */ + Addr hostAMDQueueAddr; + + /** + * Keep a copy of the AMD HSA queue because we + * need info from some of its fields to initialize + * register state. + */ + _amd_queue_t amdQueue; + + // the maximum number of dimensions for a grid or workgroup + const static int MAX_DIM = 3; + + /* getter */ + int + outstandingInvs() { + return _outstandingInvs; + } + + /** + * Whether invalidate has started or finished -1 is the + * initial value indicating inv has not started for the + * kernel. + */ + bool + isInvStarted() + { + return (_outstandingInvs != -1); + } + + /** + * update the number of pending invalidate requests + * + * val: negative to decrement, positive to increment + */ + void + updateOutstandingInvs(int val) + { + _outstandingInvs += val; + assert(_outstandingInvs >= 0); + } + + /** + * Forcefully change the state to be inv done. + */ + void + markInvDone() + { + _outstandingInvs = 0; + } + + /** + * Is invalidate done? + */ + bool + isInvDone() const + { + assert(_outstandingInvs >= 0); + return (_outstandingInvs == 0); + } + + int + outstandingWbs() const + { + return _outstandingWbs; + } + + /** + * Update the number of pending writeback requests. 
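+     * For example, the code that issues an end-of-kernel cache
+     * flush would be expected to increment this once per writeback
+     * request and decrement it as each one completes.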
+ * + * val: negative to decrement, positive to increment + */ + void + updateOutstandingWbs(int val) + { + _outstandingWbs += val; + assert(_outstandingWbs >= 0); + } + + private: + void + parseKernelCode(AMDKernelCode *akc) + { + /** set the enable bits for the initial SGPR state */ + initialSgprState.set(PrivateSegBuf, + akc->enable_sgpr_private_segment_buffer); + initialSgprState.set(DispatchPtr, + akc->enable_sgpr_dispatch_ptr); + initialSgprState.set(QueuePtr, + akc->enable_sgpr_queue_ptr); + initialSgprState.set(KernargSegPtr, + akc->enable_sgpr_kernarg_segment_ptr); + initialSgprState.set(DispatchId, + akc->enable_sgpr_dispatch_id); + initialSgprState.set(FlatScratchInit, + akc->enable_sgpr_flat_scratch_init); + initialSgprState.set(PrivateSegSize, + akc->enable_sgpr_private_segment_size); + initialSgprState.set(GridWorkgroupCountX, + akc->enable_sgpr_grid_workgroup_count_x); + initialSgprState.set(GridWorkgroupCountY, + akc->enable_sgpr_grid_workgroup_count_y); + initialSgprState.set(GridWorkgroupCountZ, + akc->enable_sgpr_grid_workgroup_count_z); + initialSgprState.set(WorkgroupIdX, + akc->enable_sgpr_workgroup_id_x); + initialSgprState.set(WorkgroupIdY, + akc->enable_sgpr_workgroup_id_y); + initialSgprState.set(WorkgroupIdZ, + akc->enable_sgpr_workgroup_id_z); + initialSgprState.set(WorkgroupInfo, + akc->enable_sgpr_workgroup_info); + initialSgprState.set(PrivSegWaveByteOffset, + akc->enable_sgpr_private_segment_wave_byte_offset); + + /** + * set the enable bits for the initial VGPR state. the + * workitem Id in the X dimension is always initialized. + */ + initialVgprState.set(WorkitemIdX, true); + initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y); + initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z); + } + + // name of the kernel associated with the AQL entry + std::string kernName; + // workgroup Size (3 dimensions) + std::array _wgSize; + // grid Size (3 dimensions) + std::array _gridSize; + // total number of VGPRs per work-item + int numVgprs; + // total number of SGPRs per wavefront + int numSgprs; + // id of AQL queue in which this entry is placed + uint32_t _queueId; + int _dispatchId; + // raw AQL packet pointer + void *dispPkt; + // host-side addr of the dispatch packet + Addr _hostDispPktAddr; + // pointer to bool + Addr _completionSignal; + // base address of the raw machine code + Addr codeAddress; + // base address of the kernel args + Addr kernargAddress; + /** + * Number of outstanding invs for the kernel. 
+ * values: + * -1: initial value, invalidate has not started for the kernel + * 0: 1)-1->0, about to start (a transient state, added in the same cycle) + * 2)+1->0, all inv requests are finished, i.e., invalidate done + * ?: positive value, indicating the number of pending inv requests + */ + int _outstandingInvs; + /** + * Number of outstanding wbs for the kernel + * values: + * 0: 1)initial value, flush has not started for the kernel + * 2)+1->0: all wb requests are finished, i.e., flush done + * ?: positive value, indicating the number of pending wb requests + */ + int _outstandingWbs; + int _ldsSize; + int _privMemPerItem; + int _contextId; + std::array _wgId; + std::array _numWg; + int _numWgTotal; + int numWgArrivedAtBarrier; + // The number of completed work groups + int _numWgCompleted; + int _globalWgId; + bool dispatchComplete; + + std::bitset initialVgprState; + std::bitset initialSgprState; +}; + +#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__ diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh new file mode 100644 index 000000000..b3560c7e5 --- /dev/null +++ b/src/gpu-compute/kernel_code.hh @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +#ifndef __GPU_COMPUTE_KERNEL_CODE_HH__ +#define __GPU_COMPUTE_KERNEL_CODE_HH__ + +#include +#include + +/** + * these enums represent the indices into the + * initialRegState bitfields in HsaKernelInfo. + * each bit specifies whether or not the + * particular piece of state that the bit + * corresponds to should be initialized into + * the VGPRs/SGPRs. the order in which the + * fields are placed matters, as all enabled + * pieces of state will be initialized into + * contiguous registers in the same order + * as their position in the bitfield - which + * is specified in the HSA ABI. 
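+ * For example, if a kernel enables only DispatchPtr and
+ * KernargSegPtr below, the 64b dispatch packet pointer would be
+ * preloaded into the first user SGPR pair and the 64b kernarg
+ * pointer into the next pair (an illustrative reading of the
+ * packing rule described above).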
+ */ +enum ScalarRegInitFields : int +{ + PrivateSegBuf = 0, + DispatchPtr = 1, + QueuePtr = 2, + KernargSegPtr = 3, + DispatchId = 4, + FlatScratchInit = 5, + PrivateSegSize = 6, + GridWorkgroupCountX = 7, + GridWorkgroupCountY = 8, + GridWorkgroupCountZ = 9, + WorkgroupIdX = 10, + WorkgroupIdY = 11, + WorkgroupIdZ = 12, + WorkgroupInfo = 13, + PrivSegWaveByteOffset = 14, + NumScalarInitFields = 15 +}; + +enum VectorRegInitFields : int +{ + WorkitemIdX = 0, + WorkitemIdY = 1, + WorkitemIdZ = 2, + NumVectorInitFields = 3 +}; + +struct AMDKernelCode +{ + uint32_t amd_kernel_code_version_major; + uint32_t amd_kernel_code_version_minor; + uint16_t amd_machine_kind; + uint16_t amd_machine_version_major; + uint16_t amd_machine_version_minor; + uint16_t amd_machine_version_stepping; + int64_t kernel_code_entry_byte_offset; + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + uint64_t max_scratch_backing_memory_byte_size; + + /** + * The fields below are used to set program settings for + * compute shaders. Here they are primarily used to setup + * initial register state. See the following for full details + * about kernel launch, state initialization, and the AMD kernel + * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/ + * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst + * #initial-kernel-register-state + */ + + // the 32b below here represent the fields of + // the COMPUTE_PGM_RSRC1 register + uint32_t granulated_workitem_vgpr_count : 6; + uint32_t granulated_wavefront_sgpr_count : 4; + uint32_t priority : 2; + uint32_t float_mode_round_32 : 2; + uint32_t float_mode_round_16_64 : 2; + uint32_t float_mode_denorm_32 : 2; + uint32_t float_mode_denorm_16_64 : 2; + uint32_t priv : 1; + uint32_t enable_dx10_clamp : 1; + uint32_t debug_mode : 1; + uint32_t enable_ieee_mode : 1; + uint32_t bulky : 1; + uint32_t cdbg_user : 1; + uint32_t compute_pgm_rsrc1_reserved : 6; + // end COMPUTE_PGM_RSRC1 register + + // the 32b below here represent the fields of + // the COMPUTE_PGM_RSRC2 register + uint32_t enable_sgpr_private_segment_wave_byte_offset : 1; + uint32_t user_sgpr_count : 5; + uint32_t enable_trap_handler : 1; + uint32_t enable_sgpr_workgroup_id_x : 1; + uint32_t enable_sgpr_workgroup_id_y : 1; + uint32_t enable_sgpr_workgroup_id_z : 1; + uint32_t enable_sgpr_workgroup_info : 1; + uint32_t enable_vgpr_workitem_id_y : 1; + uint32_t enable_vgpr_workitem_id_z : 1; + uint32_t enable_exception_address_watch : 1; + uint32_t enable_exception_memory_violation : 1; + uint32_t granulated_lds_size : 9; + uint32_t enable_exception_ieee_754_fp_invalid_operation : 1; + uint32_t enable_exception_fp_denormal_source : 1; + uint32_t enable_exception_ieee_754_fp_division_by_zero : 1; + uint32_t enable_exception_ieee_754_fp_overflow : 1; + uint32_t enable_exception_ieee_754_fp_underflow : 1; + uint32_t enable_exception_ieee_754_fp_inexact : 1; + uint32_t enable_exception_int_divide_by_zero : 1; + uint32_t compute_pgm_rsrc2_reserved : 1; + // end COMPUTE_PGM_RSRC2 + + // the 32b below here represent the fields of + // KERNEL_CODE_PROPERTIES + uint32_t enable_sgpr_private_segment_buffer : 1; + uint32_t enable_sgpr_dispatch_ptr : 1; + uint32_t enable_sgpr_queue_ptr : 1; + uint32_t enable_sgpr_kernarg_segment_ptr : 1; + uint32_t enable_sgpr_dispatch_id : 1; + uint32_t enable_sgpr_flat_scratch_init : 1; + uint32_t enable_sgpr_private_segment_size : 1; + uint32_t enable_sgpr_grid_workgroup_count_x : 1; + uint32_t enable_sgpr_grid_workgroup_count_y : 1; + 
uint32_t enable_sgpr_grid_workgroup_count_z : 1; + uint32_t kernel_code_properties_reserved1 : 6; + uint32_t enable_ordered_append_gds : 1; + uint32_t private_element_size : 2; + uint32_t is_ptr64 : 1; + uint32_t is_dynamic_callstack : 1; + uint32_t is_debug_enabled : 1; + uint32_t is_xnack_enabled : 1; + uint32_t kernel_code_properties_reserved2 : 9; + // end KERNEL_CODE_PROPERTIES + + uint32_t workitem_private_segment_byte_size; + uint32_t workgroup_group_segment_byte_size; + uint32_t gds_segment_byte_size; + uint64_t kernarg_segment_byte_size; + uint32_t workgroup_fbarrier_count; + uint16_t wavefront_sgpr_count; + uint16_t workitem_vgpr_count; + uint16_t reserved_vgpr_first; + uint16_t reserved_vgpr_count; + uint16_t reserved_sgpr_first; + uint16_t reserved_sgpr_count; + uint16_t debug_wavefront_private_segment_offset_sgpr; + uint16_t debug_private_segment_buffer_sgpr; + uint8_t kernarg_segment_alignment; + uint8_t group_segment_alignment; + uint8_t private_segment_alignment; + uint8_t wavefront_size; + int32_t call_convention; + uint8_t reserved[12]; + uint64_t runtime_loader_kernel_symbol; + uint64_t control_directives[16]; +}; + +#endif // __GPU_COMPUTE_KERNEL_CODE_HH__ diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc index d56562b79..58c5d986e 100644 --- a/src/gpu-compute/lds_state.cc +++ b/src/gpu-compute/lds_state.cc @@ -210,8 +210,8 @@ LdsState::processPacket(PacketPtr packet) parent->loadBusLength(); // delay for accessing the LDS Tick processingTime = - parent->shader->ticks(bankConflicts * bankConflictPenalty) + - parent->shader->ticks(busLength); + parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) + + parent->cyclesToTicks(Cycles(busLength)); // choose (delay + last packet in queue) or (now + delay) as the time to // return this Tick doneAt = earliestReturnTime() + processingTime; diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index c4934a657..58171e30c 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -41,7 +41,6 @@ #include #include -#include "enums/MemType.hh" #include "gpu-compute/misc.hh" #include "mem/port.hh" #include "params/LdsState.hh" @@ -50,8 +49,8 @@ class ComputeUnit; /** - * this represents a slice of the overall LDS, intended to be associated with an - * individual workgroup + * this represents a slice of the overall LDS, intended to be associated with + * an individual workgroup */ class LdsChunk { @@ -71,7 +70,8 @@ class LdsChunk read(const uint32_t index) { fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0"); - fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS " + "chunk"); T *p0 = (T *) (&(chunk.at(index))); return *p0; } @@ -84,7 +84,8 @@ class LdsChunk write(const uint32_t index, const T value) { fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0"); - fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS " + "chunk"); T *p0 = (T *) (&(chunk.at(index))); *p0 = value; } @@ -203,14 +204,16 @@ class LdsState: public ClockedObject protected: - // the lds reference counter - // The key is the workgroup ID and dispatch ID - // The value is the number of wavefronts that reference this LDS, as - // wavefronts are launched, the counter goes up for that workgroup and when - // they return it decreases, once it reaches 0 then this chunk of the LDS is - // returned to the 
available pool. However,it is deallocated on the 1->0 - // transition, not whenever the counter is 0 as it always starts with 0 when - // the workgroup asks for space + /** + * the lds reference counter + * The key is the workgroup ID and dispatch ID + * The value is the number of wavefronts that reference this LDS, as + * wavefronts are launched, the counter goes up for that workgroup and when + * they return it decreases, once it reaches 0 then this chunk of the LDS + * is returned to the available pool. However,it is deallocated on the 1->0 + * transition, not whenever the counter is 0 as it always starts with 0 + * when the workgroup asks for space + */ std::unordered_map> refCounter; @@ -356,22 +359,41 @@ class LdsState: public ClockedObject const uint32_t size) { if (chunkMap.find(dispatchId) != chunkMap.end()) { - fatal_if( + panic_if( chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(), "duplicate workgroup ID asking for space in the LDS " "did[%d] wgid[%d]", dispatchId, wgId); } - fatal_if(bytesAllocated + size > maximumSize, - "request would ask for more space than is available"); + if (bytesAllocated + size > maximumSize) { + return nullptr; + } else { + bytesAllocated += size; + + auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); + panic_if(!value.second, "was unable to allocate a new chunkMap"); + + // make an entry for this workgroup + refCounter[dispatchId][wgId] = 0; - bytesAllocated += size; + return &chunkMap[dispatchId][wgId]; + } + } + + /* + * return pointer to lds chunk for wgid + */ + LdsChunk * + getLdsChunk(const uint32_t dispatchId, const uint32_t wgId) + { + fatal_if(chunkMap.find(dispatchId) == chunkMap.end(), + "fetch for unknown dispatch ID did[%d]", dispatchId); - chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); - // make an entry for this workgroup - refCounter[dispatchId][wgId] = 0; + fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(), + "fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]", + wgId, dispatchId); - return &chunkMap[dispatchId][wgId]; + return &chunkMap[dispatchId][wgId]; } bool diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 68c5afa4a..b31ed6f4a 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -33,6 +33,7 @@ #include "gpu-compute/local_memory_pipeline.hh" +#include "debug/GPUMem.hh" #include "debug/GPUPort.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" @@ -62,24 +63,31 @@ LocalMemPipeline::exec() bool accessVrf = true; Wavefront *w = nullptr; - if ((m) && (m->isLoad() || m->isAtomicRet())) { + if ((m) && m->latency.rdy() && (m->isLoad() || m->isAtomicRet())) { w = m->wavefront(); - accessVrf = - w->computeUnit->vrf[w->simdId]-> - vrfOperandAccessReady(m->seqNum(), w, m, - VrfAccessType::WRITE); + accessVrf = w->computeUnit->vrf[w->simdId]-> + canScheduleWriteOperandsFromLoad(w, m); + } if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && - computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return - || computeUnit->wfWait.at(m->pipeId).rdy())) { + computeUnit->locMemToVrfBus.rdy() + && (computeUnit->shader->coissue_return + || computeUnit->vectorSharedMemUnit.rdy())) { lmReturnedRequests.pop(); w = m->wavefront(); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n", + m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m); + if (m->isLoad() || m->isAtomicRet()) { + 
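            // Loads and atomics-with-return carry data destined for the VRF;
            // canScheduleWriteOperandsFromLoad() was consulted at the top of
            // exec(), so the register file can now queue the writeback of the
            // returned values.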
w->computeUnit->vrf[w->simdId]-> + scheduleWriteOperandsFromLoad(w, m); + } + // Decrement outstanding request count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); @@ -96,7 +104,7 @@ LocalMemPipeline::exec() // Mark write bus busy for appropriate amount of time computeUnit->locMemToVrfBus.set(m->time); if (computeUnit->shader->coissue_return == 0) - w->computeUnit->wfWait.at(m->pipeId).set(m->time); + w->computeUnit->vectorSharedMemUnit.set(m->time); } // If pipeline has executed a local memory instruction @@ -114,6 +122,13 @@ LocalMemPipeline::exec() } } +void +LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) +{ + gpuDynInst->setAccessTime(curTick()); + lmIssuedRequests.push(gpuDynInst); +} + void LocalMemPipeline::regStats() { diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh index dba938d6a..d9ab485b2 100644 --- a/src/gpu-compute/local_memory_pipeline.hh +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -58,10 +58,11 @@ class LocalMemPipeline LocalMemPipeline(const ComputeUnitParams *params); void init(ComputeUnit *cu); void exec(); - - std::queue &getLMReqFIFO() { return lmIssuedRequests; } std::queue &getLMRespFIFO() { return lmReturnedRequests; } + void issueRequest(GPUDynInstPtr gpuDynInst); + + bool isLMRespFIFOWrRdy() const { diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh index 731a9977a..0b573e8fe 100644 --- a/src/gpu-compute/misc.hh +++ b/src/gpu-compute/misc.hh @@ -39,34 +39,62 @@ #include #include "base/logging.hh" +#include "sim/clocked_object.hh" class GPUDynInst; -typedef std::bitset::digits> VectorMask; +typedef std::bitset::digits> + VectorMask; typedef std::shared_ptr GPUDynInstPtr; +enum InstMemoryHop : int { + Initiate = 0, + CoalsrSend = 1, + CoalsrRecv = 2, + GMEnqueue = 3, + Complete = 4, + InstMemoryHopMax = 5 +}; + +enum BlockMemoryHop : int { + BlockSend = 0, + BlockRecv = 1 +}; + class WaitClass { public: - WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { } - void init(uint64_t *_tcnt, uint32_t _numStages=0) + WaitClass() : nxtAvail(0), lookAheadAvail(0), clockedObject(nullptr) { } + + WaitClass(ClockedObject *_clockedObject, uint64_t _numStages=0) + : nxtAvail(0), lookAheadAvail(0), clockedObject(_clockedObject), + numStages(_numStages) { } + + void init(ClockedObject *_clockedObject, uint64_t _numStages=0) { - tcnt = _tcnt; + clockedObject = _clockedObject; numStages = _numStages; } - void set(uint32_t i) + void set(uint64_t i) { - fatal_if(nxtAvail > *tcnt, + fatal_if(nxtAvail > clockedObject->clockEdge(), "Can't allocate resource because it is busy!!!"); - nxtAvail = *tcnt + i; + nxtAvail = clockedObject->clockEdge() + i; + } + void preset(uint64_t delay) + { + lookAheadAvail = std::max(lookAheadAvail, delay + + (clockedObject->clockEdge()) - numStages); + } + bool rdy(Cycles cycles = Cycles(0)) const + { + return clockedObject->clockEdge(cycles) >= nxtAvail; } - void preset(uint32_t delay) + bool prerdy() const { - lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages); + return clockedObject->clockEdge() >= lookAheadAvail; } - bool rdy() const { return *tcnt >= nxtAvail; } - bool prerdy() const { return *tcnt >= lookAheadAvail; } private: // timestamp indicating when resource will be available @@ -75,11 +103,11 @@ class WaitClass // pending uses of the resource (when there is a cycle gap between // rdy() and set() uint64_t lookAheadAvail; - // current timestamp - uint64_t *tcnt; + // clockedObject for current timestamp + ClockedObject 
*clockedObject; // number of stages between checking if a resource is ready and // setting the resource's utilization - uint32_t numStages; + uint64_t numStages; }; class Float16 @@ -93,7 +121,7 @@ class Float16 Float16(float x) { - uint32_t ai = *(uint32_t *)&x; + uint32_t ai = *(reinterpret_cast(&x)); uint32_t s = (ai >> 31) & 0x1; uint32_t exp = (ai >> 23) & 0xff; @@ -139,7 +167,7 @@ class Float16 val1 |= (exp << 23); val1 |= (mant << 13); - return *(float*)&val1; + return *(reinterpret_cast(&val1)); } }; diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc index 890e0d112..6c95ca25a 100644 --- a/src/gpu-compute/pool_manager.cc +++ b/src/gpu-compute/pool_manager.cc @@ -33,8 +33,8 @@ #include "gpu-compute/pool_manager.hh" -PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize) - : _minAllocation(minAlloc), _poolSize(poolSize) +PoolManager::PoolManager(const PoolManagerParams *p) + : SimObject(p), _minAllocation(p->min_alloc), _poolSize(p->pool_size) { - assert(poolSize > 0); + assert(_poolSize > 0); } diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh index bab8b6ddf..9bbaa6459 100644 --- a/src/gpu-compute/pool_manager.hh +++ b/src/gpu-compute/pool_manager.hh @@ -38,11 +38,15 @@ #include #include +#include "params/PoolManager.hh" +#include "sim/sim_object.hh" + // Pool Manager Logic -class PoolManager +class PoolManager : public SimObject { public: - PoolManager(uint32_t minAlloc, uint32_t poolSize); + PoolManager(const PoolManagerParams *p); + virtual ~PoolManager() { _poolSize = 0; } uint32_t minAllocation() { return _minAllocation; } virtual std::string printRegion() = 0; virtual uint32_t regionSize(std::pair ®ion) = 0; diff --git a/src/gpu-compute/register_file.cc b/src/gpu-compute/register_file.cc new file mode 100644 index 000000000..eb6474cd2 --- /dev/null +++ b/src/gpu-compute/register_file.cc @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#include "gpu-compute/register_file.hh" + +#include +#include + +#include "base/intmath.hh" +#include "base/logging.hh" +#include "debug/GPURF.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/RegisterFile.hh" + +RegisterFile::RegisterFile(const RegisterFileParams *p) + : SimObject(p), simdId(p->simd_id), _numRegs(p->num_regs) +{ + fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n"); + fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + + busy.clear(); + busy.resize(_numRegs, 0); +} + +RegisterFile::~RegisterFile() +{ +} + +void +RegisterFile::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; +} + +std::string +RegisterFile::dump() const +{ + std::stringstream ss; + ss << "Busy: "; + for (int i = 0; i < busy.size(); i++) { + ss << (int)busy[i]; + } + ss << "\n"; + return ss.str(); +} + +// Scoreboard functions + +bool +RegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + return true; +} + +bool +RegisterFile::regBusy(int idx) const +{ + return busy.at(idx); +} + +void +RegisterFile::markReg(int regIdx, bool value) +{ + DPRINTF(GPURF, "SIMD[%d] markReg(): physReg[%d] = %d\n", + simdId, regIdx, (int)value); + busy.at(regIdx) = value; +} + +void +RegisterFile::enqRegFreeEvent(uint32_t regIdx, uint64_t delay) +{ + DPRINTF(GPURF, "SIMD[%d] enqRegFreeEvent physReg[%d] at %llu\n", + simdId, regIdx, curTick() + delay); + schedule(new MarkRegFreeScbEvent(this, regIdx), + curTick() + delay); +} + +void +RegisterFile::enqRegBusyEvent(uint32_t regIdx, uint64_t delay) +{ + DPRINTF(GPURF, "SIMD[%d] enqRegBusyEvent physReg[%d] at %llu\n", + simdId, regIdx, curTick() + delay); + schedule(new MarkRegBusyScbEvent(this, regIdx), + curTick() + delay); +} + +// Schedule functions +bool +RegisterFile::canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +void +RegisterFile::scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii) +{ +} + +bool +RegisterFile::canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +void +RegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) +{ +} + +bool +RegisterFile::canScheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +void +RegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii) +{ +} + +bool +RegisterFile::operandReadComplete(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +// Exec functions +void +RegisterFile::exec() +{ +} + +void +RegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) +{ +} + +RegisterFile* +RegisterFileParams::create() +{ + return new RegisterFile(this); +} + +// Events + +// Mark a register as free in the scoreboard/busy vector +void +RegisterFile::MarkRegFreeScbEvent::process() +{ + rf->markReg(regIdx, false); +} + +// Mark a register as busy in the scoreboard/busy vector 
+void +RegisterFile::MarkRegBusyScbEvent::process() +{ + rf->markReg(regIdx, true); +} + +void +RegisterFile::dispatchInstruction(GPUDynInstPtr ii) +{ +} + +void +RegisterFile::regStats() +{ + registerReads + .name(name() + ".register_reads") + .desc("Total number of DWORDs read from register file") + ; + + registerWrites + .name(name() + ".register_writes") + .desc("Total number of DWORDS written to register file") + ; + + sramReads + .name(name() + ".sram_reads") + .desc("Total number of register file bank SRAM activations for reads") + ; + + sramWrites + .name(name() + ".sram_writes") + .desc("Total number of register file bank SRAM activations for writes") + ; +} diff --git a/src/gpu-compute/register_file.hh b/src/gpu-compute/register_file.hh new file mode 100644 index 000000000..4bd705a5e --- /dev/null +++ b/src/gpu-compute/register_file.hh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#ifndef __REGISTER_FILE_HH__ +#define __REGISTER_FILE_HH__ + +#include +#include + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/misc.hh" +#include "sim/sim_object.hh" + +class ComputeUnit; +class Shader; +class PoolManager; +class Wavefront; + +struct RegisterFileParams; + +// Abstract Register File +// This register file class can be inherited from to create both +// scalar and vector register files. 
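// The base class keeps a simple scoreboard: a per-physical-register busy
// vector plus events that mark a register busy or free after a delay.
// A rough sketch of how the hooks declared below fit together (illustrative
// only; the actual call sites are the schedule stage and the memory
// pipelines):
//
//   if (rf->canScheduleReadOperands(w, ii)) {
//       rf->scheduleReadOperands(w, ii);   // begin reading source operands
//   }
//   ...
//   rf->enqRegBusyEvent(physReg, delay);   // reserve a destination register
//   rf->enqRegFreeEvent(physReg, delay);   // free it once the write lands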
+class RegisterFile : public SimObject +{ + public: + RegisterFile(const RegisterFileParams *p); + virtual ~RegisterFile(); + virtual void setParent(ComputeUnit *_computeUnit); + int numRegs() const { return _numRegs; } + virtual void regStats() override; + + // State functions + + // Scoreboard functions + virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; + virtual bool regBusy(int idx) const; + virtual void markReg(int regIdx, bool value); + + // Abstract Register Event + class RegisterEvent : public Event + { + protected: + RegisterFile *rf; + int regIdx; + + public: + RegisterEvent(RegisterFile *_rf, int _regIdx) + : rf(_rf), regIdx(_regIdx) { setFlags(AutoDelete); } + }; + + // Register Event to mark a register as free in the scoreboard/busy vector + class MarkRegFreeScbEvent : public RegisterEvent + { + public: + MarkRegFreeScbEvent(RegisterFile *_rf, int _regIdx) + : RegisterEvent(_rf, _regIdx) { } + void process(); + }; + + // Register Event to mark a register as busy in the scoreboard/busy vector + class MarkRegBusyScbEvent : public RegisterEvent + { + public: + MarkRegBusyScbEvent(RegisterFile *_rf, int _regIdx) + : RegisterEvent(_rf, _regIdx) { } + void process(); + }; + + // Schedule an event to mark a register as free/busy in + // the scoreboard/busy vector. Delay is already in Ticks + virtual void enqRegFreeEvent(uint32_t regIdx, uint64_t delay); + virtual void enqRegBusyEvent(uint32_t regIdx, uint64_t delay); + + // Schedule functions + + // The following functions are called by the SCH stage when attempting + // to move a wave from the readyList to the schList. + // canSchedule* checks if the RF is ready to provide operands for + // the instruction, while schedule* requests the RF to begin reading + // and writing of operands. Calling schedule* may only occur + // immediately after canSchedule* was called and returned True + virtual bool canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii); + virtual bool canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii); + virtual void scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii); + virtual void scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii); + + // The following function is called to check if all operands + // have been read for the given instruction + virtual bool operandReadComplete(Wavefront *w, GPUDynInstPtr ii); + + // The following two functions are only called by returning loads to + // check if the register file can support the incoming writes + virtual bool canScheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii); + // Queue the register writes. Assumes canScheduleWriteOperandsFromLoad + // was called immediately prior and returned True + virtual void scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii); + + // ExecRF is invoked every cycle by the compute unit and may be + // used to model detailed timing of the register file. 
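    // (The base implementation is a no-op; a derived register file could,
    // for instance, use exec() to drain a queue of pending bank reads and
    // writes each cycle.)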
+ virtual void exec(); + + // Called to inform RF that an instruction is executing + // to schedule events for writeback, etc., as needed + virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii); + + // Debug functions + virtual std::string dump() const; + + virtual void dispatchInstruction(GPUDynInstPtr ii); + + protected: + ComputeUnit* computeUnit; + int simdId; + + // flag indicating if a register is busy + std::vector busy; + + // numer of registers in this register file + int _numRegs; + // Stats + // Total number of register reads, incremented once per DWORD per thread + Stats::Scalar registerReads; + // Total number of register writes, incremented once per DWORD per thread + Stats::Scalar registerWrites; + + // Number of register file SRAM activations for reads. + // The register file may be implemented with multiple SRAMs. This stat + // tracks how many times the SRAMs are accessed for reads. + Stats::Scalar sramReads; + // Number of register file SRAM activations for writes + Stats::Scalar sramWrites; +}; + +#endif // __REGISTER_FILE_HH__ diff --git a/src/gpu-compute/register_manager.cc b/src/gpu-compute/register_manager.cc new file mode 100644 index 000000000..65c126066 --- /dev/null +++ b/src/gpu-compute/register_manager.cc @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Mark Wyse + */ + +#include "gpu-compute/register_manager.hh" + +#include "config/the_gpu_isa.hh" +#include "debug/GPURename.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/scalar_register_file.hh" +#include "gpu-compute/static_register_manager_policy.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" +#include "params/RegisterManager.hh" + +RegisterManager::RegisterManager(const RegisterManagerParams *p) + : SimObject(p), srfPoolMgrs(p->srf_pool_managers), + vrfPoolMgrs(p->vrf_pool_managers) +{ + if (p->policy == "static") { + policy = new StaticRegisterManagerPolicy(); + } else { + fatal("Unimplemented Register Manager Policy"); + } + +} + +RegisterManager::~RegisterManager() +{ + for (auto mgr : srfPoolMgrs) { + delete mgr; + } + for (auto mgr : vrfPoolMgrs) { + delete mgr; + } +} + +void +RegisterManager::exec() +{ + policy->exec(); +} + +void +RegisterManager::setParent(ComputeUnit *cu) +{ + computeUnit = cu; + policy->setParent(computeUnit); + for (int i = 0; i < srfPoolMgrs.size(); i++) { + fatal_if(computeUnit->srf[i]->numRegs() % + srfPoolMgrs[i]->minAllocation(), + "Min SGPR allocation is not multiple of VRF size\n"); + } + for (int i = 0; i < vrfPoolMgrs.size(); i++) { + fatal_if(computeUnit->vrf[i]->numRegs() % + vrfPoolMgrs[i]->minAllocation(), + "Min VGPG allocation is not multiple of VRF size\n"); + } +} + +// compute mapping for vector register +int +RegisterManager::mapVgpr(Wavefront* w, int vgprIndex) +{ + return policy->mapVgpr(w, vgprIndex); +} + +// compute mapping for scalar register +int +RegisterManager::mapSgpr(Wavefront* w, int sgprIndex) +{ + return policy->mapSgpr(w, sgprIndex); +} + +// check if we can allocate registers +bool +RegisterManager::canAllocateVgprs(int simdId, int nWfs, int demandPerWf) +{ + return policy->canAllocateVgprs(simdId, nWfs, demandPerWf); +} + +bool +RegisterManager::canAllocateSgprs(int simdId, int nWfs, int demandPerWf) +{ + return policy->canAllocateSgprs(simdId, nWfs, demandPerWf); +} + +// allocate registers +void +RegisterManager::allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) +{ + policy->allocateRegisters(w, vectorDemand, scalarDemand); +} + +void +RegisterManager::freeRegisters(Wavefront* w) +{ + policy->freeRegisters(w); +} + +void +RegisterManager::regStats() +{ + policy->regStats(); +} + +RegisterManager* +RegisterManagerParams::create() +{ + return new RegisterManager(this); +} diff --git a/src/gpu-compute/register_manager.hh b/src/gpu-compute/register_manager.hh new file mode 100644 index 000000000..60acf9533 --- /dev/null +++ b/src/gpu-compute/register_manager.hh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Mark Wyse + */ + +#ifndef __REGISTER_MANAGER_HH__ +#define __REGISTER_MANAGER_HH__ + +#include +#include +#include +#include +#include + +#include "gpu-compute/pool_manager.hh" +#include "gpu-compute/register_manager_policy.hh" +#include "sim/sim_object.hh" +#include "sim/stats.hh" + +class ComputeUnit; +class Wavefront; + +struct RegisterManagerParams; + +/* + * Rename stage. + */ +class RegisterManager : public SimObject +{ + public: + RegisterManager(const RegisterManagerParams* params); + ~RegisterManager(); + void setParent(ComputeUnit *cu); + void exec(); + + // Stats related variables and methods + void regStats(); + + // lookup virtual to physical register translation + int mapVgpr(Wavefront* w, int vgprIndex); + int mapSgpr(Wavefront* w, int sgprIndex); + + // check if we can allocate registers + bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf); + bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf); + + // allocate registers + void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand); + + // free all registers used by the WF + void freeRegisters(Wavefront *w); + + std::vector srfPoolMgrs; + std::vector vrfPoolMgrs; + + private: + RegisterManagerPolicy *policy; + + ComputeUnit *computeUnit; + + std::string _name; +}; + +#endif // __REGISTER_MANAGER_HH__ diff --git a/src/gpu-compute/register_manager_policy.hh b/src/gpu-compute/register_manager_policy.hh new file mode 100644 index 000000000..2a5a2eb1e --- /dev/null +++ b/src/gpu-compute/register_manager_policy.hh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Mark Wyse + */ + +#ifndef __REGISTER_MANAGER_POLICY_HH__ +#define __REGISTER_MANAGER_POLICY_HH__ + +#include + +class ComputeUnit; +class HSAQueueEntry; +class Wavefront; + +/** + * Register Manager Policy abstract class + * + * A Register Manager Policy implements all of the functionality + * of the Register Manager, including register mapping, allocation, + * and freeing. Different policies may be implemented that support + * different architectures or different methods of mapping and + * allocation. + */ +class RegisterManagerPolicy +{ + public: + virtual void setParent(ComputeUnit *_cu) { cu = _cu; } + + // Execute: called by RenameStage::execute() + virtual void exec() = 0; + + // provide virtual to physical register mapping + virtual int mapVgpr(Wavefront* w, int vgprIndex) = 0; + virtual int mapSgpr(Wavefront* w, int sgprIndex) = 0; + + // check if requested number of vector registers can be allocated + virtual bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) = 0; + // check if requested number of scalar registers can be allocated + // machine ISA only + virtual bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) = 0; + + // allocate vector registers and reserve from register pool + virtual void allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) = 0; + + // free all remaining registers held by specified WF + virtual void freeRegisters(Wavefront *w) = 0; + + // stats + virtual void regStats() = 0; + + protected: + ComputeUnit *cu; +}; + +#endif // __REGISTER_MANAGER_POLICY_HH__ diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh index aaba1d340..75a098151 100644 --- a/src/gpu-compute/rr_scheduling_policy.hh +++ b/src/gpu-compute/rr_scheduling_policy.hh @@ -36,6 +36,7 @@ #include +#include "base/logging.hh" #include "gpu-compute/scheduling_policy.hh" #include "gpu-compute/wavefront.hh" diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc new file mode 100644 index 000000000..c8823b8a6 --- /dev/null +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2016-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos
+ */
+
+#include "gpu-compute/scalar_memory_pipeline.hh"
+
+#include "debug/GPUMem.hh"
+#include "debug/GPUReg.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p) :
+    computeUnit(nullptr), queueSize(p->scalar_mem_queue_size),
+    inflightStores(0), inflightLoads(0)
+{
+}
+
+void
+ScalarMemPipeline::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    _name = computeUnit->name() + ".ScalarMemPipeline";
+}
+
+void
+ScalarMemPipeline::exec()
+{
+    // find the oldest scalar request whose data has arrived
+    GPUDynInstPtr m = !returnedLoads.empty() ? returnedLoads.front() :
+        !returnedStores.empty() ?
returnedStores.front() : nullptr; + + Wavefront *w = nullptr; + + bool accessSrf = true; + // check the SRF to see if the operands of a load (or load component + // of an atomic) are accessible + if ((m) && (m->isLoad() || m->isAtomicRet())) { + w = m->wavefront(); + + accessSrf = + w->computeUnit->srf[w->simdId]-> + canScheduleWriteOperandsFromLoad(w, m); + } + + if ((!returnedStores.empty() || !returnedLoads.empty()) && + m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() && + accessSrf && + (computeUnit->shader->coissue_return || + computeUnit->scalarMemUnit.rdy())) { + + w = m->wavefront(); + + if (m->isLoad() || m->isAtomicRet()) { + w->computeUnit->srf[w->simdId]-> + scheduleWriteOperandsFromLoad(w, m); + } + + m->completeAcc(m); + + if (m->isLoad() || m->isAtomic()) { + returnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + } else { + returnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + + if (m->isStore() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm, + m->time, -1); + } + + if (m->isLoad() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->scalarMemToSrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->scalarMemUnit.set(m->time); + } + + // If pipeline has executed a global memory instruction + // execute global memory packets and issue global + // memory packets to DTLB + if (!issuedRequests.empty()) { + GPUDynInstPtr mp = issuedRequests.front(); + if (mp->isLoad() || mp->isAtomic()) { + + if (inflightLoads >= queueSize) { + return; + } else { + ++inflightLoads; + } + } else { + if (inflightStores >= queueSize) { + return; + } else { + ++inflightStores; + } + } + mp->initiateAcc(mp); + issuedRequests.pop(); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId); + } +} + +void +ScalarMemPipeline::regStats() +{ +} diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh new file mode 100644 index 000000000..1944477cf --- /dev/null +++ b/src/gpu-compute/scalar_memory_pipeline.hh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos + */ + +#ifndef __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__ +#define __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__ + +#include +#include + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file scalar_memory_pipeline.hh + * + * The scalar memory pipeline issues global memory packets + * from the scalar ALU to the DTLB and L1 Scalar Data Cache. + * The exec() method of the memory packet issues + * the packet to the DTLB if there is space available in the return fifo. + * This exec() method also retires previously issued loads and stores that have + * returned from the memory sub-system. + */ + +class ComputeUnit; + +class ScalarMemPipeline +{ + public: + ScalarMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + std::queue &getGMReqFIFO() { return issuedRequests; } + std::queue &getGMStRespFIFO() { return returnedStores; } + std::queue &getGMLdRespFIFO() { return returnedLoads; } + + bool + isGMLdRespFIFOWrRdy() const + { + return returnedLoads.size() < queueSize; + } + + bool + isGMStRespFIFOWrRdy() const + { + return returnedStores.size() < queueSize; + } + + bool + isGMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (issuedRequests.size() + pendReqs) < queueSize; + } + + const std::string &name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int queueSize; + + // Counters to track and limit the inflight scalar loads and stores + // generated by this memory pipeline. + int inflightStores; + int inflightLoads; + + // Scalar Memory Request FIFO: all global memory scalar requests + // are issued to this FIFO from the scalar memory pipelines + std::queue issuedRequests; + + // Scalar Store Response FIFO: all responses of global memory + // scalar stores are sent to this FIFO from L1 Scalar Data Cache + std::queue returnedStores; + + // Scalar Load Response FIFO: all responses of global memory + // scalar loads are sent to this FIFO from L1 Scalar Data Cache + std::queue returnedLoads; +}; + +#endif // __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/scalar_register_file.cc b/src/gpu-compute/scalar_register_file.cc new file mode 100644 index 000000000..150587676 --- /dev/null +++ b/src/gpu-compute/scalar_register_file.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#include "gpu-compute/scalar_register_file.hh" + +#include "base/logging.hh" +#include "debug/GPUSRF.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/ScalarRegisterFile.hh" + +ScalarRegisterFile::ScalarRegisterFile(const ScalarRegisterFileParams *p) + : RegisterFile(p) +{ + regFile.resize(numRegs(), 0); +} + +bool +ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) { + + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int pSgpr = + computeUnit->registerManager->mapSgpr(w, sgprIdx + j); + + if (regBusy(pSgpr)) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n", + w->wfDynId, ii->disassemble(), pSgpr); + w->numTimesBlockedDueRAWDependencies++; + } + return false; + } + } // nRegs + } // isScalar + } // operand + return true; +} + +void +ScalarRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) +{ + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isScalarRegister(i) && ii->isDstOperand(i)) { + + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = + computeUnit->registerManager->mapSgpr(w, sgprIdx + j); + + // mark the destination scalar register as busy + markReg(physReg, true); + } + } + } +} + +void +ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) +{ + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) { + int DWORDs = ii->getOperandSize(i) <= 4 ? 
1 + : ii->getOperandSize(i) / 4; + registerReads += DWORDs; + } + } + + if (!ii->isLoad() && !(ii->isAtomic() || ii->isMemSync())) { + Cycles delay(computeUnit->scalarPipeLength()); + Tick tickDelay = computeUnit->cyclesToTicks(delay); + + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isScalarRegister(i) && ii->isDstOperand(i)) { + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 + : ii->getOperandSize(i) / 4; + for (int j = 0; j < nRegs; j++) { + int physReg = computeUnit->registerManager-> + mapSgpr(w, sgprIdx + j); + enqRegFreeEvent(physReg, tickDelay); + } + + registerWrites += nRegs; + } + } + } +} + +void +ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii) +{ + assert(ii->isLoad() || ii->isAtomicRet()); + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isScalarRegister(i) && ii->isDstOperand(i)) { + + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = computeUnit->registerManager-> + mapSgpr(w, sgprIdx + j); + enqRegFreeEvent(physReg, computeUnit->clockPeriod()); + } + + registerWrites += nRegs; + } + } +} + +ScalarRegisterFile* +ScalarRegisterFileParams::create() +{ + return new ScalarRegisterFile(this); +} diff --git a/src/gpu-compute/scalar_register_file.hh b/src/gpu-compute/scalar_register_file.hh new file mode 100644 index 000000000..8002334b3 --- /dev/null +++ b/src/gpu-compute/scalar_register_file.hh @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#ifndef __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__ +#define __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__ + +#include "arch/gpu_isa.hh" +#include "base/statistics.hh" +#include "base/trace.hh" +#include "base/types.hh" +#include "debug/GPUSRF.hh" +#include "gpu-compute/register_file.hh" +#include "gpu-compute/wavefront.hh" + +struct ScalarRegisterFileParams; + +// Scalar Register File +class ScalarRegisterFile : public RegisterFile +{ + public: + using ScalarRegU32 = TheGpuISA::ScalarRegU32; + + ScalarRegisterFile(const ScalarRegisterFileParams *p); + ~ScalarRegisterFile() { } + + virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override; + virtual void scheduleWriteOperands(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override; + + void + setParent(ComputeUnit *_computeUnit) override + { + RegisterFile::setParent(_computeUnit); + } + + // Read a register that is writeable (e.g., a DST operand) + ScalarRegU32& + readWriteable(int regIdx) + { + return regFile[regIdx]; + } + + // Read a register that is not writeable (e.g., src operand) + ScalarRegU32 + read(int regIdx) const + { + return regFile[regIdx]; + } + + // Write a register + void + write(int regIdx, ScalarRegU32 value) + { + regFile[regIdx] = value; + } + + void + printReg(Wavefront *wf, int regIdx) const + { + DPRINTF(GPUSRF, "WF[%d][%d]: Id%d s[%d] = %#x\n", wf->simdId, + wf->wfSlotId, wf->wfDynId, regIdx, regFile[regIdx]); + } + + private: + std::vector regFile; +}; + +#endif // __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 63ab8db7b..949eed155 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -33,24 +33,36 @@ #include "gpu-compute/schedule_stage.hh" +#include + +#include "debug/GPUSched.hh" +#include "debug/GPUVRF.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ScheduleStage::ScheduleStage(const ComputeUnitParams *p) - : numSIMDs(p->num_SIMDs), - numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes) +ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu) + : vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false), + scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false), + locMemBusRdy(false), locMemIssueRdy(false) { - for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + for (int j = 0; j < cu->numExeUnits(); ++j) { scheduler.emplace_back(p); } + wavesInSch.clear(); + schList.resize(cu->numExeUnits()); + for (auto &dq : schList) { + dq.clear(); + } } ScheduleStage::~ScheduleStage() { scheduler.clear(); - waveStatusList.clear(); + wavesInSch.clear(); + schList.clear(); } void @@ -59,90 +71,775 @@ ScheduleStage::init(ComputeUnit *cu) computeUnit = cu; _name = computeUnit->name() + ".ScheduleStage"; - for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + fatal_if(scheduler.size() != computeUnit->readyList.size(), + "Scheduler should have same number of entries as CU's readyList"); + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { scheduler[j].bindList(&computeUnit->readyList[j]); } - for (int j = 0; j < numSIMDs; ++j) { - 
waveStatusList.push_back(&computeUnit->waveStatusList[j]); + dispatchList = &computeUnit->dispatchList; + + assert(computeUnit->numVectorGlobalMemUnits == 1); + assert(computeUnit->numVectorSharedMemUnits == 1); +} + +void +ScheduleStage::exec() +{ + // Update readyList + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + // delete all ready wavefronts whose instruction buffers are now + // empty because the last instruction was executed + computeUnit->updateReadyList(j); + /** + * Remove any wave that already has an instruction present in SCH + * waiting for RF reads to complete. This prevents out of order + * execution within a wave. + */ + for (auto wIt = computeUnit->readyList.at(j).begin(); + wIt != computeUnit->readyList.at(j).end();) { + if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) { + *wIt = nullptr; + wIt = computeUnit->readyList.at(j).erase(wIt); + } else { + wIt++; + } + } + } + + // Attempt to add another wave for each EXE type to schList queues + // VMEM resources are iterated first, effectively giving priority + // to VMEM over VALU for scheduling read of operands to the RFs. + // Scalar Memory are iterated after VMEM + + // Iterate VMEM and SMEM + int firstMemUnit = computeUnit->firstMemUnit(); + int lastMemUnit = computeUnit->lastMemUnit(); + for (int j = firstMemUnit; j <= lastMemUnit; j++) { + int readyListSize = computeUnit->readyList[j].size(); + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + rdyListEmpty[j]++; + continue; + } + rdyListNotEmpty[j]++; + + // Pick a wave and attempt to add it to schList + Wavefront *w = scheduler[j].chooseWave(); + if (!addToSchList(j, w)) { + // For waves not added to schList, increment count of cycles + // this wave spends in SCH stage. + w->schCycles++; + addToSchListStalls[j]++; + } } - dispatchList = &computeUnit->dispatchList; + // Iterate everything else + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + // skip the VMEM resources + if (j >= firstMemUnit && j <= lastMemUnit) { + continue; + } + int readyListSize = computeUnit->readyList[j].size(); + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + rdyListEmpty[j]++; + continue; + } + rdyListNotEmpty[j]++; + + // Pick a wave and attempt to add it to schList + Wavefront *w = scheduler[j].chooseWave(); + if (!addToSchList(j, w)) { + // For waves not added to schList, increment count of cycles + // this wave spends in SCH stage. + w->schCycles++; + addToSchListStalls[j]++; + } + } + + // At this point, the schList queue per EXE type may contain + // multiple waves, in order of age (oldest to youngest). 
+ // Wave may be in RFBUSY, indicating they are waiting for registers + // to be read, or in RFREADY, indicating they are candidates for + // the dispatchList and execution + + // Iterate schList queues and check if any of the waves have finished + // reading their operands, moving those waves to RFREADY status + checkRfOperandReadComplete(); + + // Fill the dispatch list with the oldest wave of each EXE type that + // is ready to execute + // Wave is picked if status in schList is RFREADY and it passes resource + // ready checks similar to those currently in SCB + fillDispatchList(); + + // Resource arbitration on waves in dispatchList + // Losing waves are re-inserted to the schList at a location determined + // by wave age + + // Arbitrate access to the VRF->LDS bus + arbitrateVrfToLdsBus(); + + // Schedule write operations to the register files + scheduleRfDestOperands(); + + // Lastly, reserve resources for waves that are ready to execute. + reserveResources(); +} + +void +ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s, + Wavefront *w) +{ + dispatchList->at(unitId).first = w; + dispatchList->at(unitId).second = s; +} + +bool +ScheduleStage::schedRfWrites(int exeType, Wavefront *w) +{ + GPUDynInstPtr ii = w->instructionBuffer.front(); + assert(ii); + bool accessVrfWr = true; + if (!ii->isScalar()) { + accessVrfWr = + computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii); + } + bool accessSrfWr = + computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii); + bool accessRf = accessVrfWr && accessSrfWr; + if (accessRf) { + if (!ii->isScalar()) { + computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii); + } + computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii); + return true; + } else { + rfAccessStalls[SCH_RF_ACCESS_NRDY]++; + if (!accessSrfWr) { + rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++; + } + if (!accessVrfWr) { + rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++; + } + + // Increment stall counts for WF + w->schStalls++; + w->schRfAccessStalls++; + } + return false; +} + +void +ScheduleStage::scheduleRfDestOperands() +{ + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + if (!dispatchList->at(j).first) { + continue; + } + // get the wave on dispatch list and attempt to allocate write + // resources in the RFs + Wavefront *w = dispatchList->at(j).first; + if (!schedRfWrites(j, w)) { + reinsertToSchList(j, w); + doDispatchListTransition(j, EMPTY); + // if this is a flat inst, also transition the LM pipe to empty + // Note: since FLAT/LM arbitration occurs before scheduling + // destination operands to the RFs, it is possible that a LM + // instruction lost arbitration, but would have been able to + // pass the RF destination operand check here, and execute + // instead of the FLAT. 
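// A standalone sketch of the check-then-commit pattern schedRfWrites() uses
// above: destination-register writes are reserved only if *both* the VRF and
// the SRF can accept them, so a wave never holds a partial reservation.
// RegFileStub and its free-slot counter are simplified stand-ins, not the
// gem5 RegisterFile API.
struct RegFileStub
{
    int freeWriteSlots = 0;
    bool canReserveWrites(int n) const { return freeWriteSlots >= n; }
    void reserveWrites(int n) { freeWriteSlots -= n; }
};

// Returns true only when the reservation was committed in both files.
inline bool
tryScheduleWrites(RegFileStub &vrf, RegFileStub &srf, bool isScalarInst,
                  int vecDsts, int sclDsts)
{
    bool vrfOk = isScalarInst || vrf.canReserveWrites(vecDsts);
    bool srfOk = srf.canReserveWrites(sclDsts);
    if (!(vrfOk && srfOk)) {
        return false;  // caller re-inserts the wave into schList and stalls
    }
    if (!isScalarInst) {
        vrf.reserveWrites(vecDsts);
    }
    srf.reserveWrites(sclDsts);
    return true;
}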
+ if (w->instructionBuffer.front()->isFlat()) { + assert(dispatchList->at(w->localMem).second == SKIP); + doDispatchListTransition(w->localMem, EMPTY); + } + } + } +} + +bool +ScheduleStage::addToSchList(int exeType, Wavefront *w) +{ + // Attempt to add the wave to the schList if the VRF can support the + // wave's next instruction + GPUDynInstPtr ii = w->instructionBuffer.front(); + assert(ii); + bool accessVrf = true; + if (!ii->isScalar()) { + accessVrf = + computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii); + } + bool accessSrf = + computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii); + // If RFs can support instruction, add to schList in RFBUSY state, + // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands + // to the VRF + bool accessRf = accessVrf && accessSrf; + if (accessRf) { + DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n", + exeType, w->simdId, w->wfDynId, + ii->seqNum(), ii->disassemble()); + + computeUnit->insertInPipeMap(w); + wavesInSch.emplace(w->wfDynId); + schList.at(exeType).push_back(std::make_pair(w, RFBUSY)); + if (w->isOldestInstWaitcnt()) { + w->setStatus(Wavefront::S_WAITCNT); + } + if (!ii->isScalar()) { + computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii); + } + computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii); + + DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n", + exeType, w->simdId, w->wfDynId, + ii->seqNum(), ii->disassemble()); + return true; + } else { + // Number of stall cycles due to RF access denied + rfAccessStalls[SCH_RF_ACCESS_NRDY]++; + // Count number of denials due to each reason + // Multiple items may contribute to the denied request + if (!accessVrf) { + rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++; + } + if (!accessSrf) { + rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++; + } + + // Increment stall counts for WF + w->schStalls++; + w->schRfAccessStalls++; + DPRINTF(GPUSched, "schList[%d]: Could not add: " + "SIMD[%d] WV[%d]: %d: %s\n", + exeType, w->simdId, w->wfDynId, + ii->seqNum(), ii->disassemble()); + } + return false; +} + +void +ScheduleStage::reinsertToSchList(int exeType, Wavefront *w) +{ + // Insert wave w into schList for specified exeType. 
+ // Wave is inserted in age order, with oldest wave being at the + // front of the schList + auto schIter = schList.at(exeType).begin(); + while (schIter != schList.at(exeType).end() + && schIter->first->wfDynId < w->wfDynId) { + schIter++; + } + schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY)); +} + +void +ScheduleStage::checkMemResources() +{ + // Check for resource availability in the next cycle + scalarMemBusRdy = false; + scalarMemIssueRdy = false; + // check if there is a SRF->Global Memory bus available and + if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) { + scalarMemBusRdy = true; + } + // check if we can issue a scalar memory instruction + if (computeUnit->scalarMemUnit.rdy(Cycles(1))) { + scalarMemIssueRdy = true; + } + + glbMemBusRdy = false; + glbMemIssueRdy = false; + // check if there is a VRF->Global Memory bus available + if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) { + glbMemBusRdy = true; + } + // check if we can issue a Global memory instruction + if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) { + glbMemIssueRdy = true; + } + + locMemBusRdy = false; + locMemIssueRdy = false; + // check if there is a VRF->LDS bus available + if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) { + locMemBusRdy = true; + } + // check if we can issue a LDS instruction + if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) { + locMemIssueRdy = true; + } +} + +bool +ScheduleStage::dispatchReady(Wavefront *w) +{ + vectorAluRdy = false; + scalarAluRdy = false; + // check for available vector/scalar ALUs in the next cycle + if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) { + vectorAluRdy = true; + } + if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) { + scalarAluRdy = true; + } + GPUDynInstPtr ii = w->instructionBuffer.front(); + + if (ii->isNop()) { + // S_NOP requires SALU. V_NOP requires VALU. + // TODO: Scalar NOP does not require SALU in hardware, + // and is executed out of IB directly. 
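// A standalone sketch of the age-ordered insertion reinsertToSchList() does
// above: a wave that lost arbitration re-enters its per-execution-unit queue
// in front of every younger wave (larger wfDynId), so the oldest wave keeps
// its scheduling priority. AgedWaveStub and SchStatusStub are simplified
// stand-ins for the gem5 Wavefront and SCH_STATUS types.
#include <cstdint>
#include <deque>
#include <utility>

struct AgedWaveStub { uint64_t wfDynId; };
enum SchStatusStub { STUB_RFBUSY, STUB_RFREADY };

using SchQueueStub = std::deque<std::pair<AgedWaveStub*, SchStatusStub>>;

inline void
reinsertByAge(SchQueueStub &queue, AgedWaveStub *w)
{
    auto it = queue.begin();
    while (it != queue.end() && it->first->wfDynId < w->wfDynId) {
        ++it;
    }
    queue.insert(it, std::make_pair(w, STUB_RFREADY));
}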
+ if (ii->isScalar() && !scalarAluRdy) { + dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + return false; + } else if (!ii->isScalar() && !vectorAluRdy) { + dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; + return false; + } + } else if (ii->isEndOfKernel()) { + // EndPgm instruction + if (ii->isScalar() && !scalarAluRdy) { + dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + return false; + } + } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) { + // Barrier, Branch, or ALU instruction + if (ii->isScalar() && !scalarAluRdy) { + dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + return false; + } else if (!ii->isScalar() && !vectorAluRdy) { + dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; + return false; + } + } else if (!ii->isScalar() && ii->isGlobalMem()) { + // Vector Global Memory instruction + bool rdy = true; + if (!glbMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++; + } + if (!glbMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++; + } + if (!rdy) { + return false; + } + } else if (ii->isScalar() && ii->isGlobalMem()) { + // Scalar Global Memory instruction + bool rdy = true; + if (!scalarMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++; + } + if (!scalarMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->scalarMemoryPipe. + isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe + + w->scalarWrGmReqsInPipe)) { + rdy = false; + dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++; + } + if (!rdy) { + return false; + } + } else if (!ii->isScalar() && ii->isLocalMem()) { + // Vector Local Memory instruction + bool rdy = true; + if (!locMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++; + } + if (!locMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { + rdy = false; + dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++; + } + if (!rdy) { + return false; + } + } else if (!ii->isScalar() && ii->isFlat()) { + // Vector Flat memory instruction + bool rdy = true; + if (!glbMemIssueRdy || !locMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++; + } + if (!glbMemBusRdy || !locMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++; + } + if (!computeUnit->localMemoryPipe. 
+ isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++; + } + if (!rdy) { + return false; + } + } else { + panic("%s: unknown instr checked for readiness", ii->disassemble()); + return false; + } + dispNrdyStalls[SCH_RDY]++; + return true; } void -ScheduleStage::arbitrate() -{ - // iterate over all Memory pipelines - for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) { - if (dispatchList->at(j).first) { - Wavefront *waveToMemPipe = dispatchList->at(j).first; - // iterate over all execution pipelines - for (int i = 0; i < numSIMDs + numMemUnits; ++i) { - if ((i != j) && (dispatchList->at(i).first)) { - Wavefront *waveToExePipe = dispatchList->at(i).first; - // if the two selected wavefronts are mapped to the same - // SIMD unit then they share the VRF - if (waveToMemPipe->simdId == waveToExePipe->simdId) { - int simdId = waveToMemPipe->simdId; - // Read VRF port arbitration: - // If there are read VRF port conflicts between the - // a memory and another instruction we drop the other - // instruction. We don't need to check for write VRF - // port conflicts because the memory instruction either - // does not need to write to the VRF (store) or will - // write to the VRF when the data comes back (load) in - // which case the arbiter of the memory pipes will - // resolve any conflicts - if (computeUnit->vrf[simdId]-> - isReadConflict(waveToMemPipe->wfSlotId, - waveToExePipe->wfSlotId)) { - // FIXME: The "second" member variable is never - // used in the model. I am setting it to READY - // simply to follow the protocol of setting it - // when the WF has an instruction ready to issue - waveStatusList[simdId]->at(waveToExePipe->wfSlotId) - .second = READY; - - dispatchList->at(i).first = nullptr; - dispatchList->at(i).second = EMPTY; - break; - } +ScheduleStage::fillDispatchList() +{ + // update execution resource status + checkMemResources(); + // iterate execution resources + for (int j = 0; j < computeUnit->numExeUnits(); j++) { + assert(dispatchList->at(j).second == EMPTY); + + // iterate waves in schList to pick one for dispatch + auto schIter = schList.at(j).begin(); + bool dispatched = false; + while (schIter != schList.at(j).end()) { + // only attempt to dispatch if status is RFREADY + if (schIter->second == RFREADY) { + // Check if this wave is ready for dispatch + bool dispRdy = dispatchReady(schIter->first); + if (!dispatched && dispRdy) { + // No other wave has been dispatched for this exe + // resource, and this wave is ready. Place this wave + // on dispatchList and make it ready for execution + // next cycle. + + // Acquire a coalescer token if it is a global mem + // operation. 
+ GPUDynInstPtr mp = schIter->first-> + instructionBuffer.front(); + if (!mp->isMemSync() && !mp->isScalar() && + (mp->isGlobalMem() || mp->isFlat())) { + computeUnit->globalMemoryPipe.acqCoalescerToken(mp); + } + + doDispatchListTransition(j, EXREADY, schIter->first); + DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: " + "EMPTY->EXREADY\n", j); + schIter->first = nullptr; + schIter = schList.at(j).erase(schIter); + dispatched = true; + } else { + // Either another wave has been dispatched, or this wave + // was not ready, so it is stalled this cycle + schIter->first->schStalls++; + if (!dispRdy) { + // not ready for dispatch, increment stall stat + schIter->first->schResourceStalls++; } + // Examine next wave for this resource + schIter++; } + } else { + // Wave not in RFREADY, try next wave + schIter++; } } + + // Increment stall count if no wave sent to dispatchList for + // current execution resource + if (!dispatched) { + schListToDispListStalls[j]++; + } else { + schListToDispList[j]++; + } } } void -ScheduleStage::exec() +ScheduleStage::arbitrateVrfToLdsBus() { - for (int j = 0; j < numSIMDs + numMemUnits; ++j) { - uint32_t readyListSize = computeUnit->readyList[j].size(); + // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops + // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus + // and a VRF->LDS bus. In GFx9, this is not the case. - // If no wave is ready to be scheduled on the execution resource - // then skip scheduling for this execution resource - if (!readyListSize) { - continue; - } + // iterate the GM pipelines + for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) { + // get the GM pipe index in the dispatchList + int gm_exe_unit = computeUnit->firstMemUnit() + i; + // get the wave in the dispatchList + Wavefront *w = dispatchList->at(gm_exe_unit).first; + // If the WF is valid, ready to execute, and the instruction + // is a flat access, arbitrate with the WF's assigned LM pipe + if (w && dispatchList->at(gm_exe_unit).second == EXREADY && + w->instructionBuffer.front()->isFlat()) { + // If the associated LM pipe also has a wave selected, block + // that wave and let the Flat instruction issue. The WF in the + // LM pipe is added back to the schList for consideration next + // cycle. + if (dispatchList->at(w->localMem).second == EXREADY) { + reinsertToSchList(w->localMem, + dispatchList->at(w->localMem).first); + // Increment stall stats for LDS-VRF arbitration + ldsBusArbStalls++; + dispatchList->at(w->localMem).first->schLdsArbStalls++; + } + // With arbitration of LM pipe complete, transition the + // LM pipe to SKIP state in the dispatchList to inform EX stage + // that a Flat instruction is executing next cycle + doDispatchListTransition(w->localMem, SKIP, w); + DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: " + "EXREADY->SKIP\n", w->localMem); + } + } +} + +void +ScheduleStage::checkRfOperandReadComplete() +{ + // Iterate the schList queues and check if operand reads + // have completed in the RFs. If so, mark the wave as ready for + // selection for dispatchList + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + for (auto &p : schList.at(j)) { + Wavefront *w = p.first; + assert(w); - Wavefront *waveToBeDispatched = scheduler[j].chooseWave(); - dispatchList->at(j).first = waveToBeDispatched; - waveToBeDispatched->updateResources(); - dispatchList->at(j).second = FILLED; + // Increment the number of cycles the wave spends in the + // SCH stage, since this loop visits every wave in SCH. 
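// A standalone sketch of the selection rule fillDispatchList() applies
// above: walk the age-ordered schList of one execution unit and promote the
// first (oldest) wave that is RFREADY and passes the resource check; every
// other ready-but-blocked wave just accumulates a stall cycle. The types are
// simplified stand-ins for the gem5 classes.
#include <cstdint>
#include <deque>
#include <functional>
#include <utility>

struct DispWaveStub { uint64_t wfDynId; uint64_t schStalls = 0; };
enum DispStatusStub { DISP_RFBUSY, DISP_RFREADY };

inline DispWaveStub *
pickOldestReady(std::deque<std::pair<DispWaveStub*, DispStatusStub>> &schQ,
                const std::function<bool(DispWaveStub*)> &dispatchReady)
{
    DispWaveStub *picked = nullptr;
    for (auto it = schQ.begin(); it != schQ.end();) {
        if (it->second != DISP_RFREADY) {
            ++it;                      // still waiting on RF operand reads
        } else if (!picked && dispatchReady(it->first)) {
            picked = it->first;        // oldest eligible wave wins
            it = schQ.erase(it);       // it moves on toward the dispatchList
        } else {
            it->first->schStalls++;    // ready but blocked this cycle
            ++it;
        }
    }
    return picked;
}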
+ w->schCycles++; - waveStatusList[waveToBeDispatched->simdId]->at( - waveToBeDispatched->wfSlotId).second = BLOCKED; + GPUDynInstPtr ii = w->instructionBuffer.front(); + bool vrfRdy = true; + if (!ii->isScalar()) { + vrfRdy = + computeUnit->vrf[w->simdId]->operandReadComplete(w, ii); + } + bool srfRdy = + computeUnit->srf[w->simdId]->operandReadComplete(w, ii); + bool operandsReady = vrfRdy && srfRdy; + if (operandsReady) { + DPRINTF(GPUSched, + "schList[%d]: WV[%d] operands ready for: %d: %s\n", + j, w->wfDynId, ii->seqNum(), ii->disassemble()); + DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n", + j, w->wfDynId); + p.second = RFREADY; + } else { + DPRINTF(GPUSched, + "schList[%d]: WV[%d] operands not ready for: %d: %s\n", + j, w->wfDynId, ii->seqNum(), ii->disassemble()); + + // operands not ready yet, increment SCH stage stats + // aggregate to all wavefronts on the CU + p.second = RFBUSY; + + // Increment stall stats + w->schStalls++; + w->schOpdNrdyStalls++; - assert(computeUnit->readyList[j].size() == readyListSize - 1); + opdNrdyStalls[SCH_RF_OPD_NRDY]++; + if (!vrfRdy) { + opdNrdyStalls[SCH_VRF_OPD_NRDY]++; + } + if (!srfRdy) { + opdNrdyStalls[SCH_SRF_OPD_NRDY]++; + } + } + } } - // arbitrate over all shared resources among instructions being issued - // simultaneously - arbitrate(); +} + +void +ScheduleStage::reserveResources() +{ + std::vector exeUnitReservations; + exeUnitReservations.resize(computeUnit->numExeUnits(), false); + + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + Wavefront *dispatchedWave = dispatchList->at(j).first; + if (dispatchedWave) { + DISPATCH_STATUS s = dispatchList->at(j).second; + if (s == EMPTY) { + continue; + } else if (s == EXREADY) { + // Wave is ready for execution + std::vector execUnitIds = + dispatchedWave->reserveResources(); + GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front(); + + if (!ii->isScalar()) { + computeUnit->vrf[dispatchedWave->simdId]-> + dispatchInstruction(ii); + } + computeUnit->srf[dispatchedWave->simdId]-> + dispatchInstruction(ii); + + std::stringstream ss; + for (auto id : execUnitIds) { + ss << id << " "; + } + DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s" + " Reserving ExeRes[ %s]\n", + j, dispatchedWave->simdId, dispatchedWave->wfDynId, + ii->seqNum(), ii->disassemble(), ss.str()); + // mark the resources as reserved for this cycle + for (auto execUnitId : execUnitIds) { + panic_if(exeUnitReservations.at(execUnitId), + "Execution unit %d is reserved!!!\n" + "SIMD[%d] WV[%d]: %d: %s", + execUnitId, dispatchedWave->simdId, + dispatchedWave->wfDynId, + ii->seqNum(), ii->disassemble()); + exeUnitReservations.at(execUnitId) = true; + } + + // If wavefront::reserveResources reserved multiple resources, + // then we're executing a flat memory instruction. This means + // that we've reserved a global and local memory unit. Thus, + // we need to mark the latter execution unit as not available. + if (execUnitIds.size() > 1) { + int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem; + assert(dispatchList->at(lm_exec_unit).second == SKIP); + } + } else if (s == SKIP) { + // Shared Memory pipe reserved for FLAT instruction. 
+ // Verify the GM pipe for this wave is ready to execute + // and the wave in the GM pipe is the same as the wave + // in the LM pipe + int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem; + assert(dispatchList->at(gm_exec_unit).first->wfDynId == + dispatchedWave->wfDynId); + assert(dispatchList->at(gm_exec_unit).second == EXREADY); + } + } + } +} + +void +ScheduleStage::deleteFromSch(Wavefront *w) +{ + wavesInSch.erase(w->wfDynId); } void ScheduleStage::regStats() { + rdyListNotEmpty + .init(computeUnit->numExeUnits()) + .name(name() + ".rdy_list_not_empty") + .desc("number of cycles one or more wave on ready list per " + "execution resource") + ; + + rdyListEmpty + .init(computeUnit->numExeUnits()) + .name(name() + ".rdy_list_empty") + .desc("number of cycles no wave on ready list per " + "execution resource") + ; + + addToSchListStalls + .init(computeUnit->numExeUnits()) + .name(name() + ".sch_list_add_stalls") + .desc("number of cycles a wave is not added to schList per " + "execution resource when ready list is not empty") + ; + + schListToDispList + .init(computeUnit->numExeUnits()) + .name(name() + ".sch_list_to_disp_list") + .desc("number of cycles a wave is added to dispatchList per " + "execution resource") + ; + + schListToDispListStalls + .init(computeUnit->numExeUnits()) + .name(name() + ".sch_list_to_disp_list_stalls") + .desc("number of cycles no wave is added to dispatchList per " + "execution resource") + ; + + // Operand Readiness Stall Cycles + opdNrdyStalls + .init(SCH_RF_OPD_NRDY_CONDITIONS) + .name(name() + ".opd_nrdy_stalls") + .desc("number of stalls in SCH due to operands not ready") + ; + opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF")); + opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF")); + opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF")); + + // dispatchReady Stall Cycles + dispNrdyStalls + .init(SCH_NRDY_CONDITIONS) + .name(name() + ".disp_nrdy_stalls") + .desc("number of stalls in SCH due to resource not ready") + ; + dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu")); + dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu")); + dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY, + csprintf("VectorMemIssue")); + dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY, + csprintf("VectorMemBusBusy")); + dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY, + csprintf("VectorMemCoalescer")); + dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd")); + dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY, + csprintf("ScalarMemIssue")); + dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY, + csprintf("ScalarMemBusBusy")); + dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY, + csprintf("ScalarMemFIFO")); + dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY, + csprintf("LocalMemIssue")); + dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY, + csprintf("LocalMemBusBusy")); + dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY, + csprintf("LocalMemFIFO")); + dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY, + csprintf("FlatMemIssue")); + dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY, + csprintf("FlatMemBusBusy")); + dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY, + csprintf("FlatMemCoalescer")); + dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY, + csprintf("FlatMemFIFO")); + dispNrdyStalls.subname(SCH_RDY, csprintf("Ready")); + + // RF Access Stall Cycles + rfAccessStalls + .init(SCH_RF_ACCESS_NRDY_CONDITIONS) + .name(name() + ".rf_access_stalls") + .desc("number of stalls due to RF access denied") + ; + 
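// The stall counters above all follow one pattern: a vector stat sized by an
// enum of reasons, with subname() attaching a label to each bin. A plain C++
// analogue of that bookkeeping (the real code uses gem5's Stats::Vector, not
// this stand-in):
#include <array>
#include <cstdint>
#include <string>

enum RfStallReasonStub { VRF_RD, VRF_WR, SRF_RD, SRF_WR, ANY, NUM_REASONS };

struct LabeledCountersStub
{
    std::array<uint64_t, NUM_REASONS> count{};
    std::array<std::string, NUM_REASONS> label{
        {"VrfRd", "VrfWr", "SrfRd", "SrfWr", "Any"}};

    void record(RfStallReasonStub reason) { ++count[reason]; }
};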
rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd")); + rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr")); + rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd")); + rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr")); + rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any")); + + // Stall cycles due to wave losing LDS bus arbitration + ldsBusArbStalls + .name(name() + ".lds_bus_arb_stalls") + .desc("number of stalls due to VRF->LDS bus conflicts") + ; } diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh index ee2dd14f7..98519701a 100644 --- a/src/gpu-compute/schedule_stage.hh +++ b/src/gpu-compute/schedule_stage.hh @@ -34,6 +34,9 @@ #ifndef __SCHEDULE_STAGE_HH__ #define __SCHEDULE_STAGE_HH__ +#include +#include +#include #include #include @@ -54,40 +57,169 @@ struct ComputeUnitParams; class ScheduleStage { public: - ScheduleStage(const ComputeUnitParams *params); + ScheduleStage(const ComputeUnitParams *params, ComputeUnit *cu); ~ScheduleStage(); void init(ComputeUnit *cu); void exec(); - void arbitrate(); + // Stats related variables and methods std::string name() { return _name; } + enum SchNonRdyType { + SCH_SCALAR_ALU_NRDY, + SCH_VECTOR_ALU_NRDY, + SCH_VECTOR_MEM_ISSUE_NRDY, + SCH_VECTOR_MEM_BUS_BUSY_NRDY, + SCH_VECTOR_MEM_COALESCER_NRDY, + SCH_VECTOR_MEM_REQS_NRDY, + SCH_CEDE_SIMD_NRDY, + SCH_SCALAR_MEM_ISSUE_NRDY, + SCH_SCALAR_MEM_BUS_BUSY_NRDY, + SCH_SCALAR_MEM_FIFO_NRDY, + SCH_LOCAL_MEM_ISSUE_NRDY, + SCH_LOCAL_MEM_BUS_BUSY_NRDY, + SCH_LOCAL_MEM_FIFO_NRDY, + SCH_FLAT_MEM_ISSUE_NRDY, + SCH_FLAT_MEM_BUS_BUSY_NRDY, + SCH_FLAT_MEM_COALESCER_NRDY, + SCH_FLAT_MEM_REQS_NRDY, + SCH_FLAT_MEM_FIFO_NRDY, + SCH_RDY, + SCH_NRDY_CONDITIONS + }; + enum schopdnonrdytype_e { + SCH_VRF_OPD_NRDY, + SCH_SRF_OPD_NRDY, + SCH_RF_OPD_NRDY, + SCH_RF_OPD_NRDY_CONDITIONS + }; + enum schrfaccessnonrdytype_e { + SCH_VRF_RD_ACCESS_NRDY, + SCH_VRF_WR_ACCESS_NRDY, + SCH_SRF_RD_ACCESS_NRDY, + SCH_SRF_WR_ACCESS_NRDY, + SCH_RF_ACCESS_NRDY, + SCH_RF_ACCESS_NRDY_CONDITIONS + }; + void regStats(); + // Called by ExecStage to inform SCH of instruction execution + void deleteFromSch(Wavefront *w); + + // Schedule List status + enum SCH_STATUS + { + RFBUSY = 0, // RF busy reading operands + RFREADY, // ready for exec + }; + private: ComputeUnit *computeUnit; - uint32_t numSIMDs; - uint32_t numMemUnits; - // Each execution resource will have its own // scheduler and a dispatch list std::vector scheduler; - // Stores the status of waves. A READY implies the - // wave is ready to be scheduled this cycle and - // is already present in the readyList - std::vector>*> - waveStatusList; - // List of waves which will be dispatched to - // each execution resource. A FILLED implies - // dispatch list is non-empty and - // execution unit has something to execute - // this cycle. Currently, the dispatch list of + // each execution resource. + // Currently, the dispatch list of // an execution resource can hold only one wave because // an execution resource can execute only one wave in a cycle. 
std::vector> *dispatchList; + // Stats + + // Number of cycles with empty (or not empty) readyList, per execution + // resource, when the CU is active (not sleeping) + Stats::Vector rdyListEmpty; + Stats::Vector rdyListNotEmpty; + + // Number of cycles, per execution resource, when at least one wave + // was on the readyList and picked by scheduler, but was unable to be + // added to the schList, when the CU is active (not sleeping) + Stats::Vector addToSchListStalls; + + // Number of cycles, per execution resource, when a wave is selected + // as candidate for dispatchList from schList + // Note: may be arbitrated off dispatchList (e.g., LDS arbitration) + Stats::Vector schListToDispList; + + // Per execution resource stat, incremented once per cycle if no wave + // was selected as candidate for dispatch and moved to dispatchList + Stats::Vector schListToDispListStalls; + + // Number of times a wave is selected by the scheduler but cannot + // be added to the schList due to register files not being able to + // support reads or writes of operands. RF_ACCESS_NRDY condition is always + // incremented if at least one read/write not supported, other + // conditions are incremented independently from each other. + Stats::Vector rfAccessStalls; + + // Number of times a wave is executing FLAT instruction and + // forces another wave occupying its required local memory resource + // to be deselected for execution, and placed back on schList + Stats::Scalar ldsBusArbStalls; + + // Count of times VRF and/or SRF blocks waves on schList from + // performing RFBUSY->RFREADY transition + Stats::Vector opdNrdyStalls; + + // Count of times resource required for dispatch is not ready and + // blocks wave in RFREADY state on schList from potentially moving + // to dispatchList + Stats::Vector dispNrdyStalls; + std::string _name; + + // called by exec() to add a wave to schList if the RFs can support it + bool addToSchList(int exeType, Wavefront *w); + // re-insert a wave to schList if wave lost arbitration + // wave is inserted such that age order (oldest to youngest) is preserved + void reinsertToSchList(int exeType, Wavefront *w); + // check waves in schList to see if RF reads complete + void checkRfOperandReadComplete(); + // check execution resources for readiness + bool vectorAluRdy; + bool scalarAluRdy; + bool scalarMemBusRdy; + bool scalarMemIssueRdy; + bool glbMemBusRdy; + bool glbMemIssueRdy; + bool locMemBusRdy; + bool locMemIssueRdy; + // check status of memory pipes and RF to Mem buses + void checkMemResources(); + // resource ready check called by fillDispatchList + bool dispatchReady(Wavefront *w); + // pick waves from schList and populate dispatchList with one wave + // per EXE resource type + void fillDispatchList(); + // arbitrate Shared Mem Pipe VRF/LDS bus for waves in dispatchList + void arbitrateVrfToLdsBus(); + // schedule destination operand writes to register files for waves in + // dispatchList + void scheduleRfDestOperands(); + // invoked by scheduleRfDestOperands to schedule RF writes for a wave + bool schedRfWrites(int exeType, Wavefront *w); + // reserve resources for waves surviving arbitration in dispatchList + void reserveResources(); + + void doDispatchListTransition(int unitId, DISPATCH_STATUS s, + Wavefront *w = nullptr); + + // Set tracking wfDynId for each wave present in schedule stage + // Used to allow only one instruction per wave in schedule + std::unordered_set wavesInSch; + + // List of waves (one list per exe resource) that are in schedule + // stage. 
Waves are added to this list after being selected by the scheduler
+    // from readyList. Waves are removed from this list and placed on
+    // dispatchList when their status reaches RFREADY.
+    // Waves are kept ordered by age for each resource, always favoring
+    // forward progress for the oldest wave.
+    // The maximum number of waves per resource can be determined by either
+    // the VRF/SRF availability or limits imposed by parameters (to be added)
+    // of the SCH stage or CU.
+    std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
 };
 
 #endif // __SCHEDULE_STAGE_HH__
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc
index 262378e2c..c4b9b9fb6 100644
--- a/src/gpu-compute/scoreboard_check_stage.cc
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -33,29 +33,23 @@
 
 #include "gpu-compute/scoreboard_check_stage.hh"
 
+#include "debug/GPUExec.hh"
+#include "debug/GPUSched.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"
 #include "params/ComputeUnit.hh"
 
 ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
-    : numSIMDs(p->num_SIMDs),
-      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
-      numShrMemPipes(p->num_shared_mem_pipes),
-      vectorAluInstAvail(nullptr),
-      lastGlbMemSimd(-1),
-      lastShrMemSimd(-1), glbMemInstAvail(nullptr),
-      shrMemInstAvail(nullptr)
 {
 }
 
 ScoreboardCheckStage::~ScoreboardCheckStage()
 {
     readyList.clear();
-    waveStatusList.clear();
-    shrMemInstAvail = nullptr;
-    glbMemInstAvail = nullptr;
 }
 
 void
@@ -64,102 +58,212 @@ ScoreboardCheckStage::init(ComputeUnit *cu)
     computeUnit = cu;
     _name = computeUnit->name() + ".ScoreboardCheckStage";
 
-    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
         readyList.push_back(&computeUnit->readyList[unitId]);
     }
-
-    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
-        waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
-    }
-
-    vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
-    glbMemInstAvail= &computeUnit->glbMemInstAvail;
-    shrMemInstAvail= &computeUnit->shrMemInstAvail;
 }
 
 void
-ScoreboardCheckStage::initStatistics()
+ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
 {
-    lastGlbMemSimd = -1;
-    lastShrMemSimd = -1;
-    *glbMemInstAvail = 0;
-    *shrMemInstAvail = 0;
-
-    for (int unitId = 0; unitId < numSIMDs; ++unitId)
-        vectorAluInstAvail->at(unitId) = false;
+    panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
+             "Instruction ready status %d is illegal!!!", rdyStatus);
+    stallCycles[rdyStatus]++;
 }
 
-void
-ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
+// Return true if this wavefront is ready
+// to execute an instruction of the specified type.
+// It also returns the reason (in rdyStatus) if the instruction is not
+// ready. Finally, it sets the execution resource type (in exeResType)
+// of the instruction, but only if the instruction is ready.
+bool
+ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
+                            int *exeResType, int wfSlot)
 {
-    if (curWave->instructionBuffer.empty())
-        return;
-
-    // track which vector SIMD unit has at least one WV with a vector
-    // ALU as the oldest instruction in its Instruction buffer
-    vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
-                                     curWave->isOldestInstALU();
-
-    // track how many vector SIMD units have at least one WV with a
-    // vector Global memory instruction as the oldest instruction
-    // in its Instruction buffer
-    if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
-         curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
-        *glbMemInstAvail <= 1) {
-        (*glbMemInstAvail)++;
-        lastGlbMemSimd = unitId;
+    /**
+     * The waitCnt checks have to be done BEFORE checking for the Instruction
+     * buffer empty condition. Otherwise, it will result in a deadlock if
+     * the last instruction in the Instruction buffer is a waitCnt: after
+     * executing the waitCnt, the Instruction buffer would be empty and the
+     * ready check logic would exit BEFORE checking for wait counters being
+     * satisfied.
+     */
+
+    // A waitCnt instruction has been dispatched or executed: the next
+    // instruction should be blocked until the waitCnts are satisfied.
+    if (w->getStatus() == Wavefront::S_WAITCNT) {
+        if (!w->waitCntsSatisfied()) {
+            *rdyStatus = NRDY_WAIT_CNT;
+            return false;
+        }
+    }
+
+    // Is the wave waiting at a barrier? Check this condition BEFORE checking
+    // for instruction buffer occupancy to avoid a deadlock when the barrier
+    // is the last instruction in the instruction buffer.
+    if (w->stalledAtBarrier) {
+        if (!computeUnit->AllAtBarrier(w->barrierId, w->barrierCnt,
+                computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
+            // Are all threads at the barrier?
+            *rdyStatus = NRDY_BARRIER_WAIT;
+            return false;
+        }
+        w->oldBarrierCnt = w->barrierCnt;
+        w->stalledAtBarrier = false;
+    }
+
+    // Check WF status: it has to be running
+    if (w->getStatus() == Wavefront::S_STOPPED ||
+        w->getStatus() == Wavefront::S_RETURNING ||
+        w->getStatus() == Wavefront::S_STALLED) {
+        *rdyStatus = NRDY_WF_STOP;
+        return false;
+    }
+
+    // Is the instruction buffer empty?
+    if (w->instructionBuffer.empty()) {
+        *rdyStatus = NRDY_IB_EMPTY;
+        return false;
+    }
+
+    // Check the next instruction from the instruction buffer
+    GPUDynInstPtr ii = w->nextInstr();
+    // The only instruction in the instruction buffer has already been
+    // dispatched; no need to check it again for readiness
+    if (!ii) {
+        *rdyStatus = NRDY_IB_EMPTY;
+        return false;
+    }
+
+    // The following code is very error prone and the entire process for
+    // checking readiness will be fixed eventually. In the meantime, let's
+    // make sure that we do not silently let an instruction type slip
+    // through this logic and always return not ready.
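// A standalone sketch of why the ordering of the checks above matters: the
// waitcnt and barrier tests must run before the "instruction buffer empty"
// test, otherwise a wave whose last buffered instruction was the waitcnt or
// barrier would be reported as empty and never re-examined. The stub below
// stands in for the relevant gem5 Wavefront state.
#include <deque>

struct ReadyCheckStub
{
    bool waitCntPending = false;     // S_WAITCNT with counters unsatisfied
    bool stalledAtBarrier = false;
    std::deque<int> instructionBuffer;
};

enum ReadyResultStub { STUB_RDY, STUB_NRDY_WAIT_CNT, STUB_NRDY_BARRIER,
                       STUB_NRDY_IB_EMPTY };

inline ReadyResultStub
readyCheckOrder(const ReadyCheckStub &w)
{
    if (w.waitCntPending) return STUB_NRDY_WAIT_CNT;    // checked first
    if (w.stalledAtBarrier) return STUB_NRDY_BARRIER;   // then the barrier
    if (w.instructionBuffer.empty()) return STUB_NRDY_IB_EMPTY;  // only then
    return STUB_RDY;
}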
+ if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || + ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || + ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat())) { + panic("next instruction: %s is of unknown type\n", ii->disassemble()); } - // track how many vector SIMD units have at least one WV with a - // vector shared memory (LDS) instruction as the oldest instruction - // in its Instruction buffer - // TODO: parametrize the limit of the LDS units - if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) && - lastShrMemSimd != unitId) { - (*shrMemInstAvail)++; - lastShrMemSimd = unitId; + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n", + computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble()); + + // Non-scalar (i.e., vector) instructions may use VGPRs + if (!ii->isScalar()) { + if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) { + *rdyStatus = NRDY_VGPR_NRDY; + return false; + } } + // Scalar and non-scalar instructions may use SGPR + if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) { + *rdyStatus = NRDY_SGPR_NRDY; + return false; + } + + // The hardware implicitly executes S_WAITCNT 0 before executing + // the S_ENDPGM instruction. Implementing this implicit S_WAITCNT. + // isEndOfKernel() is used to identify the S_ENDPGM instruction + // On identifying it, we do the following: + // 1. Wait for all older instruction to execute + // 2. Once all the older instruction are executed, we add a wait + // count for the executed instruction(s) to complete. + if (ii->isEndOfKernel()) { + // Waiting for older instruction to execute + if (w->instructionBuffer.front()->seqNum() != ii->seqNum()) { + *rdyStatus = NRDY_WAIT_CNT; + return false; + } + // Older instructions have executed, adding implicit wait count + w->setStatus(Wavefront::S_WAITCNT); + w->setWaitCnts(0, 0, 0); + if (!w->waitCntsSatisfied()) { + *rdyStatus = NRDY_WAIT_CNT; + return false; + } + } + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + w->simdId, w->wfSlotId, ii->disassemble()); + *exeResType = mapWaveToExeUnit(w); + *rdyStatus = INST_RDY; + return true; +} + +int +ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w) +{ + GPUDynInstPtr ii = w->nextInstr(); + assert(ii); + if (ii->isFlat()) { + /** + * NOTE: Flat memory ops requires both GM and LM resources. + * The simulator models consumption of both GM and LM + * resources in the schedule stage. At instruction execution time, + * after the aperture check is performed, only the GM or LM pipe + * is actually reserved by the timing model. The GM unit is returned + * here since Flat ops occupy the GM slot in the ready and dispatch + * lists. They also consume the LM slot in the dispatch list. 
+ */ + return w->globalMem; + } else if (ii->isLocalMem()) { + return w->localMem; + } else if (ii->isGlobalMem()) { + if (!ii->isScalar()) { + return w->globalMem; + } else { + return w->scalarMem; + } + } else if (ii->isBranch() || + ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg() || + ii->isReturn() || + ii->isEndOfKernel() || + ii->isNop() || + ii->isBarrier()) { + if (!ii->isScalar()) { + return w->simdId; + } else { + return w->scalarAluGlobalIdx; + } + } + panic("%s: unmapped to an execution resource", ii->disassemble()); + return computeUnit->numExeUnits(); } void ScoreboardCheckStage::exec() { - initStatistics(); - // reset the ready list for all execution units; it will be // constructed every cycle since resource availability may change - for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { + // Reset wavefront pointers to nullptr so clear() on the vector + // does not accidentally destruct the wavefront object + for (int i = 0; i < readyList[unitId]->size(); i++) { + readyList[unitId]->at(i) = nullptr; + } readyList[unitId]->clear(); } - - // iterate over the Wavefronts of all SIMD units - for (int unitId = 0; unitId < numSIMDs; ++unitId) { - for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) { + // iterate over all WF slots across all vector ALUs + for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) { + for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) { // reset the ready status of each wavefront - waveStatusList[unitId]->at(wvId).second = BLOCKED; - Wavefront *curWave = waveStatusList[unitId]->at(wvId).first; - collectStatistics(curWave, unitId); - - if (curWave->ready(Wavefront::I_ALU)) { - readyList[unitId]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_GLOBAL)) { - if (computeUnit->cedeSIMD(unitId, wvId)) { - continue; - } - - readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_SHARED)) { - readyList[computeUnit->ShrMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_FLAT)) { - readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_PRIVATE)) { - readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; + Wavefront *curWave = computeUnit->wfList[simdId][wfSlot]; + nonrdytype_e rdyStatus = NRDY_ILLEGAL; + int exeResType = -1; + // check WF readiness: If the WF's oldest + // instruction is ready to issue then add the WF to the ready list + if (ready(curWave, &rdyStatus, &exeResType, wfSlot)) { + assert(curWave->simdId == simdId); + DPRINTF(GPUSched, + "Adding to readyList[%d]: SIMD[%d] WV[%d]: %d: %s\n", + exeResType, + curWave->simdId, curWave->wfDynId, + curWave->nextInstr()->seqNum(), + curWave->nextInstr()->disassemble()); + readyList.at(exeResType)->push_back(curWave); } + collectStatistics(rdyStatus); } } } @@ -167,4 +271,16 @@ ScoreboardCheckStage::exec() void ScoreboardCheckStage::regStats() { + stallCycles + .init(NRDY_CONDITIONS) + .name(name() + ".stall_cycles") + .desc("number of cycles wave stalled in SCB") + ; + stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop")); + stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty")); + 
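// A standalone sketch of the mapping rule in mapWaveToExeUnit() above: the
// wave's oldest instruction selects exactly one execution-resource index,
// with Flat ops returning the wave's global-memory slot (the LM slot is
// reserved separately in SCH). InstKindStub and ExeUnitsStub are simplified
// stand-ins for the gem5 GPUDynInst and Wavefront fields.
enum class InstKindStub
{
    Flat, LocalMem, GlobalMem, ScalarMem, VectorAlu, ScalarAlu
};

struct ExeUnitsStub
{
    int simdId;         // vector ALU owned by this wave
    int scalarAluIdx;   // global index of its scalar ALU
    int globalMem;      // vector global-memory pipe slot
    int localMem;       // LDS pipe slot
    int scalarMem;      // scalar-memory pipe slot
};

inline int
mapToExeUnit(InstKindStub kind, const ExeUnitsStub &w)
{
    switch (kind) {
      case InstKindStub::Flat:      return w.globalMem;
      case InstKindStub::LocalMem:  return w.localMem;
      case InstKindStub::GlobalMem: return w.globalMem;
      case InstKindStub::ScalarMem: return w.scalarMem;
      case InstKindStub::VectorAlu: return w.simdId;
      case InstKindStub::ScalarAlu: return w.scalarAluIdx;
    }
    return -1;  // unreachable for valid kinds
}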
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt")); + stallCycles.subname(NRDY_BARRIER_WAIT, csprintf("BarrierWait")); + stallCycles.subname(NRDY_VGPR_NRDY, csprintf("VgprBusy")); + stallCycles.subname(NRDY_SGPR_NRDY, csprintf("SgprBusy")); + stallCycles.subname(INST_RDY, csprintf("InstrReady")); } diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh index 9f690d7b6..1e5695139 100644 --- a/src/gpu-compute/scoreboard_check_stage.hh +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -36,20 +36,17 @@ #include #include +#include #include #include +#include "sim/stats.hh" + class ComputeUnit; class Wavefront; struct ComputeUnitParams; -enum WAVE_STATUS -{ - BLOCKED = 0, - READY -}; - /* * Scoreboard check stage. * All wavefronts are analyzed to see if they are ready @@ -61,6 +58,18 @@ enum WAVE_STATUS class ScoreboardCheckStage { public: + enum nonrdytype_e { + NRDY_ILLEGAL, + NRDY_WF_STOP, + NRDY_IB_EMPTY, + NRDY_WAIT_CNT, + NRDY_BARRIER_WAIT, + NRDY_VGPR_NRDY, + NRDY_SGPR_NRDY, + INST_RDY, + NRDY_CONDITIONS + }; + ScoreboardCheckStage(const ComputeUnitParams* params); ~ScoreboardCheckStage(); void init(ComputeUnit *cu); @@ -71,31 +80,18 @@ class ScoreboardCheckStage void regStats(); private: - void collectStatistics(Wavefront *curWave, int unitId); - void initStatistics(); + void collectStatistics(nonrdytype_e rdyStatus); + int mapWaveToExeUnit(Wavefront *w); + bool ready(Wavefront *w, nonrdytype_e *rdyStatus, + int *exeResType, int wfSlot); ComputeUnit *computeUnit; - uint32_t numSIMDs; - uint32_t numMemUnits; - uint32_t numShrMemPipes; - - // flag per vector SIMD unit that is set when there is at least one - // WF that has a vector ALU instruction as the oldest in its - // Instruction Buffer - std::vector *vectorAluInstAvail; - int lastGlbMemSimd; - int lastShrMemSimd; - int *glbMemInstAvail; - int *shrMemInstAvail; // List of waves which are ready to be scheduled. // Each execution resource has a ready list std::vector*> readyList; - // Stores the status of waves. 
A READY implies the - // wave is ready to be scheduled this cycle and - // is already present in the readyList - std::vector>*> - waveStatusList; + // Stats + Stats::Vector stallCycles; std::string _name; }; diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 91f78a50a..4be2fbfbd 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -39,37 +39,63 @@ #include "base/chunk_generator.hh" #include "debug/GPUDisp.hh" #include "debug/GPUMem.hh" -#include "debug/HSAIL.hh" +#include "debug/GPUShader.hh" +#include "debug/GPUWgLatency.hh" #include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_command_processor.hh" #include "gpu-compute/gpu_static_inst.hh" -#include "gpu-compute/qstruct.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/wavefront.hh" #include "mem/packet.hh" #include "mem/ruby/system/RubySystem.hh" #include "sim/sim_exit.hh" -Shader::Shader(const Params *p) - : ClockedObject(p), clock(p->clk_domain->clockPeriod()), - cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer), - tickEvent([this]{ processTick(); }, "Shader tick", - false, Event::CPU_Tick_Pri), - timingSim(p->timing), hsail_mode(SIMT), - impl_kern_boundary_sync(p->impl_kern_boundary_sync), - separate_acquire_release(p->separate_acquire_release), coissue_return(1), - trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), - globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), - box_tick_cnt(0), start_tick_cnt(0) +Shader::Shader(const Params *p) : ClockedObject(p), + _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr), + gpuTc(nullptr), cpuPointer(p->cpu_pointer), + tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event", + false, Event::CPU_Tick_Pri), + timingSim(p->timing), hsail_mode(SIMT), + impl_kern_boundary_sync(p->impl_kern_boundary_sync), + coissue_return(1), + trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), + globalMemSize(p->globalmem), + nextSchedCu(0), sa_n(0), gpuCmdProc(*p->gpu_cmd_proc), + _dispatcher(*p->dispatcher), + max_valu_insts(p->max_valu_insts), total_valu_insts(0) { + gpuCmdProc.setShader(this); + _dispatcher.setShader(this); + + _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L; + _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL; + + _ldsApe.base = ((Addr)1 << 61) + 0x0; + _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; + + _scratchApe.base = ((Addr)1 << 61) + 0x100000000L; + _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; + + shHiddenPrivateBaseVmid = 0; cuList.resize(n_cu); + panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD"); + for (int i = 0; i < n_cu; ++i) { cuList[i] = p->CUs[i]; assert(i == cuList[i]->cu_id); cuList[i]->shader = this; + cuList[i]->idleCUTimeout = p->idlecu_timeout; } } +GPUDispatcher& +Shader::dispatcher() +{ + return _dispatcher; +} + Addr Shader::mmap(int length) { @@ -83,11 +109,11 @@ Shader::mmap(int length) auto mem_state = proc->memState; if (proc->mmapGrowsDown()) { - DPRINTF(HSAIL, "GROWS DOWN"); + DPRINTF(GPUShader, "GROWS DOWN"); start = mem_state->getMmapEnd() - length; mem_state->setMmapEnd(start); } else { - DPRINTF(HSAIL, "GROWS UP"); + DPRINTF(GPUShader, "GROWS UP"); start = mem_state->getMmapEnd(); mem_state->setMmapEnd(start + length); @@ -96,7 +122,7 @@ Shader::mmap(int length) mem_state->getMmapEnd()); } - DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); + DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length); 
proc->allocateMem(start, length); @@ -146,15 +172,15 @@ ShaderParams::create() } void -Shader::exec() +Shader::execScheduledAdds() { - tick_cnt = curTick(); - box_tick_cnt = curTick() - start_tick_cnt; + assert(!sa_when.empty()); // apply any scheduled adds for (int i = 0; i < sa_n; ++i) { - if (sa_when[i] <= tick_cnt) { + if (sa_when[i] <= curTick()) { *sa_val[i] += sa_x[i]; + panic_if(*sa_val[i] < 0, "Negative counter value\n"); sa_val.erase(sa_val.begin() + i); sa_x.erase(sa_x.begin() + i); sa_when.erase(sa_when.begin() + i); @@ -162,14 +188,62 @@ Shader::exec() --i; } } + if (!sa_when.empty()) { + Tick shader_wakeup = *std::max_element(sa_when.begin(), + sa_when.end()); + DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup); + schedule(tickEvent, shader_wakeup); + } else { + DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n"); + } +} + +/* + * dispatcher/shader arranges invalidate requests to the CUs + */ +void +Shader::prepareInvalidate(HSAQueueEntry *task) { + // if invalidate has already started/finished, then do nothing + if (task->isInvStarted()) return; + + // invalidate has never started; it can only perform once at kernel launch + assert(task->outstandingInvs() == -1); + int kernId = task->dispatchId(); + // counter value is 0 now, indicating the inv is about to start + _dispatcher.updateInvCounter(kernId, +1); + + // iterate all cus managed by the shader, to perform invalidate. + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + // create a request to hold INV info; the request's fields will + // be updated in cu before use + auto req = std::make_shared(0, 0, 0, + cuList[i_cu]->masterId(), + 0, -1); + + _dispatcher.updateInvCounter(kernId, +1); + // all necessary INV flags are all set now, call cu to execute + cuList[i_cu]->doInvalidate(req, task->dispatchId()); + } +} - // clock all of the cu's - for (int i = 0; i < n_cu; ++i) - cuList[i]->exec(); +/** + * dispatcher/shader arranges flush requests to the CUs + */ +void +Shader::prepareFlush(GPUDynInstPtr gpuDynInst){ + int kernId = gpuDynInst->kern_id; + // flush has never been started, performed only once at kernel end + assert(_dispatcher.getOutstandingWbs(kernId) == 0); + + // iterate all cus, managed by the shader, to perform flush. + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + _dispatcher.updateWbCounter(kernId, +1); + cuList[i_cu]->doFlush(gpuDynInst); + } } bool -Shader::dispatch_workgroups(NDRange *ndr) +Shader::dispatchWorkgroups(HSAQueueEntry *task) { bool scheduledSomething = false; int cuCount = 0; @@ -182,32 +256,24 @@ Shader::dispatch_workgroups(NDRange *ndr) // dispatch workgroup iff the following two conditions are met: // (a) wg_rem is true - there are unassigned workgroups in the grid // (b) there are enough free slots in cu cuList[i] for this wg - if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { + if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) { scheduledSomething = true; - DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); - - // ticks() member function translates cycles to simulation ticks. 
- if (!tickEvent.scheduled()) { - schedule(tickEvent, curTick() + this->ticks(1)); + DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n", + curCu, task->globalWgId()); + DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n", + curTick(), task->globalWgId(), curCu); + + if (!cuList[curCu]->tickEvent.scheduled()) { + if (!_activeCus) + _lastInactiveTick = curTick(); + _activeCus++; } - cuList[curCu]->StartWorkgroup(ndr); - ndr->wgId[0]++; - ndr->globalWgId++; - if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { - ndr->wgId[0] = 0; - ndr->wgId[1]++; - - if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { - ndr->wgId[1] = 0; - ndr->wgId[2]++; - - if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { - ndr->wg_disp_rem = false; - break; - } - } - } + panic_if(_activeCus <= 0 || _activeCus > cuList.size(), + "Invalid activeCu size\n"); + cuList[curCu]->dispWorkgroup(task); + + task->markWgDispatch(); } ++cuCount; @@ -218,9 +284,83 @@ Shader::dispatch_workgroups(NDRange *ndr) } void -Shader::handshake(GpuDispatcher *_dispatcher) +Shader::regStats() { - dispatcher = _dispatcher; + ClockedObject::regStats(); + + shaderActiveTicks + .name(name() + ".shader_active_ticks") + .desc("Total ticks that any CU attached to this shader is active") + ; + allLatencyDist + .init(0, 1600000, 10000) + .name(name() + ".allLatencyDist") + .desc("delay distribution for all") + .flags(Stats::pdf | Stats::oneline); + + loadLatencyDist + .init(0, 1600000, 10000) + .name(name() + ".loadLatencyDist") + .desc("delay distribution for loads") + .flags(Stats::pdf | Stats::oneline); + + storeLatencyDist + .init(0, 1600000, 10000) + .name(name() + ".storeLatencyDist") + .desc("delay distribution for stores") + .flags(Stats::pdf | Stats::oneline); + + vectorInstSrcOperand + .init(4) + .name(name() + ".vec_inst_src_operand") + .desc("vector instruction source operand distribution"); + + vectorInstDstOperand + .init(4) + .name(name() + ".vec_inst_dst_operand") + .desc("vector instruction destination operand distribution"); + + initToCoalesceLatency + .init(0, 1600000, 10000) + .name(name() + ".initToCoalesceLatency") + .desc("Ticks from vmem inst initiateAcc to coalescer issue") + .flags(Stats::pdf | Stats::oneline); + + rubyNetworkLatency + .init(0, 1600000, 10000) + .name(name() + ".rubyNetworkLatency") + .desc("Ticks from coalescer issue to coalescer hit callback") + .flags(Stats::pdf | Stats::oneline); + + gmEnqueueLatency + .init(0, 1600000, 10000) + .name(name() + ".gmEnqueueLatency") + .desc("Ticks from coalescer hit callback to GM pipe enqueue") + .flags(Stats::pdf | Stats::oneline); + + gmToCompleteLatency + .init(0, 1600000, 10000) + .name(name() + ".gmToCompleteLatency") + .desc("Ticks queued in GM pipes ordered response buffer") + .flags(Stats::pdf | Stats::oneline); + + coalsrLineAddresses + .init(0, 20, 1) + .name(name() + ".coalsrLineAddresses") + .desc("Number of cache lines for coalesced request") + .flags(Stats::pdf | Stats::oneline); + + int wfSize = cuList[0]->wfSize(); + cacheBlockRoundTrip = new Stats::Distribution[wfSize]; + for (int idx = 0; idx < wfSize; ++idx) { + std::stringstream namestr; + ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx); + cacheBlockRoundTrip[idx] + .init(0, 1600000, 10000) + .name(namestr.str()) + .desc("Coalsr-to-coalsr time for the Nth cache block in an inst") + .flags(Stats::pdf | Stats::oneline); + } } void @@ -251,7 +391,6 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, RequestPtr req1, req2; 
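// A standalone sketch of the round-robin policy dispatchWorkgroups() uses
// above: start at nextSchedCu, visit each CU at most once, and give a
// workgroup to any CU that still has dispatch resources. CuStub/TaskStub and
// the way the starting point advances are simplified assumptions, not the
// gem5 ComputeUnit or HSAQueueEntry interfaces.
#include <vector>

struct TaskStub { int wgRemaining; };
struct CuStub { int freeWgSlots; };

inline bool
dispatchRoundRobin(std::vector<CuStub> &cus, TaskStub &task, int &nextSchedCu)
{
    const int nCu = static_cast<int>(cus.size());
    if (nCu == 0) {
        return false;
    }
    bool scheduled = false;
    for (int visited = 0; visited < nCu && task.wgRemaining > 0; ++visited) {
        int cu = (nextSchedCu + visited) % nCu;
        if (cus[cu].freeWgSlots > 0) {
            --cus[cu].freeWgSlots;
            --task.wgRemaining;
            scheduled = true;
        }
    }
    nextSchedCu = (nextSchedCu + 1) % nCu;  // rotate the starting CU
    return scheduled;
}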
req->splitOnVaddr(split_addr, req1, req2); - PacketPtr pkt1 = new Packet(req2, cmd); PacketPtr pkt2 = new Packet(req1, cmd); @@ -297,34 +436,22 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, } } -bool -Shader::busy() -{ - for (int i_cu = 0; i_cu < n_cu; ++i_cu) { - if (!cuList[i_cu]->isDone()) { - return true; - } - } - - return false; -} - void -Shader::ScheduleAdd(uint32_t *val,Tick when,int x) +Shader::ScheduleAdd(int *val,Tick when,int x) { sa_val.push_back(val); - sa_when.push_back(tick_cnt + when); + when += curTick(); + sa_when.push_back(when); sa_x.push_back(x); ++sa_n; -} - - -void -Shader::processTick() -{ - if (busy()) { - exec(); - schedule(tickEvent, curTick() + ticks(1)); + if (!tickEvent.scheduled() || (when < tickEvent.when())) { + DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at " + "%lu\n", when); + reschedule(tickEvent, when, true); + } else { + assert(tickEvent.scheduled()); + DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at " + "%lu\n", when); } } @@ -356,7 +483,8 @@ void Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, bool suppress_func_errors) { - AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, + suppress_func_errors); } void @@ -385,15 +513,11 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) pkt->senderState = new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); - if (cu_id == n_cu) { - dispatcher->tlbPort->sendFunctional(pkt); - } else { - // even when the perLaneTLB flag is turned on - // it's ok tp send all accesses through lane 0 - // since the lane # is not known here, - // This isn't important since these are functional accesses. - cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); - } + // even when the perLaneTLB flag is turned on + // it's ok tp send all accesses through lane 0 + // since the lane # is not known here, + // This isn't important since these are functional accesses. + cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); /* safe_cast the senderState */ TheISA::GpuTLB::TranslationState *sender_state = @@ -402,3 +526,82 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) delete sender_state->tlbEntry; delete pkt->senderState; } + +/* + * allow the shader to sample stats from constituent devices + */ +void +Shader::sampleStore(const Tick accessTime) +{ + storeLatencyDist.sample(accessTime); + allLatencyDist.sample(accessTime); +} + +/* + * allow the shader to sample stats from constituent devices + */ +void +Shader::sampleLoad(const Tick accessTime) +{ + loadLatencyDist.sample(accessTime); + allLatencyDist.sample(accessTime); +} + +void +Shader::sampleInstRoundTrip(std::vector roundTripTime) +{ + // Only sample instructions that go all the way to main memory + if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) { + return; + } + + Tick t1 = roundTripTime[0]; + Tick t2 = roundTripTime[1]; + Tick t3 = roundTripTime[2]; + Tick t4 = roundTripTime[3]; + Tick t5 = roundTripTime[4]; + + initToCoalesceLatency.sample(t2-t1); + rubyNetworkLatency.sample(t3-t2); + gmEnqueueLatency.sample(t4-t3); + gmToCompleteLatency.sample(t5-t4); +} + +void +Shader::sampleLineRoundTrip(const std::map>& lineMap) +{ + coalsrLineAddresses.sample(lineMap.size()); + std::vector netTimes; + + // For each cache block address generated by a vmem inst, calculate + // the round-trip time for that cache block. 
+ for (auto& it : lineMap) { + const std::vector& timeVec = it.second; + if (timeVec.size() == 2) { + netTimes.push_back(timeVec[1] - timeVec[0]); + } + } + + // Sort the cache block round trip times so that the first + // distribution is always measuring the fastest and the last + // distribution is always measuring the slowest cache block. + std::sort(netTimes.begin(), netTimes.end()); + + // Sample the round trip time for each N cache blocks into the + // Nth distribution. + int idx = 0; + for (auto& time : netTimes) { + cacheBlockRoundTrip[idx].sample(time); + ++idx; + } +} + +void +Shader::notifyCuSleep() { + // If all CUs attached to this shader are asleep, update shaderActiveTicks + panic_if(_activeCus <= 0 || _activeCus > cuList.size(), + "Invalid activeCu size\n"); + _activeCus--; + if (!_activeCus) + shaderActiveTicks += curTick() - _lastInactiveTick; +} diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 5c14d9898..72063a4a5 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -14,9 +14,9 @@ * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -30,7 +30,7 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * Author: Steve Reinhardt + * Authors: Steve Reinhardt */ #ifndef __SHADER_HH__ @@ -47,11 +47,11 @@ #include "cpu/simple_thread.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" -#include "enums/MemType.hh" #include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_tlb.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/lds_state.hh" -#include "gpu-compute/qstruct.hh" #include "mem/page_table.hh" #include "mem/port.hh" #include "mem/request.hh" @@ -61,7 +61,8 @@ #include "sim/sim_object.hh" class BaseTLB; -class GpuDispatcher; +class GPUCommandProcessor; +class GPUDispatcher; namespace TheISA { @@ -70,36 +71,144 @@ namespace TheISA static const int LDS_SIZE = 65536; +// aperture (APE) registers define the base/limit +// pair for the ATC mapped memory space. currently +// the only APEs we consider are for GPUVM/LDS/scratch. +// the APEs are registered with unique values based +// on a per-device basis +struct ApertureRegister +{ + Addr base; + Addr limit; +}; + // Class Shader: This describes a single shader instance. Most // configurations will only have a single shader.
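// Illustrative sketch (not the gem5 classes; names invented): the
// shaderActiveTicks accounting used by dispatchWorkgroups() and
// notifyCuSleep() above only touches the accumulator on the 0 -> 1 and
// 1 -> 0 transitions of the active-CU count, so overlapping CU activity is
// never double counted.

#include <cassert>
#include <cstdint>

using Tick = uint64_t;

class ActiveTickCounter
{
    int activeCus = 0;      // CUs currently executing work
    Tick lastInactive = 0;  // tick at which the count last left zero
    Tick activeTicks = 0;   // accumulated time with at least one busy CU

  public:
    void
    cuWake(Tick now, int numCus)
    {
        if (!activeCus)
            lastInactive = now;                 // a busy interval starts here
        ++activeCus;
        assert(activeCus > 0 && activeCus <= numCus);
    }

    void
    cuSleep(Tick now)
    {
        assert(activeCus > 0);
        if (!--activeCus)
            activeTicks += now - lastInactive;  // the busy interval ends
    }

    Tick total() const { return activeTicks; }
};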
class Shader : public ClockedObject { - protected: - // Shader's clock period in terms of number of ticks of curTime, - // aka global simulation clock - Tick clock; + private: + ApertureRegister _gpuVmApe; + ApertureRegister _ldsApe; + ApertureRegister _scratchApe; + Addr shHiddenPrivateBaseVmid; + + // Number of active Cus attached to this shader + int _activeCus; + + // Last tick that all CUs attached to this shader were inactive + Tick _lastInactiveTick; + + // some stats for measuring latency + Stats::Distribution allLatencyDist; + Stats::Distribution loadLatencyDist; + Stats::Distribution storeLatencyDist; + + // average ticks from vmem inst initiateAcc to coalescer issue, + // average ticks from coalescer issue to coalescer hit callback, + // average ticks from coalescer hit callback to GM pipe enqueue, + // and average ticks spent in GM pipe's ordered resp buffer. + Stats::Distribution initToCoalesceLatency; + Stats::Distribution rubyNetworkLatency; + Stats::Distribution gmEnqueueLatency; + Stats::Distribution gmToCompleteLatency; + + // average number of cache blocks requested by vmem inst, and + // average ticks for cache blocks to main memory for the Nth + // cache block generated by a vmem inst. + Stats::Distribution coalsrLineAddresses; + Stats::Distribution *cacheBlockRoundTrip; public: typedef ShaderParams Params; enum hsail_mode_e {SIMT,VECTOR_SCALAR}; - // clock related functions ; maps to-and-from - // Simulation ticks and shader clocks. - Tick frequency() const { return SimClock::Frequency / clock; } - - Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } - - Tick getClock() const { return clock; } - Tick curCycle() const { return curTick() / clock; } - Tick tickToCycles(Tick val) const { return val / clock;} - + GPUDispatcher &dispatcher(); + void sampleLoad(const Tick accessTime); + void sampleStore(const Tick accessTime); + void sampleInstRoundTrip(std::vector roundTripTime); + void sampleLineRoundTrip(const std::map> &roundTripTime); SimpleThread *cpuThread; ThreadContext *gpuTc; BaseCPU *cpuPointer; - void processTick(); + const ApertureRegister& + gpuVmApe() const + { + return _gpuVmApe; + } + + const ApertureRegister& + ldsApe() const + { + return _ldsApe; + } + + const ApertureRegister& + scratchApe() const + { + return _scratchApe; + } + + bool + isGpuVmApe(Addr addr) const + { + bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit; + + return is_gpu_vm; + } + + bool + isLdsApe(Addr addr) const + { + bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit; + + return is_lds; + } + + bool + isScratchApe(Addr addr) const + { + bool is_scratch + = addr >= _scratchApe.base && addr <= _scratchApe.limit; + + return is_scratch; + } + + Addr + getScratchBase() + { + return _scratchApe.base; + } + + Addr + getHiddenPrivateBase() + { + return shHiddenPrivateBaseVmid; + } + + void + initShHiddenPrivateBase(Addr queueBase, uint32_t offset) + { + Addr sh_hidden_base_new = queueBase - offset; + + // We are initializing sh_hidden_private_base_vmid from the + // amd queue descriptor from the first queue. + // The sh_hidden_private_base_vmid is supposed to be same for + // all the queues from the same process + if (shHiddenPrivateBaseVmid != sh_hidden_base_new) { + // Do not panic if shHiddenPrivateBaseVmid == 0, + // that is if it is uninitialized. Panic only + // if the value is initilized and we get + // a differnt base later. 
+ panic_if(shHiddenPrivateBaseVmid != 0, + "Currently we support only single process\n"); + } + shHiddenPrivateBaseVmid = sh_hidden_base_new; + } + EventFunctionWrapper tickEvent; // is this simulation going to be timing mode in the memory? @@ -108,30 +217,18 @@ class Shader : public ClockedObject // If set, issue acq packet @ kernel launch int impl_kern_boundary_sync; - // If set, generate a separate packet for acquire/release on - // ld_acquire/st_release/atomic operations - int separate_acquire_release; // If set, fetch returns may be coissued with instructions int coissue_return; // If set, always dump all 64 gprs to trace int trace_vgpr_all; // Number of cu units in the shader int n_cu; - // Number of wavefront slots per cu + // Number of wavefront slots per SIMD per CU int n_wf; + // The size of global memory int globalMemSize; - /* - * Bytes/work-item for call instruction - * The number of arguments for an hsail function will - * vary. We simply determine the maximum # of arguments - * required by any hsail function up front before the - * simulation (during parsing of the Brig) and record - * that number here. - */ - int funcargs_size; - // Tracks CU that rr dispatcher should attempt scheduling int nextSchedCu; @@ -139,7 +236,7 @@ class Shader : public ClockedObject uint32_t sa_n; // Pointer to value to be increments - std::vector sa_val; + std::vector sa_val; // When to do the increment std::vector sa_when; // Amount to increment by @@ -148,24 +245,29 @@ class Shader : public ClockedObject // List of Compute Units (CU's) std::vector cuList; - uint64_t tick_cnt; - uint64_t box_tick_cnt; - uint64_t start_tick_cnt; + GPUCommandProcessor &gpuCmdProc; + GPUDispatcher &_dispatcher; + + /** + * Statistics + */ + Stats::Scalar shaderActiveTicks; + Stats::Vector vectorInstSrcOperand; + Stats::Vector vectorInstDstOperand; + void regStats(); - GpuDispatcher *dispatcher; + int max_valu_insts; + int total_valu_insts; Shader(const Params *p); ~Shader(); virtual void init(); - // Run shader - void exec(); - - // Check to see if shader is busy - bool busy(); + // Run shader scheduled adds + void execScheduledAdds(); // Schedule a 32-bit value to be incremented some time in the future - void ScheduleAdd(uint32_t *val, Tick when, int x); + void ScheduleAdd(int *val, Tick when, int x); bool processTimingPacket(PacketPtr pkt); void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, @@ -190,12 +292,15 @@ class Shader : public ClockedObject cuList[cu_id] = compute_unit; } - void handshake(GpuDispatcher *dispatcher); - bool dispatch_workgroups(NDRange *ndr); + void prepareInvalidate(HSAQueueEntry *task); + void prepareFlush(GPUDynInstPtr gpuDynInst); + + bool dispatchWorkgroups(HSAQueueEntry *task); Addr mmap(int length); void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); void updateContext(int cid); void hostWakeUp(BaseCPU *cpu); + void notifyCuSleep(); }; #endif // __SHADER_HH__ diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc index 1e4f0c6fc..1d0f1b8d7 100644 --- a/src/gpu-compute/simple_pool_manager.cc +++ b/src/gpu-compute/simple_pool_manager.cc @@ -35,6 +35,12 @@ #include "base/logging.hh" +SimplePoolManager * +SimplePoolManagerParams::create() +{ + return new SimplePoolManager(this); +} + // return the min number of elements that the manager can reserve given // a request for "size" elements uint32_t @@ -64,8 +70,6 @@ SimplePoolManager::printRegion() bool SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t 
size) { - assert(numRegions * minAllocatedElements(size) <= poolSize()); - return _reservedGroups == 0; } diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh index 3b7ea9eb3..9fd90a505 100644 --- a/src/gpu-compute/simple_pool_manager.hh +++ b/src/gpu-compute/simple_pool_manager.hh @@ -38,14 +38,15 @@ #include #include "gpu-compute/pool_manager.hh" +#include "params/SimplePoolManager.hh" // Simple Pool Manager: allows one region per pool. No region merging is // supported. class SimplePoolManager : public PoolManager { public: - SimplePoolManager(uint32_t minAlloc, uint32_t poolSize) - : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0), + SimplePoolManager(const PoolManagerParams *p) + : PoolManager(p), _regionSize(0), _nxtFreeIdx(0), _reservedGroups(0) { } @@ -62,7 +63,7 @@ class SimplePoolManager : public PoolManager // be reserved) uint32_t _regionSize; // next index to allocate a region - uint8_t _nxtFreeIdx; + int _nxtFreeIdx; // number of groups that reserve a region uint32_t _reservedGroups; }; diff --git a/src/gpu-compute/static_register_manager_policy.cc b/src/gpu-compute/static_register_manager_policy.cc new file mode 100644 index 000000000..85f530bfc --- /dev/null +++ b/src/gpu-compute/static_register_manager_policy.cc @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2016 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Mark Wyse + */ + +#include "gpu-compute/static_register_manager_policy.hh" + +#include "config/the_gpu_isa.hh" +#include "debug/GPURename.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/pool_manager.hh" +#include "gpu-compute/scalar_register_file.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +StaticRegisterManagerPolicy::StaticRegisterManagerPolicy() +{ +} + +void +StaticRegisterManagerPolicy::exec() +{ +} + +int +StaticRegisterManagerPolicy::mapVgpr(Wavefront* w, int vgprIndex) +{ + panic_if((vgprIndex >= w->reservedVectorRegs) + || (w->reservedVectorRegs < 0), + "VGPR index %d is out of range: VGPR range=[0,%d]", + vgprIndex, w->reservedVectorRegs); + + // add the offset from where the VGPRs of the wavefront have been assigned + int physicalVgprIndex = w->startVgprIndex + vgprIndex; + + panic_if(!((w->startVgprIndex <= physicalVgprIndex) && + (w->startVgprIndex + w->reservedVectorRegs - 1) + >= physicalVgprIndex), + "Invalid VGPR index %d\n", physicalVgprIndex); + + // calculate physical VGPR index + return physicalVgprIndex % w->computeUnit->vrf[w->simdId]->numRegs(); +} + +int +StaticRegisterManagerPolicy::mapSgpr(Wavefront* w, int sgprIndex) +{ + panic_if(!((sgprIndex < w->reservedScalarRegs) + && (w->reservedScalarRegs > 0)), + "SGPR index %d is out of range: SGPR range=[0,%d]\n", + sgprIndex, w->reservedScalarRegs); + + // add the offset from where the SGPRs of the wavefront have been assigned + int physicalSgprIndex = w->startSgprIndex + sgprIndex; + + panic_if(!((w->startSgprIndex <= physicalSgprIndex) && + (w->startSgprIndex + w->reservedScalarRegs - 1) + >= physicalSgprIndex), + "Invalid SGPR index %d\n", physicalSgprIndex); + + // calculate physical SGPR index + return physicalSgprIndex % w->computeUnit->srf[w->simdId]->numRegs(); +} + +bool +StaticRegisterManagerPolicy::canAllocateVgprs(int simdId, int nWfs, + int demandPerWf) +{ + return cu->registerManager->vrfPoolMgrs[simdId]-> + canAllocate(nWfs, demandPerWf); +} + +bool +StaticRegisterManagerPolicy::canAllocateSgprs(int simdId, int nWfs, + int demandPerWf) +{ + return cu->registerManager->srfPoolMgrs[simdId]-> + canAllocate(nWfs, demandPerWf); +} + +void +StaticRegisterManagerPolicy::allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) +{ + uint32_t allocatedSize = 0; + w->startVgprIndex = cu->registerManager->vrfPoolMgrs[w->simdId]-> + allocateRegion(vectorDemand, &allocatedSize); + w->reservedVectorRegs = allocatedSize; + cu->vectorRegsReserved[w->simdId] += w->reservedVectorRegs; + panic_if(cu->vectorRegsReserved[w->simdId] > cu->numVecRegsPerSimd, + "VRF[%d] has been overallocated %d > %d\n", + w->simdId, cu->vectorRegsReserved[w->simdId], + cu->numVecRegsPerSimd); + + if (scalarDemand) { + w->startSgprIndex = cu->registerManager->srfPoolMgrs[w->simdId]-> + allocateRegion(scalarDemand, &allocatedSize); + w->reservedScalarRegs = allocatedSize; + cu->scalarRegsReserved[w->simdId] += w->reservedScalarRegs; + panic_if(cu->scalarRegsReserved[w->simdId] > cu->numScalarRegsPerSimd, + "SRF[%d] has been overallocated %d > %d\n", + w->simdId, cu->scalarRegsReserved[w->simdId], + cu->numScalarRegsPerSimd); + } +} + +void +StaticRegisterManagerPolicy::freeRegisters(Wavefront *w) +{ + // free the vector registers of the completed wavefront + w->computeUnit->vectorRegsReserved[w->simdId] -= w->reservedVectorRegs; + // free the scalar registers of the completed wavefront + 
w->computeUnit->scalarRegsReserved[w->simdId] -= w->reservedScalarRegs; + + panic_if(w->computeUnit->vectorRegsReserved[w->simdId] < 0, + "Freeing VRF[%d] registers left %d registers reserved\n", + w->simdId, + w->computeUnit->vectorRegsReserved[w->simdId]); + panic_if(w->computeUnit->scalarRegsReserved[w->simdId] < 0, + "Freeing SRF[%d] registers left %d registers reserved\n", + w->simdId, + w->computeUnit->scalarRegsReserved[w->simdId]); + + int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) % + w->computeUnit->vrf[w->simdId]->numRegs(); + + w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]-> + freeRegion(w->startVgprIndex, endIndex); + + // mark/pre-mark all registers as not busy + for (int i = 0; i < w->reservedVectorRegs; i++) { + uint32_t physVgprIdx = mapVgpr(w, i); + w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false); + } + + w->reservedVectorRegs = 0; + w->startVgprIndex = 0; + + endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) % + w->computeUnit->srf[w->simdId]->numRegs(); + w->computeUnit->registerManager->srfPoolMgrs[w->simdId]-> + freeRegion(w->startSgprIndex, endIndex); + + // mark/pre-mark all registers as not busy + for (int i = 0; i < w->reservedScalarRegs; i++) { + uint32_t physSgprIdx = mapSgpr(w, i); + w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false); + } + + w->reservedScalarRegs = 0; + w->startSgprIndex = 0; +} + +void +StaticRegisterManagerPolicy::regStats() +{ +} diff --git a/src/gpu-compute/static_register_manager_policy.hh b/src/gpu-compute/static_register_manager_policy.hh new file mode 100644 index 000000000..6abeb1d1a --- /dev/null +++ b/src/gpu-compute/static_register_manager_policy.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Mark Wyse + */ + +#ifndef __STATIC_REGISTER_MANAGER_POLICY_HH__ +#define __STATIC_REGISTER_MANAGER_POLICY_HH__ + +#include "gpu-compute/register_manager_policy.hh" + +class HSAQueueEntry; + +class StaticRegisterManagerPolicy : public RegisterManagerPolicy +{ + public: + + StaticRegisterManagerPolicy(); + + void exec() override; + + int mapVgpr(Wavefront* w, int vgprIndex) override; + int mapSgpr(Wavefront* w, int sgprIndex) override; + + bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) override; + bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) override; + + void allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) override; + + void freeRegisters(Wavefront *w) override; + + void regStats() override; +}; + +#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc index 90eadb026..51d2e761a 100644 --- a/src/gpu-compute/tlb_coalescer.cc +++ b/src/gpu-compute/tlb_coalescer.cc @@ -41,7 +41,6 @@ TLBCoalescer::TLBCoalescer(const Params *p) : ClockedObject(p), - clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), coalescingWindow(p->coalescingWindow), disableCoalescing(p->disableCoalescing), @@ -317,7 +316,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) //coalesced requests to the TLB if (!coalescer->probeTLBEvent.scheduled()) { coalescer->schedule(coalescer->probeTLBEvent, - curTick() + coalescer->ticks(1)); + curTick() + coalescer->clockPeriod()); } return true; @@ -380,7 +379,7 @@ TLBCoalescer::MemSidePort::recvReqRetry() //we've receeived a retry. Schedule a probeTLBEvent if (!coalescer->probeTLBEvent.scheduled()) coalescer->schedule(coalescer->probeTLBEvent, - curTick() + coalescer->ticks(1)); + curTick() + coalescer->clockPeriod()); } void @@ -448,7 +447,7 @@ TLBCoalescer::processProbeTLBEvent() // send the coalesced request for virt_page_addr if (!memSidePort[0]->sendTimingReq(first_packet)) { - DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n", virt_page_addr); // No need for a retries queue since we are already buffering diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh index 72d06deff..842237e5c 100644 --- a/src/gpu-compute/tlb_coalescer.hh +++ b/src/gpu-compute/tlb_coalescer.hh @@ -65,13 +65,6 @@ class ThreadContext; */ class TLBCoalescer : public ClockedObject { - protected: - // TLB clock: will inherit clock from shader's clock period in terms - // of nuber of ticks of curTime (aka global simulation clock) - // The assignment of TLB clock from shader clock is done in the - // python config files. - int clock; - public: typedef TLBCoalescerParams Params; TLBCoalescer(const Params *p); @@ -105,7 +98,8 @@ class TLBCoalescer : public ClockedObject * option is to change it to curTick(), so we coalesce based * on the receive time. */ - typedef std::unordered_map> CoalescingFIFO; + typedef std::unordered_map> + CoalescingFIFO; CoalescingFIFO coalescerFIFO; @@ -143,13 +137,6 @@ class TLBCoalescer : public ClockedObject void updatePhysAddresses(PacketPtr pkt); void regStats() override; - // Clock related functions. Maps to-and-from - // Simulation ticks and object clocks. 
- Tick frequency() const { return SimClock::Frequency / clock; } - Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } - Tick curCycle() const { return curTick() / clock; } - Tick tickToCycles(Tick val) const { return val / clock;} - class CpuSidePort : public SlavePort { public: @@ -171,7 +158,8 @@ class TLBCoalescer : public ClockedObject virtual void recvRespRetry() { - fatal("recvRespRetry() is not implemented in the TLB coalescer.\n"); + fatal("recvRespRetry() is not implemented in the TLB " + "coalescer.\n"); } virtual AddrRangeList getAddrRanges() const; diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index a57a80972..3bddfccc1 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -36,81 +36,21 @@ #include #include "base/logging.hh" +#include "base/trace.hh" +#include "debug/GPUVRF.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" -#include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/wavefront.hh" #include "params/VectorRegisterFile.hh" VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) - : SimObject(p), - manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)), - simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd), - vgprState(new VecRegisterState()) + : RegisterFile(p) { - fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n"); - fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + regFile.resize(numRegs(), VecRegContainer()); - fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not " - "multiple of VRF size\n"); - - busy.clear(); - busy.resize(numRegsPerSimd, 0); - nxtBusy.clear(); - nxtBusy.resize(numRegsPerSimd, 0); - - vgprState->init(numRegsPerSimd, p->wfSize); -} - -void -VectorRegisterFile::setParent(ComputeUnit *_computeUnit) -{ - computeUnit = _computeUnit; - vgprState->setParent(computeUnit); -} - -uint8_t -VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const -{ - uint8_t status = nxtBusy.at(idx); - - if (operandSize > 4) { - status = status | (nxtBusy.at((idx + 1) % numRegs())); - } - - return status; -} - -uint8_t -VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const -{ - uint8_t status = busy.at(idx); - - if (operandSize > 4) { - status = status | (busy.at((idx + 1) % numRegs())); - } - - return status; -} - -void -VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value) -{ - nxtBusy.at(regIdx) = value; - - if (operandSize > 4) { - nxtBusy.at((regIdx + 1) % numRegs()) = value; - } -} - -void -VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value) -{ - busy.at(regIdx) = value; - - if (operandSize > 4) { - busy.at((regIdx + 1) % numRegs()) = value; + for (auto ® : regFile) { + reg.zero(); } } @@ -118,127 +58,154 @@ bool VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const { for (int i = 0; i < ii->getNumOperands(); ++i) { - if (ii->isVectorRegister(i)) { - uint32_t vgprIdx = ii->getRegisterIndex(i, ii); - uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1); - - if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) { - if (ii->isDstOperand(i)) { - w->numTimesBlockedDueWAXDependencies++; - } else if (ii->isSrcOperand(i)) { - w->numTimesBlockedDueRAWDependencies++; - } - - return false; - } - - if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) { - if (ii->isDstOperand(i)) { - w->numTimesBlockedDueWAXDependencies++; - } else if 
(ii->isSrcOperand(i)) { - w->numTimesBlockedDueRAWDependencies++; + if (ii->isVectorRegister(i) && ii->isSrcOperand(i)) { + int vgprIdx = ii->getRegisterIndex(i, ii); + + // determine number of registers + int nRegs = + ii->getOperandSize(i) <= 4 ? 1 : ii->getOperandSize(i) / 4; + for (int j = 0; j < nRegs; j++) { + int pVgpr = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + if (regBusy(pVgpr)) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n", + w->wfDynId, ii->disassemble(), pVgpr); + w->numTimesBlockedDueRAWDependencies++; + } + return false; } - - return false; } } } - return true; } void -VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) +VectorRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) { - bool loadInstr = ii->isLoad(); - bool atomicInstr = ii->isAtomic() || ii->isMemFence(); - - bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); - // iterate over all register destination operands for (int i = 0; i < ii->getNumOperands(); ++i) { if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { - uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii), - ii->getOperandSize(i), 1); - - // mark the destination vector register as busy - markReg(physReg, ii->getOperandSize(i), 1); - // clear the in-flight status of the destination vector register - preMarkReg(physReg, ii->getOperandSize(i), 0); - - // FIXME: if we ever model correct timing behavior - // for load argument instructions then we should not - // set the destination register as busy now but when - // the data returns. Loads and Atomics should free - // their destination registers when the data returns, - // not now - if (!atomicInstr && !loadNoArgInstr) { - uint32_t pipeLen = ii->getOperandSize(i) <= 4 ? - computeUnit->spBypassLength() : - computeUnit->dpBypassLength(); - - // schedule an event for marking the register as ready - computeUnit->registerEvent(w->simdId, physReg, - ii->getOperandSize(i), - computeUnit->shader->tick_cnt + - computeUnit->shader->ticks(pipeLen), - 0); + int vgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + + // If instruction is atomic instruction and + // the atomics do not return value, then + // do not mark this reg as busy. + if (!(ii->isAtomic() && !ii->isAtomicRet())) { + /** + * if the instruction is a load with EXEC = 0, then + * we do not mark the reg. we do this to avoid a + * deadlock that can occur because a load reserves + * its destination regs before checking its exec mask, + * and in the case it is 0, it will not send/recv any + * packets, and therefore it will never free its dest + * reg(s). 
+ */ + if (!ii->isLoad() || (ii->isLoad() + && ii->exec_mask.any())) { + markReg(physReg, true); + } + } } } } } -int -VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w, - std::vector ®Vec, uint32_t operandSize, - uint64_t timestamp) +void +VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) { - int delay = 0; + // increment count of number of DWORDs read from VRF + int DWORDs = ii->numSrcVecDWORDs(); + registerReads += (DWORDs * w->execMask().count()); + + uint64_t mask = w->execMask().to_ullong(); + int srams = w->execMask().size() / 4; + for (int i = 0; i < srams; i++) { + if (mask & 0xF) { + sramReads += DWORDs; + } + mask = mask >> 4; + } - panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n", - regVec.size()); + if (!ii->isLoad() + && !(ii->isAtomic() || ii->isMemSync())) { + int opSize = 4; + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->getOperandSize(i) > opSize) { + opSize = ii->getOperandSize(i); + } + } + Cycles delay(opSize <= 4 ? computeUnit->spBypassLength() + : computeUnit->dpBypassLength()); + Tick tickDelay = computeUnit->cyclesToTicks(delay); + + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + int vgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 + : ii->getOperandSize(i) / 4; + for (int j = 0; j < nRegs; j++) { + int physReg = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + enqRegFreeEvent(physReg, tickDelay); + } + } + } - for (int i = 0; i < regVec.size(); ++i) { - // mark the destination VGPR as free when the timestamp expires - computeUnit->registerEvent(w->simdId, regVec[i], operandSize, - computeUnit->shader->tick_cnt + timestamp + - computeUnit->shader->ticks(delay), 0); - } + // increment count of number of DWORDs written to VRF + DWORDs = ii->numDstVecDWORDs(); + registerWrites += (DWORDs * w->execMask().count()); - return delay; + mask = w->execMask().to_ullong(); + srams = w->execMask().size() / 4; + for (int i = 0; i < srams; i++) { + if (mask & 0xF) { + sramWrites += DWORDs; + } + mask = mask >> 4; + } + } } void -VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii) +VectorRegisterFile::scheduleWriteOperandsFromLoad( + Wavefront *w, GPUDynInstPtr ii) { - // iterate over all register destination operands + assert(ii->isLoad() || ii->isAtomicRet()); for (int i = 0; i < ii->getNumOperands(); ++i) { if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { - uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii), - ii->getOperandSize(i), 1); - // set the in-flight status of the destination vector register - preMarkReg(physReg, ii->getOperandSize(i), 1); + int vgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 
1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + enqRegFreeEvent(physReg, computeUnit->clockPeriod()); + } } } -} - -bool -VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, - GPUDynInstPtr ii, - VrfAccessType accessType) -{ - bool ready = true; - - return ready; -} - -bool -VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, - VrfAccessType accessType) -{ - bool ready = true; - - return ready; + // increment count of number of DWORDs written to VRF + int DWORDs = ii->numDstVecDWORDs(); + registerWrites += (DWORDs * ii->exec_mask.count()); + + uint64_t mask = ii->exec_mask.to_ullong(); + int srams = ii->exec_mask.size() / 4; + for (int i = 0; i < srams; i++) { + if (mask & 0xF) { + sramWrites += DWORDs; + } + mask = mask >> 4; + } } VectorRegisterFile* diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh index 254197540..0ad086d68 100644 --- a/src/gpu-compute/vector_register_file.hh +++ b/src/gpu-compute/vector_register_file.hh @@ -34,111 +34,76 @@ #ifndef __VECTOR_REGISTER_FILE_HH__ #define __VECTOR_REGISTER_FILE_HH__ -#include - -#include "base/statistics.hh" -#include "base/trace.hh" -#include "base/types.hh" +#include "arch/gpu_isa.hh" +#include "config/the_gpu_isa.hh" #include "debug/GPUVRF.hh" -#include "gpu-compute/vector_register_state.hh" -#include "sim/sim_object.hh" - -class ComputeUnit; -class Shader; -class SimplePoolManager; -class Wavefront; +#include "gpu-compute/register_file.hh" +#include "gpu-compute/wavefront.hh" struct VectorRegisterFileParams; -enum class VrfAccessType : uint8_t -{ - READ = 0x01, - WRITE = 0x02, - RD_WR = READ | WRITE -}; - // Vector Register File -class VectorRegisterFile : public SimObject +class VectorRegisterFile : public RegisterFile { public: + using VecRegContainer = TheGpuISA::VecRegContainerU32; + VectorRegisterFile(const VectorRegisterFileParams *p); + ~VectorRegisterFile() { } - void setParent(ComputeUnit *_computeUnit); + virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override; + virtual void scheduleWriteOperands(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override; - // Read a register - template - T - read(int regIdx, int threadId=0) + void + setParent(ComputeUnit *_computeUnit) override { - T p0 = vgprState->read(regIdx, threadId); - DPRINTF(GPUVRF, "reading vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)p0); - - return p0; + RegisterFile::setParent(_computeUnit); } - // Write a register - template - void - write(int regIdx, T value, int threadId=0) + // Read a register that is writeable (e.g., a DST operand) + VecRegContainer& + readWriteable(int regIdx) { - DPRINTF(GPUVRF, "writing vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)value); - vgprState->write(regIdx, value, threadId); + return regFile[regIdx]; } - uint8_t regBusy(int idx, uint32_t operandSize) const; - uint8_t regNxtBusy(int idx, uint32_t operandSize) const; - - int numRegs() const { return numRegsPerSimd; } - - void markReg(int regIdx, uint32_t operandSize, uint8_t value); - void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value); - - virtual void exec(GPUDynInstPtr ii, Wavefront *w); - - virtual int exec(uint64_t dynamic_id, Wavefront *w, - std::vector ®Vec, uint32_t operandSize, - uint64_t timestamp); - 
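// Illustrative sketch (standalone, not the gem5 RegisterFile API): the
// sramReads/sramWrites accounting in waveExecuteInst() and
// scheduleWriteOperandsFromLoad() above walks the 64-lane execution mask
// four lanes at a time; any group of four lanes with at least one active
// lane adds the instruction's DWORD count to the SRAM access total.

#include <bitset>
#include <cstdint>

int
sramAccesses(const std::bitset<64> &execMask, int dwords)
{
    uint64_t mask = execMask.to_ullong();
    const int laneGroups = execMask.size() / 4;  // 16 groups of 4 lanes
    int accesses = 0;

    for (int i = 0; i < laneGroups; ++i) {
        if (mask & 0xF)
            accesses += dwords;
        mask >>= 4;
    }
    return accesses;
}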
- bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; - virtual void updateEvents() { } - virtual void updateResources(Wavefront *w, GPUDynInstPtr ii); - - virtual bool - isReadConflict(int memWfId, int exeWfId) const + // Read a register that is not writeable (e.g., src operand) + const VecRegContainer& + read(int regIdx) const { - return false; + return regFile[regIdx]; } - virtual bool - isWriteConflict(int memWfId, int exeWfId) const + // Write a register + void + write(int regIdx, const VecRegContainer &value) { - return false; + regFile[regIdx] = value; } - virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, - GPUDynInstPtr ii, - VrfAccessType accessType); - - virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, - VrfAccessType accessType); - - SimplePoolManager *manager; - - protected: - ComputeUnit* computeUnit; - int simdId; - - // flag indicating if a register is busy - std::vector busy; - // flag indicating if a register will be busy (by instructions - // in the SIMD pipeline) - std::vector nxtBusy; - - // numer of registers (bank size) per simd unit (bank) - int numRegsPerSimd; + void + printReg(Wavefront *wf, int regIdx) const + { +#ifndef NDEBUG + const auto &vec_reg_cont = regFile[regIdx]; + auto vgpr = vec_reg_cont.as(); + + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + DPRINTF(GPUVRF, "WF[%d][%d]: WV[%d] v[%d][%d] = %#x\n", + wf->simdId, wf->wfSlotId, wf->wfDynId, regIdx, lane, + vgpr[lane]); + } + } +#endif + } - // vector register state - VecRegisterState *vgprState; + private: + std::vector regFile; }; #endif // __VECTOR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 46cce9ce8..c2c98ba0c 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -34,10 +34,13 @@ #include "gpu-compute/wavefront.hh" #include "debug/GPUExec.hh" +#include "debug/GPUInitAbi.hh" #include "debug/WavefrontStack.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/vector_register_file.hh" Wavefront* @@ -47,16 +50,18 @@ WavefrontParams::create() } Wavefront::Wavefront(const Params *p) - : SimObject(p), callArgMem(nullptr), _gpuISA() + : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId), + maxIbSize(p->max_ib_size), _gpuISA(*this), + vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1) { lastTrace = 0; - simdId = p->simdId; - wfSlotId = p->wf_slot_id; + execUnitId = -1; status = S_STOPPED; reservedVectorRegs = 0; + reservedScalarRegs = 0; startVgprIndex = 0; + startSgprIndex = 0; outstandingReqs = 0; - memReqsInPipe = 0; outstandingReqsWrGm = 0; outstandingReqsWrLm = 0; outstandingReqsRdGm = 0; @@ -65,28 +70,38 @@ Wavefront::Wavefront(const Params *p) rdGmReqsInPipe = 0; wrLmReqsInPipe = 0; wrGmReqsInPipe = 0; - + scalarRdGmReqsInPipe = 0; + scalarWrGmReqsInPipe = 0; + scalarOutstandingReqsRdGm = 0; + scalarOutstandingReqsWrGm = 0; + lastNonIdleTick = 0; barrierCnt = 0; oldBarrierCnt = 0; stalledAtBarrier = false; + ldsChunk = nullptr; memTraceBusy = 0; oldVgprTcnt = 0xffffffffffffffffll; oldDgprTcnt = 0xffffffffffffffffll; - oldVgpr.resize(p->wfSize); + oldVgpr.resize(p->wf_size); pendingFetch = false; dropFetch = false; - condRegState = new ConditionRegisterState(); - maxSpVgprs = 0; - maxDpVgprs = 0; - lastAddr.resize(p->wfSize); - 
workItemFlatId.resize(p->wfSize); - oldDgpr.resize(p->wfSize); - barCnt.resize(p->wfSize); + maxVgprs = 0; + maxSgprs = 0; + + lastAddr.resize(p->wf_size); + workItemFlatId.resize(p->wf_size); + oldDgpr.resize(p->wf_size); + barCnt.resize(p->wf_size); for (int i = 0; i < 3; ++i) { - workItemId[i].resize(p->wfSize); + workItemId[i].resize(p->wf_size); } + + _execMask.set(); + rawDist.clear(); + lastInstExec = 0; + vecReads.clear(); } void @@ -94,19 +109,6 @@ Wavefront::regStats() { SimObject::regStats(); - srcRegOpDist - .init(0, 4, 2) - .name(name() + ".src_reg_operand_dist") - .desc("number of executed instructions with N source register operands") - ; - - dstRegOpDist - .init(0, 3, 2) - .name(name() + ".dst_reg_operand_dist") - .desc("number of executed instructions with N destination register " - "operands") - ; - // FIXME: the name of the WF needs to be unique numTimesBlockedDueWAXDependencies .name(name() + ".timesBlockedDueWAXDependencies") @@ -121,11 +123,53 @@ Wavefront::regStats() "dependencies") ; - // FIXME: the name of the WF needs to be unique - numTimesBlockedDueVrfPortAvail - .name(name() + ".timesBlockedDueVrfPortAvail") - .desc("number of times instructions are blocked due to VRF port " - "availability") + numInstrExecuted + .name(name() + ".num_instr_executed") + .desc("number of instructions executed by this WF slot") + ; + + schCycles + .name(name() + ".sch_cycles") + .desc("number of cycles spent in schedule stage") + ; + + schStalls + .name(name() + ".sch_stalls") + .desc("number of cycles WF is stalled in SCH stage") + ; + + schRfAccessStalls + .name(name() + ".sch_rf_access_stalls") + .desc("number of cycles wave selected in SCH but RF denied adding " + "instruction") + ; + + schResourceStalls + .name(name() + ".sch_resource_stalls") + .desc("number of cycles stalled in sch by resource not available") + ; + + schOpdNrdyStalls + .name(name() + ".sch_opd_nrdy_stalls") + .desc("number of cycles stalled in sch waiting for RF reads to " + "complete") + ; + + schLdsArbStalls + .name(name() + ".sch_lds_arb_stalls") + .desc("number of cycles wave stalled due to LDS-VRF arbitration") + ; + + vecRawDistance + .init(0,20,1) + .name(name() + ".vec_raw_distance") + .desc("Count of RAW distance in dynamic instructions for this WF") + ; + + readsPerWrite + .init(0,4,1) + .name(name() + ".vec_reads_per_write") + .desc("Count of Vector reads per write for this WF") ; } @@ -133,37 +177,473 @@ void Wavefront::init() { reservedVectorRegs = 0; + reservedScalarRegs = 0; startVgprIndex = 0; + startSgprIndex = 0; + + scalarAlu = computeUnit->mapWaveToScalarAlu(this); + scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this); + globalMem = computeUnit->mapWaveToGlobalMem(this); + localMem = computeUnit->mapWaveToLocalMem(this); + scalarMem = computeUnit->mapWaveToScalarMem(this); +} + +void +Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) +{ + int regInitIdx = 0; + + // iterate over all the init fields and check which + // bits are enabled + for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) { + + if (task->sgprBitEnabled(en_bit)) { + int physSgprIdx = 0; + uint32_t wiCount = 0; + uint32_t firstWave = 0; + int orderedAppendTerm = 0; + int numWfsInWg = 0; + uint32_t finalValue = 0; + Addr host_disp_pkt_addr = task->hostDispPktAddr(); + Addr kernarg_addr = task->kernargAddr(); + Addr hidden_priv_base(0); + + switch (en_bit) { + case PrivateSegBuf: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + 
computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[1]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[1]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[2]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[2]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[3]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[3]); + break; + case DispatchPtr: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting DispatchPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[1]); + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting DispatchPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[1]); + + ++regInitIdx; + break; + case QueuePtr: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting QueuePtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[1]); + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting QueuePtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[1]); + + ++regInitIdx; + break; + case KernargSegPtr: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&kernarg_addr)[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting KernargSegPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)kernarg_addr)[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, 
regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&kernarg_addr)[1]); + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting KernargSegPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)kernarg_addr)[1]); + + ++regInitIdx; + break; + case FlatScratchInit: + physSgprIdx + = computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + (TheGpuISA::ScalarRegU32)(task->amdQueue + .scratch_backing_memory_location & 0xffffffff)); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting FlatScratch Addr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + (TheGpuISA::ScalarRegU32)(task->amdQueue + .scratch_backing_memory_location & 0xffffffff)); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + // This value should be sizeof(DWORD) aligned, that is + // 4 byte aligned + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_workitem_byte_size); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting FlatScratch size: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_workitem_byte_size); + /** + * Since flat scratch init is needed for this kernel, this + * kernel is going to have flat memory instructions and we + * need to initialize the hidden private base for this queue. + * scratch_resource_descriptor[0] has this queue's scratch + * base address. scratch_backing_memory_location has the + * offset to this queue's scratch base address from the + * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this + * queue's scratch base address for address calculation + * (stored in scratch_resource_descriptor[0]). But that + * address calculation should be done by first finding the + * queue's scratch base address using the calculation + * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize + * SH_HIDDEN_PRIVATE_BASE_VMID.
+ * + * For more details see: + * http://rocm-documentation.readthedocs.io/en/latest/ + * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch + * + * https://github.com/ROCm-Developer-Tools/ + * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md + * #flat-addressing + */ + hidden_priv_base = + (uint64_t)task->amdQueue.scratch_resource_descriptor[0] | + (((uint64_t)task->amdQueue.scratch_resource_descriptor[1] + & 0x000000000000ffff) << 32); + computeUnit->shader->initShHiddenPrivateBase( + hidden_priv_base, + task->amdQueue.scratch_backing_memory_location); + break; + case GridWorkgroupCountX: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + wiCount = ((task->gridSize(0) + + task->wgSize(0) - 1) / + task->wgSize(0)); + computeUnit->srf[simdId]->write(physSgprIdx, wiCount); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting num WG X: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, wiCount); + break; + case GridWorkgroupCountY: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + wiCount = ((task->gridSize(1) + + task->wgSize(1) - 1) / + task->wgSize(1)); + computeUnit->srf[simdId]->write(physSgprIdx, wiCount); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting num WG Y: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, wiCount); + break; + case GridWorkgroupCountZ: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + wiCount = ((task->gridSize(2) + + task->wgSize(2) - 1) / + task->wgSize(2)); + computeUnit->srf[simdId]->write(physSgprIdx, wiCount); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting num WG Z: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, wiCount); + break; + case WorkgroupIdX: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + workGroupId[0]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG ID X: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, workGroupId[0]); + break; + case WorkgroupIdY: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + workGroupId[1]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG ID Y: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, workGroupId[1]); + break; + case WorkgroupIdZ: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + workGroupId[2]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG ID Z: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, workGroupId[2]); + break; + case PrivSegWaveByteOffset: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + /** + * the compute_tmpring_size_wavesize specifies the number of + * kB allocated per wavefront, hence the multiplication by + * 1024. + * + * to get the per wavefront offset into the scratch + * memory, we also multiply this by the wfId. the wfId stored + * in the Wavefront class, however, is the wave ID within the + * WG, whereas here we need the global WFID because the + * scratch space will be divided amongst all waves in the + * kernel. 
to get the global ID we multiply the WGID by + * the WG size, then add the WFID of the wave within its WG. + */ + computeUnit->srf[simdId]->write(physSgprIdx, 1024 * + (wgId * (wgSz / 64) + wfId) * + task->amdQueue.compute_tmpring_size_wavesize); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting Private Seg Offset: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + 1024 * (wgId * (wgSz / 64) + wfId) * + task->amdQueue.compute_tmpring_size_wavesize); + break; + case WorkgroupInfo: + firstWave = (wfId == 0) ? 1 : 0; + numWfsInWg = divCeil(wgSizeInWorkItems, + computeUnit->wfSize()); + finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1); + finalValue |= (orderedAppendTerm << 6); + finalValue |= numWfsInWg; + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]-> + write(physSgprIdx, finalValue); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG Info: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, finalValue); + break; + default: + fatal("SGPR enable bit %i not supported\n", en_bit); + break; + } + } + } + + regInitIdx = 0; + + // iterate over all the init fields and check which + // bits are enabled + for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) { + if (task->vgprBitEnabled(en_bit)) { + uint32_t physVgprIdx = 0; + TheGpuISA::VecRegContainerU32 raw_vgpr; + + switch (en_bit) { + case WorkitemIdX: + { + physVgprIdx = computeUnit->registerManager + ->mapVgpr(this, regInitIdx); + TheGpuISA::VecRegU32 vgpr_x + = raw_vgpr.as(); + + for (int lane = 0; lane < workItemId[0].size(); ++lane) { + vgpr_x[lane] = workItemId[0][lane]; + } + + computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); + rawDist[regInitIdx] = 0; + ++regInitIdx; + } + break; + case WorkitemIdY: + { + physVgprIdx = computeUnit->registerManager + ->mapVgpr(this, regInitIdx); + TheGpuISA::VecRegU32 vgpr_y + = raw_vgpr.as(); + + for (int lane = 0; lane < workItemId[1].size(); ++lane) { + vgpr_y[lane] = workItemId[1][lane]; + } + + computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); + rawDist[regInitIdx] = 0; + ++regInitIdx; + } + break; + case WorkitemIdZ: + { + physVgprIdx = computeUnit->registerManager-> + mapVgpr(this, regInitIdx); + TheGpuISA::VecRegU32 vgpr_z + = raw_vgpr.as(); + + for (int lane = 0; lane < workItemId[2].size(); ++lane) { + vgpr_z[lane] = workItemId[2][lane]; + } + + computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); + rawDist[regInitIdx] = 0; + ++regInitIdx; + } + break; + } + } + } } void -Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) +Wavefront::resizeRegFiles(int num_vregs, int num_sregs) { - condRegState->init(num_cregs); - maxSpVgprs = num_sregs; - maxDpVgprs = num_dregs; + maxVgprs = num_vregs; + maxSgprs = num_sregs; } Wavefront::~Wavefront() { - if (callArgMem) - delete callArgMem; - delete condRegState; } void -Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) +Wavefront::setStatus(status_e newStatus) +{ + if (computeUnit->idleCUTimeout > 0) { + // Wavefront's status transitions to stalled or stopped + if ((newStatus == S_STOPPED || newStatus == S_STALLED || + newStatus == S_WAITCNT) && + (status != newStatus)) { + computeUnit->idleWfs++; + assert(computeUnit->idleWfs <= + (computeUnit->shader->n_wf * computeUnit->numVectorALUs)); + if (computeUnit->idleWfs == + (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) { + lastNonIdleTick = curTick(); + } + // Wavefront's 
status transitions to an active state (from + // a stopped or stalled state) + } else if ((status == S_STOPPED || status == S_STALLED || + status == S_WAITCNT) && + (status != newStatus)) { + // if all WFs in the CU were idle then check if the idleness + // period exceeded the timeout threshold + if (computeUnit->idleWfs == + (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) { + panic_if((curTick() - lastNonIdleTick) >= + computeUnit->idleCUTimeout, + "CU%d has been idle for %d ticks at tick %d", + computeUnit->cu_id, computeUnit->idleCUTimeout, + curTick()); + } + computeUnit->idleWfs--; + assert(computeUnit->idleWfs >= 0); + } + } + status = newStatus; +} + +void +Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc) { wfDynId = _wf_dyn_id; - basePtr = _base_ptr; + _pc = init_pc; + status = S_RUNNING; + + vecReads.resize(maxVgprs, 0); } bool Wavefront::isGmInstruction(GPUDynInstPtr ii) { - if (ii->isGlobalMem() || ii->isFlat()) + if (ii->isGlobalMem() || + (ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) { return true; + } return false; } @@ -171,7 +651,40 @@ Wavefront::isGmInstruction(GPUDynInstPtr ii) bool Wavefront::isLmInstruction(GPUDynInstPtr ii) { - if (ii->isLocalMem()) { + if (ii->isLocalMem() || + (ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstWaitcnt() +{ + if (instructionBuffer.empty()) + return false; + + GPUDynInstPtr ii = instructionBuffer.front(); + + if (ii->isWaitcnt()) { + // waitcnt is a scalar + assert(ii->isScalar()); + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstScalarALU() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn() + || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -179,14 +692,14 @@ Wavefront::isLmInstruction(GPUDynInstPtr ii) } bool -Wavefront::isOldestInstALU() +Wavefront::isOldestInstVectorALU() { assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (ii->isNop() || - ii->isReturn() || ii->isBranch() || - ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) { + if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() || + ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel() + || (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -212,7 +725,20 @@ Wavefront::isOldestInstGMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && ii->isGlobalMem()) { + if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstScalarMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) { return true; } @@ -258,15 +784,13 @@ Wavefront::isOldestInstFlatMem() return false; } -// Return true if the Wavefront's instruction -// buffer has branch instruction. bool -Wavefront::instructionBufferHasBranch() +Wavefront::stopFetch() { for (auto it : instructionBuffer) { GPUDynInstPtr ii = it; - - if (ii->isReturn() || ii->isBranch()) { + if (ii->isReturn() || ii->isBranch() || + ii->isEndOfKernel()) { return true; } } @@ -274,377 +798,125 @@ Wavefront::instructionBufferHasBranch() return false; } -// Remap HSAIL register to physical VGPR. 
-// HSAIL register = virtual register assigned to an operand by HLC compiler -uint32_t -Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) +void +Wavefront::freeResources() { - assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); - // add the offset from where the VGPRs of the wavefront have been assigned - uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; - // HSAIL double precision (DP) register: calculate the physical VGPR index - // assuming that DP registers are placed after SP ones in the VRF. The DP - // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust - // the DP VGPR index before mapping it to the physical VRF address space - if (mode == 1 && size > 4) { - physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); - } - - assert((startVgprIndex <= physicalVgprIndex) && - (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); - - // calculate absolute physical VGPR index - return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); + execUnitId = -1; } -// Return true if this wavefront is ready -// to execute an instruction of the specified type. -int -Wavefront::ready(itype_e type) +void Wavefront::validateRequestCounters() { - // Check to make sure wave is running - if (status == S_STOPPED || status == S_RETURNING || - instructionBuffer.empty()) { - return 0; - } - - // Is the wave waiting at a barrier - if (stalledAtBarrier) { - if (!computeUnit->AllAtBarrier(barrierId,barrierCnt, - computeUnit->getRefCounter(dispatchId, wgId))) { - // Are all threads at barrier? - return 0; - } - oldBarrierCnt = barrierCnt; - stalledAtBarrier = false; - } - - // Read instruction - GPUDynInstPtr ii = instructionBuffer.front(); + panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 || + wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 || + outstandingReqs < 0, + "Negative requests in pipe for WF%d for slot%d" + " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d," + " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d," + " Outstanding Reqs=%d\n", + wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe, + rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs); +} - bool ready_inst M5_VAR_USED = false; - bool glbMemBusRdy = false; - bool glbMemIssueRdy = false; - if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { - for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { - if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) - glbMemBusRdy = true; - if (computeUnit->wfWait[j].prerdy()) - glbMemIssueRdy = true; +void +Wavefront::reserveGmResource(GPUDynInstPtr ii) +{ + if (!ii->isScalar()) { + if (ii->isLoad()) { + rdGmReqsInPipe++; + } else if (ii->isStore()) { + wrGmReqsInPipe++; + } else if (ii->isAtomic() || ii->isMemSync()) { + rdGmReqsInPipe++; + wrGmReqsInPipe++; + } else { + panic("Invalid memory operation!\n"); } - } - bool locMemBusRdy = false; - bool locMemIssueRdy = false; - if (type == I_SHARED || type == I_FLAT) { - for (int j=0; j < computeUnit->numLocMemUnits; ++j) { - if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) - locMemBusRdy = true; - if (computeUnit->wfWait[j].prerdy()) - locMemIssueRdy = true; + execUnitId = globalMem; + } else { + if (ii->isLoad()) { + scalarRdGmReqsInPipe++; + } else if (ii->isStore()) { + scalarWrGmReqsInPipe++; + } else if (ii->isAtomic() || ii->isMemSync()) { + scalarWrGmReqsInPipe++; + scalarRdGmReqsInPipe++; + } else { + panic("Invalid memory operation!\n"); } + execUnitId = scalarMem; } +} - // The following code is very error prone and the entire process for - // checking 
readiness will be fixed eventually. In the meantime, let's - // make sure that we do not silently let an instruction type slip - // through this logic and always return not ready. - if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || - ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || - ii->isMemFence() || ii->isFlat())) { - panic("next instruction: %s is of unknown type\n", ii->disassemble()); - } - - DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", - computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); - - if (type == I_ALU && ii->isBarrier()) { - // Here for ALU instruction (barrier) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is wave slot free? - return 0; - } - - // Are there in pipe or outstanding memory requests? - if ((outstandingReqs + memReqsInPipe) > 0) { - return 0; - } - - ready_inst = true; - } else if (type == I_ALU && ii->isNop()) { - // Here for ALU instruction (nop) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is wave slot free? - return 0; - } - - ready_inst = true; - } else if (type == I_ALU && ii->isReturn()) { - // Here for ALU instruction (return) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is wave slot free? - return 0; - } - - // Are there in pipe or outstanding memory requests? - if ((outstandingReqs + memReqsInPipe) > 0) { - return 0; - } - - ready_inst = true; - } else if (type == I_ALU && (ii->isBranch() || - ii->isALU() || - (ii->isKernArgSeg() && ii->isLoad()) || - ii->isArgSeg())) { - // Here for ALU instruction (all others) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is alu slot free? - return 0; - } - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_GLOBAL && ii->isGlobalMem()) { - // Here Global memory instruction - if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { - // Are there in pipe or outstanding global memory write requests? - if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { - return 0; - } - } - - if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { - // Are there in pipe or outstanding global memory read requests? - if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) - return 0; - } - - if (!glbMemIssueRdy) { - // Is WV issue slot free? - return 0; - } - - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - // Does the coalescer have space for our instruction? - if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_SHARED && ii->isLocalMem()) { - // Here for Shared memory instruction - if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { - if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { - return 0; - } - } - - if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { - if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { - return 0; - } - } - - if (!locMemBusRdy) { - // Is there an available VRF->LDS read bus? 
- return 0; - } - if (!locMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!computeUnit->localMemoryPipe. - isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { - // Can we insert a new request to the LDS Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_FLAT && ii->isFlat()) { - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - if (!locMemBusRdy) { - // Is there an available VRF->LDS read bus? - return 0; - } - - if (!glbMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!locMemIssueRdy) { - return 0; - } - - // Does the coalescer have space for our instruction? - if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - - if (!computeUnit->localMemoryPipe. - isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { - // Can we insert a new request to the LDS Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - // are all the operands ready? (RAW, WAW and WAR depedencies met?) - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; +void +Wavefront::reserveLmResource(GPUDynInstPtr ii) +{ + fatal_if(ii->isScalar(), + "Scalar instructions can not access Shared memory!!!"); + if (ii->isLoad()) { + rdLmReqsInPipe++; + } else if (ii->isStore()) { + wrLmReqsInPipe++; + } else if (ii->isAtomic() || ii->isMemSync()) { + wrLmReqsInPipe++; + rdLmReqsInPipe++; } else { - return 0; + panic("Invalid memory operation!\n"); } - - assert(ready_inst); - - DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, - simdId, wfSlotId, ii->disassemble()); - return 1; + execUnitId = localMem; } -void -Wavefront::updateResources() +std::vector +Wavefront::reserveResources() { + // vector of execution unit IDs to return to schedule stage + // this return is only used for debugging and an assertion... + std::vector execUnitIds; + // Get current instruction GPUDynInstPtr ii = instructionBuffer.front(); assert(ii); - computeUnit->vrf[simdId]->updateResources(this, ii); + // Single precision ALU or Branch or Return or Special instruction if (ii->isALU() || ii->isSpecialOp() || - ii->isBranch() || - // FIXME: Kernel argument loads are currently treated as ALU operations - // since we don't send memory packets at execution. 
If we fix that then - // we should map them to one of the memory pipelines + ii->isBranch() || ii->isNop() || (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() || - ii->isReturn()) { - computeUnit->aluPipe[simdId].preset(computeUnit->shader-> - ticks(computeUnit->spBypassLength())); - // this is to enforce a fixed number of cycles per issue slot per SIMD - computeUnit->wfWait[simdId].preset(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); - } else if (ii->isBarrier()) { - computeUnit->wfWait[simdId].preset(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - memReqsInPipe++; - rdGmReqsInPipe++; - if ( Enums::SC_SHARED == ii->executedAs() ) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } - } else if (ii->isStore() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - memReqsInPipe++; - wrGmReqsInPipe++; - if (Enums::SC_SHARED == ii->executedAs()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + ii->isReturn() || ii->isEndOfKernel()) { + if (!ii->isScalar()) { + execUnitId = simdId; } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + execUnitId = scalarAluGlobalIdx; } - } else if (ii->isLoad() && ii->isGlobalMem()) { - memReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isGlobalMem()) { - memReqsInPipe++; - wrGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { - memReqsInPipe++; - wrGmReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isLocalMem()) { - memReqsInPipe++; - rdLmReqsInPipe++; - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isLocalMem()) { - memReqsInPipe++; - wrLmReqsInPipe++; - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { - memReqsInPipe++; - wrLmReqsInPipe++; - rdLmReqsInPipe++; - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + // this is to enforce a fixed number of cycles per issue slot per SIMD + } else if (ii->isBarrier()) { + execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId; + } else if (ii->isFlat()) { + assert(!ii->isScalar()); + reserveLmResource(ii); + // add execUnitId, reserved by reserveLmResource, list before it is + // overwriten by reserveGmResource + execUnitIds.push_back(execUnitId); + flatLmUnitId = execUnitId; + reserveGmResource(ii); + flatGmUnitId = execUnitId; + execUnitIds.push_back(flatGmUnitId); + execUnitId = -1; + } else if (ii->isGlobalMem()) { + reserveGmResource(ii); + } else if (ii->isLocalMem()) { + reserveLmResource(ii); + } else if (ii->isPrivateSeg()) { + fatal_if(ii->isScalar(), + "Scalar instructions can not access Private memory!!!"); + reserveGmResource(ii); + } else { + panic("reserveResources -> Couldn't process op!\n"); + } + + if (execUnitId != -1) { + execUnitIds.push_back(execUnitId); } + assert(execUnitIds.size()); + return execUnitIds; } void @@ -653,49 +925,171 @@ Wavefront::exec() // ---- Exit if wavefront is inactive ----------------------------- // if (status == S_STOPPED || status == S_RETURNING || - instructionBuffer.empty()) { + status==S_STALLED || instructionBuffer.empty()) { return; } + if (status == S_WAITCNT) { + /** + * if this wave is in S_WAITCNT state, then + * it should enter exec() precisely one time + * before the waitcnts are satisfied, in order + * to execute the waitcnt instruction itself + * thus we assert that the waitcnt is the + * oldest instruction. 
if we enter exec() with + * active waitcnts, and we're not executing + * the waitcnt instruction, something must be + * wrong + */ + assert(isOldestInstWaitcnt()); + } + // Get current instruction GPUDynInstPtr ii = instructionBuffer.front(); - const uint32_t old_pc = pc(); + const Addr old_pc = pc(); DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " - "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, - ii->disassemble(), old_pc); - - // update the instruction stats in the CU + "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId, + wfDynId, ii->disassemble(), old_pc, ii->seqNum()); ii->execute(ii); + // delete the dynamic instruction from the pipeline map + computeUnit->deleteFromPipeMap(this); + // update the instruction stats in the CU computeUnit->updateInstStats(ii); - // access the VRF - computeUnit->vrf[simdId]->exec(ii, this); - srcRegOpDist.sample(ii->numSrcRegOperands()); - dstRegOpDist.sample(ii->numDstRegOperands()); + + // inform VRF of instruction execution to schedule write-back + // and scoreboard ready for registers + if (!ii->isScalar()) { + computeUnit->vrf[simdId]->waveExecuteInst(this, ii); + } + computeUnit->srf[simdId]->waveExecuteInst(this, ii); + + computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++; + computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++; computeUnit->numInstrExecuted++; + numInstrExecuted++; + computeUnit->instExecPerSimd[simdId]++; computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - computeUnit->lastExecCycle[simdId]); computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); - if (pc() == old_pc) { - uint32_t new_pc = _gpuISA.advancePC(old_pc, ii); - // PC not modified by instruction, proceed to next or pop frame - pc(new_pc); - if (new_pc == rpc()) { - popFromReconvergenceStack(); - discardFetch(); - } else { - instructionBuffer.pop_front(); + + if (lastInstExec) { + computeUnit->instInterleave[simdId]. + sample(computeUnit->instExecPerSimd[simdId] - lastInstExec); + } + lastInstExec = computeUnit->instExecPerSimd[simdId]; + + // want to track: + // number of reads that occur per value written + + // vector RAW dependency tracking + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isVectorRegister(i)) { + int vgpr = ii->getRegisterIndex(i, ii); + int nReg = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + for (int n = 0; n < nReg; n++) { + if (ii->isSrcOperand(i)) { + // This check should never fail, but to be safe we check + if (rawDist.find(vgpr+n) != rawDist.end()) { + vecRawDistance. 
+ sample(numInstrExecuted.value() - rawDist[vgpr+n]); + } + // increment number of reads to this register + vecReads[vgpr+n]++; + } else if (ii->isDstOperand(i)) { + // rawDist is set on writes, but will not be set + // for the first write to each physical register + if (rawDist.find(vgpr+n) != rawDist.end()) { + // sample the number of reads that were performed + readsPerWrite.sample(vecReads[vgpr+n]); + } + // on a write, reset count of reads to 0 + vecReads[vgpr+n] = 0; + + rawDist[vgpr+n] = numInstrExecuted.value(); + } + } } + } + + if (pc() == old_pc) { + // PC not modified by instruction, proceed to next + _gpuISA.advancePC(ii); + instructionBuffer.pop_front(); } else { + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + ii->disassemble()); discardFetch(); } + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc()); if (computeUnit->shader->hsail_mode==Shader::SIMT) { const int num_active_lanes = execMask().count(); computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); computeUnit->numVecOpsExecuted += num_active_lanes; + + if (ii->isF16() && ii->isALU()) { + if (ii->isF32() || ii->isF64()) { + fatal("Instruction is tagged as both (1) F16, and (2)" + "either F32 or F64."); + } + computeUnit->numVecOpsExecutedF16 += num_active_lanes; + if (ii->isFMA()) { + computeUnit->numVecOpsExecutedFMA16 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAC()) { + computeUnit->numVecOpsExecutedMAC16 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAD()) { + computeUnit->numVecOpsExecutedMAD16 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + } + if (ii->isF32() && ii->isALU()) { + if (ii->isF16() || ii->isF64()) { + fatal("Instruction is tagged as both (1) F32, and (2)" + "either F16 or F64."); + } + computeUnit->numVecOpsExecutedF32 += num_active_lanes; + if (ii->isFMA()) { + computeUnit->numVecOpsExecutedFMA32 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAC()) { + computeUnit->numVecOpsExecutedMAC32 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAD()) { + computeUnit->numVecOpsExecutedMAD32 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + } + if (ii->isF64() && ii->isALU()) { + if (ii->isF16() || ii->isF32()) { + fatal("Instruction is tagged as both (1) F64, and (2)" + "either F16 or F32."); + } + computeUnit->numVecOpsExecutedF64 += num_active_lanes; + if (ii->isFMA()) { + computeUnit->numVecOpsExecutedFMA64 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAC()) { + computeUnit->numVecOpsExecutedMAC64 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAD()) { + computeUnit->numVecOpsExecutedMAD64 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + } if (isGmInstruction(ii)) { computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); } else if (isLmInstruction(ii)) { @@ -703,82 +1097,120 @@ Wavefront::exec() } } - // ---- Update Vector ALU pipeline and other resources ------------------ // + /** + * we return here to avoid spurious errors related to flat insts + * and their address segment resolution. 
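+     * (such an instruction still holds a coalescer token, so we hand the
+     * token back via recvTokens(1) before bailing out)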
+ */ + if (execMask().none() && ii->isFlat()) { + computeUnit->getTokenManager()->recvTokens(1); + return; + } + + // Update Vector ALU pipeline and other resources + bool flat_as_gm = false; + bool flat_as_lm = false; + if (ii->isFlat()) { + flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) || + (ii->executedAs() == Enums::SC_PRIVATE); + flat_as_lm = (ii->executedAs() == Enums::SC_GROUP); + } + // Single precision ALU or Branch or Return or Special instruction + // Note, we use the same timing regardless of SP or DP ALU operation. if (ii->isALU() || ii->isSpecialOp() || - ii->isBranch() || - // FIXME: Kernel argument loads are currently treated as ALU operations - // since we don't send memory packets at execution. If we fix that then - // we should map them to one of the memory pipelines + ii->isBranch() || ii->isNop() || (ii->isKernArgSeg() && ii->isLoad()) || - ii->isArgSeg() || - ii->isReturn()) { - computeUnit->aluPipe[simdId].set(computeUnit->shader-> - ticks(computeUnit->spBypassLength())); - + ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) { // this is to enforce a fixed number of cycles per issue slot per SIMD - computeUnit->wfWait[simdId].set(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); + if (!ii->isScalar()) { + computeUnit->vectorALUs[simdId].set(computeUnit-> + cyclesToTicks(computeUnit->issuePeriod)); + } else { + computeUnit->scalarALUs[scalarAlu].set(computeUnit-> + cyclesToTicks(computeUnit->issuePeriod)); + } + // Barrier on Scalar ALU } else if (ii->isBarrier()) { - computeUnit->wfWait[simdId].set(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - - if (Enums::SC_SHARED == ii->executedAs()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + computeUnit->scalarALUs[scalarAlu].set(computeUnit-> + cyclesToTicks(computeUnit->issuePeriod)); + // GM or Flat as GM Load + } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) { + if (!ii->isScalar()) { + computeUnit->vrfToGlobalMemPipeBus.set( + computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency)); + computeUnit->vectorGlobalMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesVMemPerSimd[simdId] += + computeUnit->vrf_gm_bus_latency; } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + computeUnit->srfToScalarMemPipeBus.set(computeUnit-> + cyclesToTicks(computeUnit->srf_scm_bus_latency)); + computeUnit->scalarMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesScMemPerSimd[simdId] += + computeUnit->srf_scm_bus_latency; } - } else if (ii->isStore() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - if (Enums::SC_SHARED == ii->executedAs()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
- set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + // GM or Flat as GM Store + } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) { + if (!ii->isScalar()) { + computeUnit->vrfToGlobalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency))); + computeUnit->vectorGlobalMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesVMemPerSimd[simdId] += + (2 * computeUnit->vrf_gm_bus_latency); } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + computeUnit->srfToScalarMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency))); + computeUnit->scalarMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesScMemPerSimd[simdId] += + (2 * computeUnit->srf_scm_bus_latency); } - } else if (ii->isLoad() && ii->isGlobalMem()) { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isGlobalMem()) { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isLocalMem()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isLocalMem()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if ((ii->isAtomic() || ii->isMemSync()) && + (ii->isGlobalMem() || flat_as_gm)) { + if (!ii->isScalar()) { + computeUnit->vrfToGlobalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency))); + computeUnit->vectorGlobalMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesVMemPerSimd[simdId] += + (2 * computeUnit->vrf_gm_bus_latency); + } else { + computeUnit->srfToScalarMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency))); + computeUnit->scalarMemUnit. 
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesScMemPerSimd[simdId] += + (2 * computeUnit->srf_scm_bus_latency); + } + // LM or Flat as LM Load + } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) { + computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> + cyclesToTicks(computeUnit->vrf_lm_bus_latency)); + computeUnit->vectorSharedMemUnit. + set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesLdsPerSimd[simdId] += + computeUnit->vrf_lm_bus_latency; + // LM or Flat as LM Store + } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) { + computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency))); + computeUnit->vectorSharedMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesLdsPerSimd[simdId] += + (2 * computeUnit->vrf_lm_bus_latency); + // LM or Flat as LM, Atomic or MemFence + } else if ((ii->isAtomic() || ii->isMemSync()) && + (ii->isLocalMem() || flat_as_lm)) { + computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency))); + computeUnit->vectorSharedMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesLdsPerSimd[simdId] += + (2 * computeUnit->vrf_lm_bus_latency); + } else { + panic("Bad instruction type!\n"); } } @@ -788,212 +1220,197 @@ Wavefront::waitingAtBarrier(int lane) return barCnt[lane] < maxBarCnt; } -void -Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, - const VectorMask& mask) +GPUDynInstPtr +Wavefront::nextInstr() { - assert(mask.count()); - reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask}); + // Read next instruction from instruction buffer + GPUDynInstPtr ii = instructionBuffer.front(); + // if the WF has been dispatched in the schedule stage then + // check the next oldest instruction for readiness + if (computeUnit->pipeMap.find(ii->seqNum()) != + computeUnit->pipeMap.end()) { + if (instructionBuffer.size() > 1) { + auto it = instructionBuffer.begin() + 1; + return *it; + } else { // No new instructions to check + return nullptr; + } + } + return ii; } void -Wavefront::popFromReconvergenceStack() +Wavefront::discardFetch() { - assert(!reconvergenceStack.empty()); + instructionBuffer.clear(); + dropFetch |= pendingFetch; - DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", - computeUnit->cu_id, simdId, wfSlotId, wfDynId, - execMask().to_string().c_str(), pc()); + /** + * clear the fetch buffer for this wave in order to + * remove any stale inst data + */ + computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId); +} - reconvergenceStack.pop_back(); +bool +Wavefront::waitCntsSatisfied() +{ + // Both vmWaitCnt && lgkmWaitCnt uninitialized means + // waitCnt instruction has been dispatched but not executed yet: next + // instruction should be blocked until waitCnt is executed. + if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) { + return false; + } - DPRINTF(WavefrontStack, "%3i %s\n", pc(), - execMask().to_string().c_str()); + // If we reach here, that means waitCnt instruction is executed and + // the waitcnts are set by the execute method. Check if waitcnts are + // satisfied. 
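+    // (e.g. an s_waitcnt vmcnt(0) sets vmWaitCnt = 0, so this wave may not
+    // resume until every outstanding vector memory read and write counted
+    // below has returned, i.e. until vm_cnt reaches 0)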
-} + // current number of vector memory ops in flight + int vm_cnt = outstandingReqsWrGm + outstandingReqsRdGm; -void -Wavefront::discardFetch() -{ - instructionBuffer.clear(); - dropFetch |=pendingFetch; -} + // current number of export insts or vector memory writes in flight + int exp_cnt = outstandingReqsWrGm; -uint32_t -Wavefront::pc() const -{ - return reconvergenceStack.back()->pc; + // current number of scalar/LDS memory ops in flight + // we do not consider GDS/message ops + int lgkm_cnt = outstandingReqsWrLm + outstandingReqsRdLm + + scalarOutstandingReqsRdGm + scalarOutstandingReqsWrGm; + + if (vmWaitCnt != -1) { + if (vm_cnt > vmWaitCnt) { + // vmWaitCnt not satisfied + return false; + } + } + + if (expWaitCnt != -1) { + if (exp_cnt > expWaitCnt) { + // expWaitCnt not satisfied + return false; + } + } + + if (lgkmWaitCnt != -1) { + if (lgkm_cnt > lgkmWaitCnt) { + // lgkmWaitCnt not satisfied + return false; + } + } + + // if we get here all outstanding waitcnts must + // be satisfied, so we resume normal operation + clearWaitCnts(); + + return true; } -uint32_t -Wavefront::rpc() const +void +Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt) { - return reconvergenceStack.back()->rpc; + // the scoreboard should have set the status + // to S_WAITCNT once a waitcnt instruction + // was marked as ready + assert(status == S_WAITCNT); + + // waitcnt instruction shouldn't be sending + // negative counts + assert(vm_wait_cnt >= 0); + assert(exp_wait_cnt >= 0); + assert(lgkm_wait_cnt >= 0); + // waitcnts are a max of 15 because we have + // only 1 nibble (4 bits) to set the counts + assert(vm_wait_cnt <= 0xf); + assert(exp_wait_cnt <= 0x7); + assert(lgkm_wait_cnt <= 0x1f); + + /** + * prior waitcnts should be satisfied, + * at which time the WF resets them + * back to -1, indicating they are no + * longer active + */ + assert(vmWaitCnt == -1); + assert(expWaitCnt == -1); + assert(lgkmWaitCnt == -1); + + /** + * if the instruction encoding + * indicates a waitcnt of 0xf, + * that means the waitcnt is + * not being used + */ + if (vm_wait_cnt != 0xf) + vmWaitCnt = vm_wait_cnt; + + if (exp_wait_cnt != 0x7) + expWaitCnt = exp_wait_cnt; + + if (lgkm_wait_cnt != 0x1f) + lgkmWaitCnt = lgkm_wait_cnt; } -VectorMask -Wavefront::execMask() const +void +Wavefront::clearWaitCnts() { - return reconvergenceStack.back()->execMask; + // reset the waitcnts back to + // -1, indicating they are no + // longer valid + vmWaitCnt = -1; + expWaitCnt = -1; + lgkmWaitCnt = -1; + + // resume running normally + status = S_RUNNING; } -bool -Wavefront::execMask(int lane) const +Addr +Wavefront::pc() const { - return reconvergenceStack.back()->execMask[lane]; + return _pc; } - void -Wavefront::pc(uint32_t new_pc) +Wavefront::pc(Addr new_pc) { - reconvergenceStack.back()->pc = new_pc; + _pc = new_pc; } -uint32_t -Wavefront::getStaticContextSize() const +VectorMask& +Wavefront::execMask() { - return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) + - sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) + - sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) + - sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) + - computeUnit->wfSize() * sizeof(ReconvergenceStackEntry); + return _execMask; } -void -Wavefront::getContext(const void *out) -{ - uint8_t *iter = (uint8_t *)out; - for (int i = 0; i < barCnt.size(); i++) { - *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]); - } - *(int *)iter = wfId; iter += sizeof(wfId); - *(int *)iter = maxBarCnt; iter += 
sizeof(maxBarCnt); - *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt); - *(int *)iter = barrierCnt; iter += sizeof(barrierCnt); - *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id); - *(uint32_t *)iter = wgId; iter += sizeof(wgId); - *(uint32_t *)iter = barrierId; iter += sizeof(barrierId); - *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong()); - *(Addr *)iter = privBase; iter += sizeof(privBase); - *(Addr *)iter = spillBase; iter += sizeof(spillBase); - - int stackSize = reconvergenceStack.size(); - ReconvergenceStackEntry empty = {std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max()}; - for (int i = 0; i < workItemId[0].size(); i++) { - if (i < stackSize) { - *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back(); - iter += sizeof(ReconvergenceStackEntry); - reconvergenceStack.pop_back(); - } else { - *(ReconvergenceStackEntry *)iter = empty; - iter += sizeof(ReconvergenceStackEntry); - } - } - - int wf_size = computeUnit->wfSize(); - for (int i = 0; i < maxSpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); - for (int lane = 0; lane < wf_size; lane++) { - uint32_t regVal = computeUnit->vrf[simdId]-> - read(vgprIdx,lane); - *(uint32_t *)iter = regVal; iter += sizeof(regVal); - } - } - - for (int i = 0; i < maxDpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); - for (int lane = 0; lane < wf_size; lane++) { - uint64_t regVal = computeUnit->vrf[simdId]-> - read(vgprIdx,lane); - *(uint64_t *)iter = regVal; iter += sizeof(regVal); - } - } - - for (int i = 0; i < condRegState->numRegs(); i++) { - for (int lane = 0; lane < wf_size; lane++) { - uint64_t regVal = condRegState->read(i, lane); - *(uint64_t *)iter = regVal; iter += sizeof(regVal); - } - } - - /* saving LDS content */ - if (ldsChunk) - for (int i = 0; i < ldsChunk->size(); i++) { - char val = ldsChunk->read(i); - *(char *) iter = val; iter += sizeof(val); - } +bool +Wavefront::execMask(int lane) const +{ + return _execMask[lane]; } void -Wavefront::setContext(const void *in) -{ - uint8_t *iter = (uint8_t *)in; - for (int i = 0; i < barCnt.size(); i++) { - barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]); - } - wfId = *(int *)iter; iter += sizeof(wfId); - maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt); - oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt); - barrierCnt = *(int *)iter; iter += sizeof(barrierCnt); - computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id); - wgId = *(uint32_t *)iter; iter += sizeof(wgId); - barrierId = *(uint32_t *)iter; iter += sizeof(barrierId); - initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask); - privBase = *(Addr *)iter; iter += sizeof(privBase); - spillBase = *(Addr *)iter; iter += sizeof(spillBase); - - for (int i = 0; i < workItemId[0].size(); i++) { - ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter; - iter += sizeof(ReconvergenceStackEntry); - if (newEntry.pc != std::numeric_limits::max()) { - pushToReconvergenceStack(newEntry.pc, newEntry.rpc, - newEntry.execMask); - } - } - int wf_size = computeUnit->wfSize(); - - for (int i = 0; i < maxSpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); - for (int lane = 0; lane < wf_size; lane++) { - uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal); - computeUnit->vrf[simdId]->write(vgprIdx, regVal, lane); - } - } - - for (int i = 0; i < maxDpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); - for (int lane = 0; lane < 
wf_size; lane++) { - uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal); - computeUnit->vrf[simdId]->write(vgprIdx, regVal, lane); - } +Wavefront::freeRegisterFile() +{ + /* clear busy registers */ + for (int i=0; i < maxVgprs; i++) { + int vgprIdx = computeUnit->registerManager->mapVgpr(this, i); + computeUnit->vrf[simdId]->markReg(vgprIdx, false); } - for (int i = 0; i < condRegState->numRegs(); i++) { - for (int lane = 0; lane < wf_size; lane++) { - uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal); - condRegState->write(i, lane, regVal); - } - } - /** Restoring LDS contents */ - if (ldsChunk) - for (int i = 0; i < ldsChunk->size(); i++) { - char val = *(char *) iter; iter += sizeof(val); - ldsChunk->write(i, val); - } + /* Free registers used by this wavefront */ + uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) % + computeUnit->vrf[simdId]->numRegs(); + computeUnit->registerManager->vrfPoolMgrs[simdId]-> + freeRegion(startVgprIndex, endIndex); } void -Wavefront::computeActualWgSz(NDRange *ndr) +Wavefront::computeActualWgSz(HSAQueueEntry *task) { actualWgSzTotal = 1; - for (int d = 0; d < 3; ++d) { - actualWgSz[d] = std::min(workGroupSz[d], - gridSz[d] - ndr->wgId[d] * workGroupSz[d]); + for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) { + actualWgSz[d] = std::min(workGroupSz[d], gridSz[d] + - task->wgId(d) * workGroupSz[d]); actualWgSzTotal *= actualWgSz[d]; } } diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 9e73f1060..451e5dfcb 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -31,161 +31,116 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __WAVEFRONT_HH__ -#define __WAVEFRONT_HH__ +#ifndef __GPU_COMPUTE_WAVEFRONT_HH__ +#define __GPU_COMPUTE_WAVEFRONT_HH__ #include #include +#include #include -#include +#include #include #include "arch/gpu_isa.hh" #include "base/logging.hh" #include "base/types.hh" #include "config/the_gpu_isa.hh" -#include "gpu-compute/condition_register_state.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/lds_state.hh" #include "gpu-compute/misc.hh" -#include "gpu-compute/ndrange.hh" #include "params/Wavefront.hh" #include "sim/sim_object.hh" -static const int MAX_NUM_INSTS_PER_WF = 12; - -/** - * A reconvergence stack entry conveys the necessary state to implement - * control flow divergence. - */ -struct ReconvergenceStackEntry { - /** - * PC of current instruction. - */ - uint32_t pc; - /** - * PC of the immediate post-dominator instruction, i.e., the value of - * @a pc for the first instruction that will be executed by the wavefront - * when a reconvergence point is reached. - */ - uint32_t rpc; - /** - * Execution mask. - */ - VectorMask execMask; -}; - -/* - * Arguments for the hsail opcode call, are user defined and variable length. - * The hardware/finalizer can support arguments in hardware or use memory to - * pass arguments. For now, let's assume that an unlimited number of arguments - * are supported in hardware (the compiler inlines functions whenver it can - * anyways, so unless someone is interested in the implications of linking/ - * library functions, I think this is a reasonable assumption given the typical - * size of an OpenCL kernel). 
- * - * Note that call args are different than kernel arguments: - * * All work-items in a kernel refer the same set of kernel arguments - * * Each work-item has it's on set of call args. So a call argument at - * address 0x4 is different for work-item 0 and work-item 1. - * - * Ok, the table below shows an example of how we organize the call arguments in - * the CallArgMem class. - * - * int foo(int arg1, double arg2) - * ___________________________________________________ - * | 0: return.0 | 4: return.1 | ... | 252: return.63 | - * |---------------------------------------------------| - * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | - * |---------------------------------------------------| - * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | - * ___________________________________________________ - */ -class CallArgMem -{ - public: - // pointer to buffer for storing function arguments - uint8_t *mem; - int wfSize; - // size of function args - int funcArgsSizePerItem; - - template - int - getLaneOffset(int lane, int addr) - { - return addr * wfSize + sizeof(CType) * lane; - } - - CallArgMem(int func_args_size_per_item, int wf_size) - : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) - { - mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); - } - - ~CallArgMem() - { - free(mem); - } - - template - uint8_t* - getLaneAddr(int lane, int addr) - { - return mem + getLaneOffset(lane, addr); - } - - template - void - setLaneAddr(int lane, int addr, CType val) - { - *((CType*)(mem + getLaneOffset(lane, addr))) = val; - } -}; - class Wavefront : public SimObject { public: - enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; - enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; - - // Base pointer for array of instruction pointers - uint64_t basePtr; + enum status_e { + // wavefront is stalled + S_STOPPED, + // wavefront is returning from a kernel + S_RETURNING, + // wavefront is running normally + S_RUNNING, + // wavefront is stalled + S_STALLED, + /** + * wavefront has unsatisfied wait counts + * + * while in this state the WF will only execute if + * the oldest instruction is the waitcnt. while in + * S_WAITCNT, the wavefront will not be ready until + * all of its waitcnts have been satisfied. the + * scoreboard ready() function will check the status + * of the waitcnts whenever the WF is in S_WAITCNT, + * and once they are satisfied, it will resume normal + * operation. 
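+         * (the counts themselves are held in vmWaitCnt, expWaitCnt, and
+         * lgkmWaitCnt, and are set by setWaitCnts() when the s_waitcnt
+         * instruction itself executes)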
+ */ + S_WAITCNT + }; uint32_t oldBarrierCnt; uint32_t barrierCnt; uint32_t barrierId; uint32_t barrierSlots; - status_e status; // HW slot id where the WF is mapped to inside a SIMD unit - int wfSlotId; + const int wfSlotId; int kernId; // SIMD unit where the WV has been scheduled - int simdId; + const int simdId; + // id of the execution unit (or pipeline) where the oldest instruction + // of the WF is scheduled + int execUnitId; + int flatLmUnitId; + int flatGmUnitId; // pointer to parent CU ComputeUnit *computeUnit; + int maxIbSize; std::deque instructionBuffer; bool pendingFetch; bool dropFetch; - - // Condition Register State (for HSAIL simulations only) - class ConditionRegisterState *condRegState; - // number of single precision VGPRs required by WF - uint32_t maxSpVgprs; - // number of double precision VGPRs required by WF - uint32_t maxDpVgprs; - // map virtual to physical vector register - uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); - void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + // last tick during which all WFs in the CU are not idle + Tick lastNonIdleTick; + + // Execution unit resource ID's associated with this WF + // These are static mappings set at WF slot construction and + // based off of the simdId and wfSlotId. + + // Index to scalarALUs resource vector in CU + int scalarAlu; + + // Indices into readyList/dispatchList of resources used by this + // wavefront + int scalarAluGlobalIdx; + int globalMem; + int localMem; + int scalarMem; + + // number of VGPRs required by WF + uint32_t maxVgprs; + // number of SGPRs required by WF + uint32_t maxSgprs; + void freeResources(); + GPUDynInstPtr nextInstr(); + void setStatus(status_e newStatus); + status_e getStatus() { return status; } + void resizeRegFiles(int num_vregs, int num_sregs); bool isGmInstruction(GPUDynInstPtr ii); bool isLmInstruction(GPUDynInstPtr ii); + bool isOldestInstWaitcnt(); bool isOldestInstGMem(); bool isOldestInstLMem(); bool isOldestInstPrivMem(); bool isOldestInstFlatMem(); - bool isOldestInstALU(); + bool isOldestInstVectorALU(); + bool isOldestInstScalarALU(); + bool isOldestInstScalarMem(); bool isOldestInstBarrier(); + // used for passing spill address to DDInstGPU std::vector lastAddr; std::vector workItemId[3]; @@ -199,36 +154,44 @@ class Wavefront : public SimObject /* the actual WG size can differ than the maximum size */ uint32_t actualWgSz[3]; uint32_t actualWgSzTotal; - void computeActualWgSz(NDRange *ndr); + void computeActualWgSz(HSAQueueEntry *task); // wavefront id within a workgroup uint32_t wfId; uint32_t maxDynWaveId; uint32_t dispatchId; - // outstanding global+local memory requests - uint32_t outstandingReqs; - // memory requests between scoreboard - // and execute stage not yet executed - uint32_t memReqsInPipe; + // vector and scalar memory requests pending in memory system + int outstandingReqs; // outstanding global memory write requests - uint32_t outstandingReqsWrGm; + int outstandingReqsWrGm; // outstanding local memory write requests - uint32_t outstandingReqsWrLm; + int outstandingReqsWrLm; // outstanding global memory read requests - uint32_t outstandingReqsRdGm; + int outstandingReqsRdGm; // outstanding local memory read requests - uint32_t outstandingReqsRdLm; - uint32_t rdLmReqsInPipe; - uint32_t rdGmReqsInPipe; - uint32_t wrLmReqsInPipe; - uint32_t wrGmReqsInPipe; + int outstandingReqsRdLm; + // outstanding scalar memory read requests + int scalarOutstandingReqsRdGm; + // outstanding scalar memory write requests + int 
scalarOutstandingReqsWrGm; + int rdLmReqsInPipe; + int rdGmReqsInPipe; + int wrLmReqsInPipe; + int wrGmReqsInPipe; + int scalarRdGmReqsInPipe; + int scalarWrGmReqsInPipe; int memTraceBusy; uint64_t lastTrace; - // number of vector registers reserved by WF + // number of virtual vector registers reserved by WF int reservedVectorRegs; + // number of virtual scalar registers reserved by WF + int reservedScalarRegs; // Index into the Vector Register File's namespace where the WF's registers // will live while the WF is executed uint32_t startVgprIndex; + // Index into the Scalar Register File's namespace where the WF's registers + // will live while the WF is executed + uint32_t startSgprIndex; // Old value of destination gpr (for trace) std::vector oldVgpr; @@ -257,64 +220,63 @@ class Wavefront : public SimObject // to this workgroup (thus this wavefront) LdsChunk *ldsChunk; - // A pointer to the spill area - Addr spillBase; - // The size of the spill area - uint32_t spillSizePerItem; - // The vector width of the spill area - uint32_t spillWidth; - - // A pointer to the private memory area - Addr privBase; - // The size of the private memory area - uint32_t privSizePerItem; - - // A pointer ot the read-only memory area - Addr roBase; - // size of the read-only memory area - uint32_t roSize; - - // pointer to buffer for storing kernel arguments - uint8_t *kernelArgs; // unique WF id over all WFs executed across all CUs uint64_t wfDynId; - // number of times instruction issue for this wavefront is blocked - // due to VRF port availability - Stats::Scalar numTimesBlockedDueVrfPortAvail; + // Wavefront slot stats + + // Number of instructions executed by this wavefront slot across all + // dynamic wavefronts + Stats::Scalar numInstrExecuted; + + // Number of cycles this WF spends in SCH stage + Stats::Scalar schCycles; + + // Number of stall cycles encounterd by this WF in SCH stage + Stats::Scalar schStalls; + + // The following stats sum to the value of schStalls, and record, per + // WF slot, what the cause of each stall was at a coarse granularity. + + // Cycles WF is selected by scheduler, but RFs cannot support instruction + Stats::Scalar schRfAccessStalls; + // Cycles spent waiting for execution resources + Stats::Scalar schResourceStalls; + // cycles spent waiting for RF reads to complete in SCH stage + Stats::Scalar schOpdNrdyStalls; + // LDS arbitration stall cycles. WF attempts to execute LM instruction, + // but another wave is executing FLAT, which requires LM and GM and forces + // this WF to stall. 
+ Stats::Scalar schLdsArbStalls; + // number of times an instruction of a WF is blocked from being issued // due to WAR and WAW dependencies Stats::Scalar numTimesBlockedDueWAXDependencies; // number of times an instruction of a WF is blocked from being issued // due to WAR and WAW dependencies Stats::Scalar numTimesBlockedDueRAWDependencies; - // distribution of executed instructions based on their register - // operands; this is used to highlight the load on the VRF - Stats::Distribution srcRegOpDist; - Stats::Distribution dstRegOpDist; - - // Functions to operate on call argument memory - // argument memory for hsail call instruction - CallArgMem *callArgMem; - void - initCallArgMem(int func_args_size_per_item, int wf_size) - { - callArgMem = new CallArgMem(func_args_size_per_item, wf_size); - } - template - CType - readCallArgMem(int lane, int addr) - { - return *((CType*)(callArgMem->getLaneAddr(lane, addr))); - } + // dyn inst id (per SIMD) of last instruction exec from this wave + uint64_t lastInstExec; - template - void - writeCallArgMem(int lane, int addr, CType val) - { - callArgMem->setLaneAddr(lane, addr, val); - } + // Distribution to track the distance between producer and consumer + // for vector register values + Stats::Distribution vecRawDistance; + // Map to track the dyn instruction id of each vector register value + // produced, indexed by physical vector register ID + std::unordered_map rawDist; + + // Distribution to track the number of times every vector register + // value produced is consumed. + Stats::Distribution readsPerWrite; + // Counts the number of reads performed to each physical register + // - counts are reset to 0 for each dynamic wavefront launched + std::vector vecReads; + + void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems); + + // context for save/restore + uint8_t *context; typedef WavefrontParams Params; Wavefront(const Params *p); @@ -327,50 +289,31 @@ class Wavefront : public SimObject computeUnit = cu; } + void validateRequestCounters(); void start(uint64_t _wfDynId, uint64_t _base_ptr); void exec(); - void updateResources(); - int ready(itype_e type); - bool instructionBufferHasBranch(); + // called by SCH stage to reserve + std::vector reserveResources(); + bool stopFetch(); void regStats(); - VectorMask getPred() { return execMask() & initMask; } bool waitingAtBarrier(int lane); - void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, - const VectorMask& exec_mask); - - void popFromReconvergenceStack(); - - uint32_t pc() const; - - uint32_t rpc() const; - - VectorMask execMask() const; + Addr pc() const; + void pc(Addr new_pc); + VectorMask& execMask(); bool execMask(int lane) const; - void pc(uint32_t new_pc); void discardFetch(); - /** - * Returns the size of the static hardware context of a particular wavefront - * This should be updated everytime the context is changed - */ - uint32_t getStaticContextSize() const; + bool waitCntsSatisfied(); + void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt); + void clearWaitCnts(); - /** - * Returns the hardware context as a stream of bytes - * This method is designed for HSAIL execution - */ - void getContext(const void *out); - - /** - * Sets the hardware context fromt a stream of bytes - * This method is designed for HSAIL execution - */ - void setContext(const void *in); + /** Freeing VRF space */ + void freeRegisterFile(); TheGpuISA::GPUISA& gpuISA() @@ -380,14 +323,32 @@ class Wavefront : public SimObject private: TheGpuISA::GPUISA _gpuISA; + + void 
reserveGmResource(GPUDynInstPtr ii); + void reserveLmResource(GPUDynInstPtr ii); + /** - * Stack containing Control Flow Graph nodes (i.e., kernel instructions) - * to be visited by the wavefront, and the associated execution masks. The - * reconvergence stack grows every time the wavefront reaches a divergence - * point (branch instruction), and shrinks every time the wavefront - * reaches a reconvergence point (immediate post-dominator instruction). + * the following are used for waitcnt instructions + * vmWaitCnt: once set, we wait for the oustanding + * number of vector mem instructions to be + * at, or below vmWaitCnt. + * + * expWaitCnt: once set, we wait for the outstanding + * number outstanding VM writes or EXP + * insts to be at, or below expWaitCnt. + * + * lgkmWaitCnt: once set, we wait for the oustanding + * number of LDS, GDS, scalar memory, + * and message instructions to be at, or + * below lgkmCount. we currently do not + * support GDS/message ops. */ - std::deque> reconvergenceStack; + int vmWaitCnt; + int expWaitCnt; + int lgkmWaitCnt; + status_e status; + Addr _pc; + VectorMask _execMask; }; -#endif // __WAVEFRONT_HH__ +#endif // __GPU_COMPUTE_WAVEFRONT_HH__ diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 1c1da212d..b009cc5f6 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -86,6 +86,14 @@ MemCmd::commandInfo[] = WriteResp, "WriteReq" }, /* WriteResp */ { SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" }, + /* WriteCompleteResp - The WriteCompleteResp command is needed + * because in the GPU memory model we use a WriteResp to indicate + * that a write has reached the cache controller so we can free + * resources at the coalescer. Later, when the write succesfully + * completes we send a WriteCompleteResp to the CU so its wait + * counters can be updated. Wait counters in the CU is how memory + * dependences are handled in the GPU ISA. */ + { SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" }, /* WritebackDirty */ { SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache), InvalidCmd, "WritebackDirty" }, diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 42d286a5e..4af0d0b1c 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -83,6 +83,7 @@ class MemCmd ReadRespWithInvalidate, WriteReq, WriteResp, + WriteCompleteResp, WritebackDirty, WritebackClean, WriteClean, // writes dirty data below without evicting diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 9dffe0f2c..4047dc689 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); } else { if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { - if (in_msg.segment == HSASegment:SPILL) { - trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe); - } else if (WB) { + if (WB) { trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); } else { trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm new file mode 100644 index 000000000..124ebbeda --- /dev/null +++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +structure (GPUCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void evictionCallback(Addr); + void recordCPReadCallBack(MachineID, MachineID); + void recordCPWriteCallBack(MachineID, MachineID); +} + +structure (VIPERCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void invCallback(Addr); + void wbCallback(Addr); + void evictionCallback(Addr); +} diff --git a/src/mem/ruby/protocol/GPU_VIPER.slicc b/src/mem/ruby/protocol/GPU_VIPER.slicc index 45f7f3477..55ed6710a 100644 --- a/src/mem/ruby/protocol/GPU_VIPER.slicc +++ b/src/mem/ruby/protocol/GPU_VIPER.slicc @@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc"; include "MOESI_AMD_Base-msg.sm"; include "MOESI_AMD_Base-dir.sm"; include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-msg.sm"; include "GPU_VIPER-TCP.sm"; include "GPU_VIPER-SQC.sm"; include "GPU_VIPER-TCC.sm"; diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index a1e751180..f4f50cb32 100644 --- 
a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") { CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; WriteMask writeMask, desc="Write Through Data"; MachineID WTRequestor, desc="Node who initiated the write through"; - HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope"; int wfid, default="0", desc="wavefront id"; bool NoWriteConflict, default="true", desc="write collided with CAB entry"; int ProgramCounter, desc="PC that accesses to this block"; diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index 08d30cfee..f1d17c85e 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") NotPresent, desc="block is NotPresent"; Busy, desc="block is in a transient state, currently invalid"; } -//HSA scopes -enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") { - UNSPECIFIED, desc="Unspecified scope"; - NOSCOPE, desc="Explictly unscoped"; - WAVEFRONT, desc="Wavefront scope"; - WORKGROUP, desc="Workgroup scope"; - DEVICE, desc="Device scope"; - SYSTEM, desc="System scope"; -} - -// HSA segment types -enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") { - GLOBAL, desc="Global segment"; - GROUP, desc="Group segment"; - PRIVATE, desc="Private segment"; - KERNARG, desc="Kernarg segment"; - READONLY, desc="Readonly segment"; - SPILL, desc="Spill segment"; - ARG, desc="Arg segment"; -} // TesterStatus enumeration(TesterStatus, desc="...") { diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index b59cf9717..76c45b9b0 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") { bool checkResourceAvailable(CacheResourceType, Addr); } -structure (GPUCoalescer, external = "yes") { - void readCallback(Addr, DataBlock); - void readCallback(Addr, MachineType, DataBlock); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void writeCallback(Addr, DataBlock); - void writeCallback(Addr, MachineType, DataBlock); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void evictionCallback(Addr); - void recordCPReadCallBack(MachineID, MachineID); - void recordCPWriteCallBack(MachineID, MachineID); -} - -structure (VIPERCoalescer, external = "yes") { - void readCallback(Addr, DataBlock); - void readCallback(Addr, MachineType, DataBlock); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void writeCallback(Addr, DataBlock); - void writeCallback(Addr, MachineType, DataBlock); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void invCallback(Addr); - void wbCallback(Addr); - void evictionCallback(Addr); -} - structure(RubyRequest, desc="...", interface="Message", external="yes") { Addr LineAddress, 
desc="Line address for this request"; Addr PhysicalAddress, desc="Physical address for this request"; @@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { WriteMask writeMask, desc="Writethrough mask"; DataBlock WTData, desc="Writethrough data block"; int wfid, desc="Writethrough wavefront"; - HSAScope scope, desc="HSA scope"; - HSASegment segment, desc="HSA segment"; PacketPtr pkt, desc="Packet associated with this request"; } diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index b729d26dd..bdc88b9ef 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -43,7 +43,6 @@ #include "debug/RubyQueue.hh" #include "mem/ruby/network/Network.hh" #include "mem/ruby/protocol/MemoryMsg.hh" -#include "mem/ruby/system/GPUCoalescer.hh" #include "mem/ruby/system/RubySystem.hh" #include "mem/ruby/system/Sequencer.hh" #include "sim/system.hh" diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index 68b11f55d..29bedfa51 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -35,8 +35,6 @@ #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/DataBlock.hh" #include "mem/ruby/common/WriteMask.hh" -#include "mem/ruby/protocol/HSAScope.hh" -#include "mem/ruby/protocol/HSASegment.hh" #include "mem/ruby/protocol/Message.hh" #include "mem/ruby/protocol/PrefetchBit.hh" #include "mem/ruby/protocol/RubyAccessMode.hh" diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 0153b4c4b..1eecb82ad 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -61,58 +61,6 @@ using namespace std; -GPUCoalescer * -RubyGPUCoalescerParams::create() -{ - return new GPUCoalescer(this); -} - -HSAScope -reqScopeToHSAScope(const RequestPtr &req) -{ - HSAScope accessScope = HSAScope_UNSPECIFIED; - if (req->isScoped()) { - if (req->isWavefrontScope()) { - accessScope = HSAScope_WAVEFRONT; - } else if (req->isWorkgroupScope()) { - accessScope = HSAScope_WORKGROUP; - } else if (req->isDeviceScope()) { - accessScope = HSAScope_DEVICE; - } else if (req->isSystemScope()) { - accessScope = HSAScope_SYSTEM; - } else { - fatal("Bad scope type"); - } - } - return accessScope; -} - -HSASegment -reqSegmentToHSASegment(const RequestPtr &req) -{ - HSASegment accessSegment = HSASegment_GLOBAL; - - if (req->isGlobalSegment()) { - accessSegment = HSASegment_GLOBAL; - } else if (req->isGroupSegment()) { - accessSegment = HSASegment_GROUP; - } else if (req->isPrivateSegment()) { - accessSegment = HSASegment_PRIVATE; - } else if (req->isKernargSegment()) { - accessSegment = HSASegment_KERNARG; - } else if (req->isReadonlySegment()) { - accessSegment = HSASegment_READONLY; - } else if (req->isSpillSegment()) { - accessSegment = HSASegment_SPILL; - } else if (req->isArgSegment()) { - accessSegment = HSASegment_ARG; - } else { - fatal("Bad segment type"); - } - - return accessSegment; -} - UncoalescedTable::UncoalescedTable(GPUCoalescer *gc) : coalescer(gc) { @@ -152,6 +100,7 @@ UncoalescedTable::updateResources() { for (auto iter = instMap.begin(); iter != instMap.end(); ) { if (iter->second.empty()) { + DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first); instMap.erase(iter++); coalescer->getGMTokenPort().sendTokens(1); } else { @@ -160,15 +109,27 @@ 
UncoalescedTable::updateResources() } } +bool +UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) { + // iterate the instructions held in UncoalescedTable to see whether there + // are more requests to issue; if yes, not yet done; otherwise, done + for (auto& inst : instMap) { + DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n" + ,inst.first, inst.second.size()); + if (inst.first == instSeqNum) { return false; } + } + + return true; +} + void UncoalescedTable::printRequestTable(std::stringstream& ss) { - ss << "UncoalescedTable contains " << instMap.size() - << " address entries." << std::endl; + ss << "Listing pending packets from " << instMap.size() << " instructions"; + for (auto& inst : instMap) { - ss << "Addr 0x" << std::hex << inst.first << std::dec - << " with " << inst.second.size() << " packets" - << std::endl; + ss << "\tAddr: " << printAddress(inst.first) << " with " + << inst.second.size() << " pending packets" << std::endl; } } @@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p) assert(m_dataCache_ptr); m_runningGarnetStandalone = p->garnet_standalone; - assumingRfOCoherence = p->assume_rfo; } GPUCoalescer::~GPUCoalescer() @@ -254,18 +214,9 @@ GPUCoalescer::wakeup() if (current_time - req->getIssueTime() > m_deadlock_threshold) { std::stringstream ss; printRequestTable(ss); - ss << "Outstanding requests: " << m_outstanding_count - << std::endl; - - panic("Possible Deadlock detected. Aborting!\n" - "version: %d request.paddr: 0x%x coalescedTable: %d " - "current time: %u issue_time: %d difference: %d\n" - "Request Tables:\n %s", m_version, - req->getFirstPkt()->getAddr(), - coalescedTable.size(), cyclesToTicks(current_time), - cyclesToTicks(req->getIssueTime()), - cyclesToTicks(current_time - req->getIssueTime()), - ss.str()); + warn("GPUCoalescer %d Possible deadlock detected!\n%s\n", + m_version, ss.str()); + panic("Aborting due to deadlock!\n"); } } } @@ -283,21 +234,27 @@ GPUCoalescer::wakeup() void GPUCoalescer::printRequestTable(std::stringstream& ss) { - uncoalescedTable.printRequestTable(ss); + ss << "Printing out " << coalescedTable.size() + << " outstanding requests in the coalesced table\n"; - ss << "CoalescedTable contains " << coalescedTable.size() - << " address entries." 
<< std::endl; for (auto& requestList : coalescedTable) { - ss << "Addr 0x" << std::hex << requestList.first << std::dec - << ": type-"; for (auto& request : requestList.second) {
- ss << RubyRequestType_to_string(request->getRubyType()) - << " pkts-" << request->getPackets().size() - << " issued-" << request->getIssueTime() << " seqNum-" - << request->getSeqNum() << "; ";
+ ss << "\tAddr: " << printAddress(requestList.first) << "\n" + << "\tInstruction sequence number: " + << request->getSeqNum() << "\n" + << "\t\tType: " + << RubyRequestType_to_string(request->getRubyType()) << "\n" + << "\t\tNumber of associated packets: " + << request->getPackets().size() << "\n" + << "\t\tIssue time: " + << request->getIssueTime() * clockPeriod() << "\n" + << "\t\tDifference from current tick: " + << (curCycle() - request->getIssueTime()) * clockPeriod(); }
- ss << std::endl; } + + // print out packets waiting to be issued in uncoalesced table + uncoalescedTable.printRequestTable(ss); }
void @@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address, hitCallback(crequest, mach, data, true, crequest->getIssueTime(), forwardRequestTime, firstResponseTime, isRegion);
+ // remove this crequest from the coalescedTable delete crequest; coalescedTable.at(address).pop_front(); @@ -398,6 +356,36 @@ } }
+void +GPUCoalescer::writeCompleteCallback(Addr address, + uint64_t instSeqNum, + MachineType mach) +{
+ DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x" + " instSeqNum = %d\n", address, instSeqNum); +
+ assert(pendingWriteInsts.count(instSeqNum) == 1); + PendingWriteInst& inst = pendingWriteInsts[instSeqNum]; +
+ // check the uncoalescedTable to see whether all requests for the inst + // have been issued or not + bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
+ DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, " + "reqsAllIssued=%d\n", instSeqNum, + inst.getNumPendingStores()-1, reqsAllIssued); +
+ if (inst.receiveWriteCompleteAck() && reqsAllIssued) { + // if the pending write instruction has received all write completion + // callbacks for its issued Ruby requests, we can now respond to + // the requesting CU with one response packet. + inst.ackWriteCompletion(m_usingRubyTester); +
+ DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n", + instSeqNum); + pendingWriteInsts.erase(instSeqNum); + } +} +
void GPUCoalescer::readCallback(Addr address, DataBlock& data) { @@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, { PacketPtr pkt = crequest->getFirstPkt(); Addr request_address = pkt->getAddr();
- Addr request_line_address = makeLineAddress(request_address); + Addr request_line_address M5_VAR_USED = makeLineAddress(request_address); RubyRequestType type = crequest->getRubyType();
@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, "%s\n", RubyRequestType_to_string(type)); } - - // If using the RubyTester, update the RubyTester sender state's - // subBlock with the recieved data. The tester will later access - // this state. - // Note: RubyPort will access it's sender state before the - // RubyTester.
- if (m_usingRubyTester) { - RubyPort::SenderState *requestSenderState = - safe_cast(pkt->senderState); - RubyTester::SenderState* testerSenderState = - safe_cast - (requestSenderState->predecessor); - testerSenderState->subBlock.mergeFrom(data); - } } @@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt) } else if (pkt->isWrite()) { req_type = RubyRequestType_ST; } else { - // Acquire and release packets will have been issued by - // makeRequest, so we do not need to check for it here. panic("Unsupported ruby packet type\n"); } @@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt) RequestStatus GPUCoalescer::makeRequest(PacketPtr pkt) { - // Check for GPU Barrier Kernel End or Kernel Begin - // Leave these to be handled by the child class - // Kernel End/Barrier = isFlush + isRelease - // Kernel Begin = isFlush + isAcquire - if (pkt->req->isKernel()) { - if (pkt->req->isAcquire()){ - // This is a Kernel Begin leave handling to - // virtual xCoalescer::makeRequest - return RequestStatus_Issued; - }else if (pkt->req->isRelease()) { - // This is a Kernel End leave handling to - // virtual xCoalescer::makeRequest - // If we are here then we didn't call - // a virtual version of this function - // so we will also schedule the callback - int wf_id = 0; - if (pkt->req->hasContextId()) { - wf_id = pkt->req->contextId(); - } - insertKernel(wf_id, pkt); - newKernelEnds.push_back(wf_id); - if (!issueEvent.scheduled()) { - schedule(issueEvent, curTick()); - } - return RequestStatus_Issued; - } - } + // all packets must have valid instruction sequence numbers + assert(pkt->req->hasInstSeqNum()); - if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() && - !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() && - (pkt->req->isRelease() || pkt->req->isAcquire())) { - if (assumingRfOCoherence) { - // If we reached here, this request must be a memFence - // and the protocol implements RfO, the coalescer can - // assume sequentially consistency and schedule the callback - // immediately. - // Currently the code implements fence callbacks - // by reusing the mechanism for kernel completions. - // This should be fixed. - int wf_id = 0; - if (pkt->req->hasContextId()) { - wf_id = pkt->req->contextId(); - } - insertKernel(wf_id, pkt); - newKernelEnds.push_back(wf_id); - if (!issueEvent.scheduled()) { - schedule(issueEvent, curTick()); - } - return RequestStatus_Issued; - } else { - // If not RfO, return issued here and let the child coalescer - // take care of it. 
- return RequestStatus_Issued;
+ if (pkt->cmd == MemCmd::MemSyncReq) { + // issue mem_sync requests immediately to the cache system without + // going through the uncoalescedTable like normal LD/ST/Atomic requests + issueMemSyncRequest(pkt);
+ } else { + // otherwise, this must be either a read or a write command + assert(pkt->isRead() || pkt->isWrite()); +
+ // the pkt is temporarily stored in the uncoalesced table until + // it's picked for the coalescing process later in this cycle or in a + // future cycle + uncoalescedTable.insertPacket(pkt);
+ DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", + pkt->getAddr()); +
+ // we schedule an issue event here to process the uncoalesced table + // and try to issue Ruby requests to the cache system + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); } }
- uncoalescedTable.insertPacket(pkt); - DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr()); - - if (!issueEvent.scheduled()) - schedule(issueEvent, curTick()); - // TODO: issue hardware prefetches here
+ // we always return RequestStatus_Issued in this coalescer + // b/c the coalescer's resources were checked earlier and the coalescer is + // queueing up aliased requests in its coalesced table return RequestStatus_Issued; }
+/** + * TODO: Figure out what to do with this code. This code may go away + * and/or be merged into the VIPER coalescer once the VIPER + * protocol is re-integrated with the GCN3 code. + */
+/* void GPUCoalescer::issueRequest(CoalescedRequest* crequest) { @@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest) } assert(m_mandatory_q_ptr);
- m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); -} + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +}*/
template std::ostream & @@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const }
-void -GPUCoalescer::recordRequestType(SequencerRequestType requestType) { - DPRINTF(RubyStats, "Recorded statistic: %s\n", - SequencerRequestType_to_string(requestType)); -} -
bool GPUCoalescer::coalescePacket(PacketPtr pkt) { @@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt) // be counted as outstanding requests. m_outstanding_count++;
+ // We track all issued or to-be-issued Ruby requests associated with + // write instructions. An instruction may have multiple Ruby + // requests.
+ if (pkt->cmd == MemCmd::WriteReq) { + DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to" + " the pending write instruction list\n", seqNum, + line_addr); +
+ RubyPort::SenderState* ss = + safe_cast<RubyPort::SenderState*>(pkt->senderState); +
+ // we need to save this port because it will be used to call + // back the requesting CU when we receive write + // complete callbacks for all issued Ruby requests of this + // instruction. + RubyPort::MemSlavePort* mem_slave_port = ss->port; +
+ GPUDynInstPtr gpuDynInst = nullptr; +
+ if (!m_usingRubyTester) { + // If this coalescer is connected to a real CU, we need + // to save the corresponding gpu dynamic instruction. + // CU will use that instruction to decrement wait counters + // in the issuing wavefront.
+ // For Ruby tester, gpuDynInst == nullptr + ComputeUnit::DataPort::SenderState* cu_state = + safe_cast + (ss->predecessor); + gpuDynInst = cu_state->_gpuDynInst; + } + + PendingWriteInst& inst = pendingWriteInsts[seqNum]; + inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester); + } + return true; } @@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address, } } -void -GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) -{ - if (myMachID == senderMachID) { - CP_TCPLdHits++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) { - CP_TCPLdTransfers++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) { - CP_TCCLdHits++; - } else { - CP_LdMiss++; - } -} - -void -GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) -{ - if (myMachID == senderMachID) { - CP_TCPStHits++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) { - CP_TCPStTransfers++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) { - CP_TCCStHits++; - } else { - CP_StMiss++; - } -} - void GPUCoalescer::completeHitCallback(std::vector & mylist) { @@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest, Cycles firstResponseTime, bool success, bool isRegion) { - RubyRequestType type = crequest->getRubyType(); - Cycles issued_time = crequest->getIssueTime(); - Cycles completion_time = curCycle(); - assert(completion_time >= issued_time); - Cycles total_lat = completion_time - issued_time; - - // cache stats (valid for RfO protocol only) - if (mach == MachineType_TCP) { - if (type == RubyRequestType_LD) { - GPU_TCPLdHits++; - } else { - GPU_TCPStHits++; - } - } else if (mach == MachineType_L1Cache_wCC) { - if (type == RubyRequestType_LD) { - GPU_TCPLdTransfers++; - } else { - GPU_TCPStTransfers++; - } - } else if (mach == MachineType_TCC) { - if (type == RubyRequestType_LD) { - GPU_TCCLdHits++; - } else { - GPU_TCCStHits++; - } - } else { - if (type == RubyRequestType_LD) { - GPU_LdMiss++; - } else { - GPU_StMiss++; - } - } - - // Profile all access latency, even zero latency accesses - m_latencyHist.sample(total_lat); - m_typeLatencyHist[type]->sample(total_lat); - - // Profile the miss latency for all non-zero demand misses - if (total_lat != Cycles(0)) { - m_missLatencyHist.sample(total_lat); - m_missTypeLatencyHist[type]->sample(total_lat); - - if (mach != MachineType_NUM) { - m_missMachLatencyHist[mach]->sample(total_lat); - m_missTypeMachLatencyHist[type][mach]->sample(total_lat); - - if ((issued_time <= initialRequestTime) && - (initialRequestTime <= forwardRequestTime) && - (forwardRequestTime <= firstResponseTime) && - (firstResponseTime <= completion_time)) { - - m_IssueToInitialDelayHist[mach]->sample( - initialRequestTime - issued_time); - m_InitialToForwardDelayHist[mach]->sample( - forwardRequestTime - initialRequestTime); - m_ForwardToFirstResponseDelayHist[mach]->sample( - firstResponseTime - forwardRequestTime); - m_FirstResponseToCompletionDelayHist[mach]->sample( - completion_time - firstResponseTime); - } - } - - } - - DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", - curTick(), m_version, "Coal", - success ? 
"Done" : "SC_Failed", "", "", - printAddress(crequest->getFirstPkt()->getAddr()), total_lat); } void @@ -1085,74 +962,4 @@ GPUCoalescer::regStats() m_missTypeMachLatencyHist[i][j]->init(10); } } - - // GPU cache stats - GPU_TCPLdHits - .name(name() + ".gpu_tcp_ld_hits") - .desc("loads that hit in the TCP") - ; - GPU_TCPLdTransfers - .name(name() + ".gpu_tcp_ld_transfers") - .desc("TCP to TCP load transfers") - ; - GPU_TCCLdHits - .name(name() + ".gpu_tcc_ld_hits") - .desc("loads that hit in the TCC") - ; - GPU_LdMiss - .name(name() + ".gpu_ld_misses") - .desc("loads that miss in the GPU") - ; - - GPU_TCPStHits - .name(name() + ".gpu_tcp_st_hits") - .desc("stores that hit in the TCP") - ; - GPU_TCPStTransfers - .name(name() + ".gpu_tcp_st_transfers") - .desc("TCP to TCP store transfers") - ; - GPU_TCCStHits - .name(name() + ".gpu_tcc_st_hits") - .desc("stores that hit in the TCC") - ; - GPU_StMiss - .name(name() + ".gpu_st_misses") - .desc("stores that miss in the GPU") - ; - - // CP cache stats - CP_TCPLdHits - .name(name() + ".cp_tcp_ld_hits") - .desc("loads that hit in the TCP") - ; - CP_TCPLdTransfers - .name(name() + ".cp_tcp_ld_transfers") - .desc("TCP to TCP load transfers") - ; - CP_TCCLdHits - .name(name() + ".cp_tcc_ld_hits") - .desc("loads that hit in the TCC") - ; - CP_LdMiss - .name(name() + ".cp_ld_misses") - .desc("loads that miss in the GPU") - ; - - CP_TCPStHits - .name(name() + ".cp_tcp_st_hits") - .desc("stores that hit in the TCP") - ; - CP_TCPStTransfers - .name(name() + ".cp_tcp_st_transfers") - .desc("TCP to TCP store transfers") - ; - CP_TCCStHits - .name(name() + ".cp_tcc_st_hits") - .desc("stores that hit in the TCC") - ; - CP_StMiss - .name(name() + ".cp_st_misses") - .desc("stores that miss in the GPU") - ; } diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 56a207906..789ca308f 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -38,11 +38,11 @@ #include #include "base/statistics.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/misc.hh" #include "mem/request.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/Consumer.hh" -#include "mem/ruby/protocol/HSAScope.hh" -#include "mem/ruby/protocol/HSASegment.hh" #include "mem/ruby/protocol/PrefetchBit.hh" #include "mem/ruby/protocol/RubyAccessMode.hh" #include "mem/ruby/protocol/RubyRequestType.hh" @@ -57,9 +57,6 @@ class CacheMemory; class RubyGPUCoalescerParams; -HSAScope reqScopeToHSAScope(const RequestPtr &req); -HSASegment reqSegmentToHSASegment(const RequestPtr &req); - // List of packets that belongs to a specific instruction. typedef std::list PerInstPackets; @@ -78,6 +75,7 @@ class UncoalescedTable // instructions at the offset. PerInstPackets* getInstPackets(int offset); void updateResources(); + bool areRequestsDone(const uint64_t instSeqNum); // Check if a packet hasn't been removed from instMap in too long. // Panics if a deadlock is detected and returns nothing otherwise. @@ -120,6 +118,86 @@ class CoalescedRequest std::vector pkts; }; +// PendingWriteInst tracks the number of outstanding Ruby requests +// per write instruction. Once all requests associated with one instruction +// are completely done in Ruby, we call back the requester to mark +// that this instruction is complete. 
+class PendingWriteInst +{ + public: + PendingWriteInst() + : numPendingStores(0), + originalPort(nullptr), + gpuDynInstPtr(nullptr) + {} + + ~PendingWriteInst() + {} + + void + addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst, + bool usingRubyTester) + { + assert(port); + originalPort = port; + + if (!usingRubyTester) { + gpuDynInstPtr = inst; + } + + numPendingStores++; + } + + // return true if no more ack is expected + bool + receiveWriteCompleteAck() + { + assert(numPendingStores > 0); + numPendingStores--; + return (numPendingStores == 0) ? true : false; + } + + // ack the original requester that this write instruction is complete + void + ackWriteCompletion(bool usingRubyTester) + { + assert(numPendingStores == 0); + + // make a response packet + PacketPtr pkt = new Packet(std::make_shared(), + MemCmd::WriteCompleteResp); + + if (!usingRubyTester) { + assert(gpuDynInstPtr); + ComputeUnit::DataPort::SenderState* ss = + new ComputeUnit::DataPort::SenderState + (gpuDynInstPtr, 0, nullptr); + pkt->senderState = ss; + } + + // send the ack response to the requester + originalPort->sendTimingResp(pkt); + } + + int + getNumPendingStores() { + return numPendingStores; + } + + private: + // the number of stores waiting for writeCompleteCallback + int numPendingStores; + // The original port that sent one of packets associated with this + // write instruction. We may have more than one packet per instruction, + // which implies multiple ports per instruction. However, we need + // only 1 of the ports to call back the CU. Therefore, here we keep + // track the port that sent the first packet of this instruction. + RubyPort::MemSlavePort* originalPort; + // similar to the originalPort, this gpuDynInstPtr is set only for + // the first packet of this instruction. + GPUDynInstPtr gpuDynInstPtr; +}; + class GPUCoalescer : public RubyPort { public: @@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort void collateStats(); void regStats() override; + // each store request needs two callbacks: + // (1) writeCallback is called when the store is received and processed + // by TCP. This writeCallback does not guarantee the store is actually + // completed at its destination cache or memory. writeCallback helps + // release hardware resources (e.g., its entry in coalescedTable) + // allocated for the store so that subsequent requests will not be + // blocked unnecessarily due to hardware resource constraints. + // (2) writeCompleteCallback is called when the store is fully completed + // at its destination cache or memory. writeCompleteCallback + // guarantees that the store is fully completed. 
This callback + // will decrement hardware counters in CU void writeCallback(Addr address, DataBlock& data); void writeCallback(Addr address, @@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort Cycles forwardRequestTime, Cycles firstResponseTime); + void writeCompleteCallback(Addr address, + uint64_t instSeqNum, + MachineType mach); + void readCallback(Addr address, DataBlock& data); void readCallback(Addr address, @@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort Cycles forwardRequestTime, Cycles firstResponseTime, bool isRegion); - /* atomics need their own callback because the data - might be const coming from SLICC */ + void atomicCallback(Addr address, MachineType mach, const DataBlock& data); - void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); - void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); - - // Alternate implementations in VIPER Coalescer - virtual RequestStatus makeRequest(PacketPtr pkt) override; - + RequestStatus makeRequest(PacketPtr pkt) override; int outstandingCount() const override { return m_outstanding_count; } bool @@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort GMTokenPort& getGMTokenPort() { return gmTokenPort; } - void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } Stats::Histogram& getLatencyHist() { return m_latencyHist; } @@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort getFirstResponseToCompletionDelayHist(const MachineType t) const { return *m_FirstResponseToCompletionDelayHist[t]; } - // Changed to protected to enable inheritance by VIPER Coalescer protected: bool tryCacheAccess(Addr addr, RubyRequestType type, Addr pc, RubyAccessMode access_mode, int size, DataBlock*& data_ptr); - // Alternate implementations in VIPER Coalescer - virtual void issueRequest(CoalescedRequest* crequest); - void kernelCallback(int wavfront_id); + // since the two following issue functions are protocol-specific, + // they must be implemented in a derived coalescer + virtual void issueRequest(CoalescedRequest* crequest) = 0; + virtual void issueMemSyncRequest(PacketPtr pkt) = 0; + + void kernelCallback(int wavefront_id); void hitCallback(CoalescedRequest* crequest, MachineType mach, @@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort bool success, bool isRegion); void completeHitCallback(std::vector & mylist); - virtual RubyRequestType getRequestType(PacketPtr pkt); // Attempt to remove a packet from the uncoalescedTable and coalesce @@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort EventFunctionWrapper issueEvent; - - // Changed to protected to enable inheritance by VIPER Coalescer protected: int m_max_outstanding_requests; Cycles m_deadlock_threshold; @@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort // an address, the are serviced in age order. 
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+ // a map between an instruction sequence number and its PendingWriteInst + // this is used to do a final call back for each write when it is + // completely done in the memory system + std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts; +
// Global outstanding request count, across all request tables int m_outstanding_count; bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ EventFunctionWrapper deadlockCheckEvent; bool assumingRfOCoherence;
- // m5 style stats for TCP hit/miss counts - Stats::Scalar GPU_TCPLdHits; - Stats::Scalar GPU_TCPLdTransfers; - Stats::Scalar GPU_TCCLdHits; - Stats::Scalar GPU_LdMiss; -
- Stats::Scalar GPU_TCPStHits; - Stats::Scalar GPU_TCPStTransfers; - Stats::Scalar GPU_TCCStHits; - Stats::Scalar GPU_StMiss; -
- Stats::Scalar CP_TCPLdHits; - Stats::Scalar CP_TCPLdTransfers; - Stats::Scalar CP_TCCLdHits; - Stats::Scalar CP_LdMiss; -
- Stats::Scalar CP_TCPStHits; - Stats::Scalar CP_TCPStTransfers; - Stats::Scalar CP_TCCStHits; - Stats::Scalar CP_StMiss;
+// TODO - Need to update the following stats once the VIPER protocol +// is re-integrated. +// // m5 style stats for TCP hit/miss counts +// Stats::Scalar GPU_TCPLdHits; +// Stats::Scalar GPU_TCPLdTransfers; +// Stats::Scalar GPU_TCCLdHits; +// Stats::Scalar GPU_LdMiss; +//
+// Stats::Scalar GPU_TCPStHits; +// Stats::Scalar GPU_TCPStTransfers; +// Stats::Scalar GPU_TCCStHits; +// Stats::Scalar GPU_StMiss; +//
+// Stats::Scalar CP_TCPLdHits; +// Stats::Scalar CP_TCPLdTransfers; +// Stats::Scalar CP_TCCLdHits; +// Stats::Scalar CP_LdMiss; +//
+// Stats::Scalar CP_TCPStHits; +// Stats::Scalar CP_TCPStTransfers; +// Stats::Scalar CP_TCCStHits; +// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle. Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist; std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
+// TODO - Need to update the following stats once the VIPER protocol +// is re-integrated. +// Stats::Distribution numHopDelays; +// Stats::Distribution tcpToTccDelay; +// Stats::Distribution tccToSdDelay; +// Stats::Distribution sdToSdDelay; +// Stats::Distribution sdToTccDelay; +// Stats::Distribution tccToTcpDelay; +//
+// Stats::Average avgTcpToTcc; +// Stats::Average avgTccToSd; +// Stats::Average avgSdToSd; +// Stats::Average avgSdToTcc; +// Stats::Average avgTccToTcp; +
private: // Token port is used to send/receive tokens to/from GPU's global memory // pipeline across the port boundary. There is one per data
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py index 0335981c0..3345f7f94 100644 --- a/src/mem/ruby/system/GPUCoalescer.py +++ b/src/mem/ruby/system/GPUCoalescer.py @@ -36,6 +36,7 @@ from m5.objects.Sequencer import * class RubyGPUCoalescer(RubyPort): type = 'RubyGPUCoalescer' + abstract = True cxx_class = 'GPUCoalescer' cxx_header = "mem/ruby/system/GPUCoalescer.hh" @@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort): "max requests (incl. 
prefetches) outstanding") max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \ "coalesced in a single cycle") - assume_rfo = Param.Bool(True, "assume protocol implementes Read for " - "Ownership coherence"); icache = Param.RubyCache("") dcache = Param.RubyCache("") diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index 78ad2912c..659c9fd34 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer VIPERCoalescer(const Params *); ~VIPERCoalescer(); - void issueMemSyncRequest(PacketPtr pkt); + void issueMemSyncRequest(PacketPtr pkt) override; void issueRequest(CoalescedRequest* crequest) override; void wbCallback(Addr address); void invCallback(Addr address); diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py index d8adb07d0..d4af1be4f 100644 --- a/src/mem/ruby/system/VIPERCoalescer.py +++ b/src/mem/ruby/system/VIPERCoalescer.py @@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer): cxx_header = "mem/ruby/system/VIPERCoalescer.hh" max_inv_per_cycle = Param.Int(32, "max invalidations per cycle") max_wb_per_cycle = Param.Int(32, "max writebacks per cycle") - assume_rfo = False
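
The waitcnt fields added to src/gpu-compute/wavefront.hh above gate instruction issue on outstanding memory operations. The following is a minimal, illustrative sketch of that check; the outstanding-count members are hypothetical names, not the gem5 Wavefront interface, and a value of -1 models "no waitcnt pending" for that class of operations.

// Illustrative only: hypothetical names, not the gem5 Wavefront API.
struct WaitCntSketch
{
    int vmWaitCnt = -1;    // vector memory ops
    int expWaitCnt = -1;   // exports / vector memory writes
    int lgkmWaitCnt = -1;  // LDS, GDS, scalar memory, message ops

    int outstandingVmem = 0;  // counts maintained as requests issue/return
    int outstandingExp = 0;
    int outstandingLgkm = 0;

    // An instruction stalled on s_waitcnt may issue only once every pending
    // count has dropped to, or below, its programmed threshold.
    bool waitCntsSatisfied() const
    {
        if (vmWaitCnt != -1 && outstandingVmem > vmWaitCnt) return false;
        if (expWaitCnt != -1 && outstandingExp > expWaitCnt) return false;
        if (lgkmWaitCnt != -1 && outstandingLgkm > lgkmWaitCnt) return false;
        return true;
    }

    void clearWaitCnts() { vmWaitCnt = expWaitCnt = lgkmWaitCnt = -1; }
};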
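
The WriteCompleteResp command and the PendingWriteInst bookkeeping added above implement a two-phase store acknowledgment: a WriteResp frees coalescer resources, while a later WriteCompleteResp releases the wavefront's wait counters once every Ruby request of the instruction has completed. The sketch below models only that counting logic with stand-in names; it is not the patch's GPUCoalescer code.

// Simplified, hypothetical model of per-instruction write completion
// tracking: a write instruction may fan out into several Ruby requests,
// and the CU is acked only after the last one reports completion.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>

class WritePhaseTracker
{
  public:
    // called once per Ruby request issued for this instruction
    void addPendingReq(uint64_t instSeqNum) { pending[instSeqNum]++; }

    // called on each write completion; returns true when the instruction
    // is fully complete and a single response can be sent to the CU
    bool receiveWriteCompleteAck(uint64_t instSeqNum)
    {
        auto it = pending.find(instSeqNum);
        assert(it != pending.end() && it->second > 0);
        if (--it->second == 0) {
            pending.erase(it);
            return true;
        }
        return false;
    }

  private:
    std::unordered_map<uint64_t, int> pending;
};

int main()
{
    WritePhaseTracker tracker;
    tracker.addPendingReq(42);
    tracker.addPendingReq(42);  // two cache-line requests for one store
    std::cout << tracker.receiveWriteCompleteAck(42) << "\n";  // prints 0
    std::cout << tracker.receiveWriteCompleteAck(42) << "\n";  // prints 1
    return 0;
}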
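
GPUCoalescer::makeRequest, as rewritten above, sends memory-sync packets straight to the cache system and buffers loads and stores per instruction sequence number until the issue event coalesces them. A simplified, hypothetical model of that routing decision is sketched below; the types and names are stand-ins with no gem5 dependencies.

#include <cstdint>
#include <deque>
#include <functional>
#include <map>
#include <utility>

// Stand-in types; not gem5 classes.
enum class PktKind { Read, Write, MemSync };
struct PktSketch { uint64_t instSeqNum; PktKind kind; };

class CoalescerSketch
{
  public:
    explicit CoalescerSketch(std::function<void(const PktSketch&)> syncIssuer)
        : issueMemSync(std::move(syncIssuer)) {}

    // Mirrors the routing policy: sync ops bypass coalescing, loads/stores
    // are buffered by instruction sequence number for a later issue pass.
    void makeRequest(const PktSketch &pkt)
    {
        if (pkt.kind == PktKind::MemSync) {
            issueMemSync(pkt);           // straight to the cache system
        } else {
            uncoalesced[pkt.instSeqNum].push_back(pkt);
            needIssuePass = true;        // models scheduling the issue event
        }
    }

    // Models the issue event: drain buffered packets instruction by
    // instruction and hand them to the protocol-specific issue function.
    template <typename IssueFn>
    void issuePass(IssueFn issue)
    {
        for (auto &entry : uncoalesced)
            for (auto &pkt : entry.second)
                issue(pkt);
        uncoalesced.clear();
        needIssuePass = false;
    }

  private:
    std::function<void(const PktSketch&)> issueMemSync;
    std::map<uint64_t, std::deque<PktSketch>> uncoalesced;
    bool needIssuePass = false;
};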