From: Matt Sinclair
Date: Wed, 7 Mar 2018 22:54:19 +0000 (-0500)
Subject: arch-gcn3: add support for unaligned accesses
X-Git-Tag: v20.1.0.0~560
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8177fc439229f2caf8f9a6768bb2d7b43dabb4e6;p=gem5.git

arch-gcn3: add support for unaligned accesses

Previously, with HSAIL, the HSA specification guaranteed that the GPU
would never issue unaligned accesses. However, now that we run GCN
directly, this is no longer true. Accordingly, this commit adds support
for unaligned accesses. Moreover, to reduce the replication of nearly
identical code for the different request types, I also added new helper
functions that are called by all of the memory-request-producing
instruction types in op_encodings.hh.

Adding support for unaligned accesses requires changing the
statusBitVector used to track the status of each lane's memory requests
from a bit per lane to an int per lane. This is necessary because an
unaligned access may span multiple cache lines; in the worst case, every
lane's access does. The files that use the statusBitVector are updated
accordingly.

Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29920
Reviewed-by: Anthony Gutierrez
Reviewed-by: Matt Sinclair
Maintainer: Anthony Gutierrez
Tested-by: kokoro
---

diff --git a/src/arch/gcn3/gpu_mem_helpers.hh b/src/arch/gcn3/gpu_mem_helpers.hh
new file mode 100644
index 000000000..40ca56561
--- /dev/null
+++ b/src/arch/gcn3/gpu_mem_helpers.hh
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * + * Authors: Matt Sinclair + */ + +#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__ +#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__ + +#include "arch/gcn3/insts/gpu_static_inst.hh" +#include "arch/gcn3/insts/op_encodings.hh" +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_dyn_inst.hh" + +/** + * Helper function for instructions declared in op_encodings. This function + * takes in all of the arguments for a given memory request we are trying to + * initialize, then submits the request or requests depending on if the + * original request is aligned or unaligned. + */ +template +inline void +initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type, + bool is_atomic=false) +{ + // local variables + int req_size = N * sizeof(T); + int block_size = gpuDynInst->computeUnit()->cacheLineSize(); + Addr vaddr = 0, split_addr = 0; + bool misaligned_acc = false; + RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr; + PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr; + + gpuDynInst->resetEntireStatusVector(); + for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vaddr = gpuDynInst->addr[lane]; + + /** + * the base address of the cache line where the the last + * byte of the request will be stored. + */ + split_addr = roundDown(vaddr + req_size - 1, block_size); + + assert(split_addr <= vaddr || split_addr - vaddr < block_size); + /** + * if the base cache line address of the last byte is + * greater than the address of the first byte then we have + * a misaligned access. + */ + misaligned_acc = split_addr > vaddr; + + if (is_atomic) { + req = std::make_shared(vaddr, sizeof(T), 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId, + gpuDynInst->makeAtomicOpFunctor( + &(reinterpret_cast(gpuDynInst->a_data))[lane], + &(reinterpret_cast(gpuDynInst->x_data))[lane])); + } else { + req = std::make_shared(vaddr, req_size, 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId); + } + + if (misaligned_acc) { + gpuDynInst->setStatusVector(lane, 2); + req->splitOnVaddr(split_addr, req1, req2); + gpuDynInst->setRequestFlags(req1); + gpuDynInst->setRequestFlags(req2); + pkt1 = new Packet(req1, mem_req_type); + pkt2 = new Packet(req2, mem_req_type); + pkt1->dataStatic(&(reinterpret_cast( + gpuDynInst->d_data))[lane * N]); + pkt2->dataStatic(&(reinterpret_cast( + gpuDynInst->d_data))[lane * N + req1->getSize()]); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory " + "request for %#x\n", gpuDynInst->cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, lane, + split_addr); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2); + } else { + gpuDynInst->setStatusVector(lane, 1); + gpuDynInst->setRequestFlags(req); + pkt = new Packet(req, mem_req_type); + pkt->dataStatic(&(reinterpret_cast( + gpuDynInst->d_data))[lane * N]); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt); + } + } else { // if lane is not active, then no pending requests + gpuDynInst->setStatusVector(lane, 0); + } + } +} + +/** + * Helper function for scalar instructions declared in op_encodings. This + * function takes in all of the arguments for a given memory request we are + * trying to initialize, then submits the request or requests depending on if + * the original request is aligned or unaligned. 
+ */ +template +inline void +initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type) +{ + int req_size = N * sizeof(T); + int block_size = gpuDynInst->computeUnit()->cacheLineSize(); + Addr vaddr = gpuDynInst->scalarAddr; + + /** + * the base address of the cache line where the the last byte of + * the request will be stored. + */ + Addr split_addr = roundDown(vaddr + req_size - 1, block_size); + + assert(split_addr <= vaddr || split_addr - vaddr < block_size); + /** + * if the base cache line address of the last byte is greater + * than the address of the first byte then we have a misaligned + * access. + */ + bool misaligned_acc = split_addr > vaddr; + + RequestPtr req = std::make_shared(vaddr, req_size, 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId); + + if (misaligned_acc) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + gpuDynInst->numScalarReqs = 2; + gpuDynInst->setRequestFlags(req1); + gpuDynInst->setRequestFlags(req2); + PacketPtr pkt1 = new Packet(req1, mem_req_type); + PacketPtr pkt2 = new Packet(req2, mem_req_type); + pkt1->dataStatic(gpuDynInst->scalar_data); + pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for" + " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, split_addr); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); + } else { + gpuDynInst->numScalarReqs = 1; + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, mem_req_type); + pkt->dataStatic(gpuDynInst->scalar_data); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); + } +} + +#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__ diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 3197dc078..308560a5f 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -37,6 +37,7 @@ #define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__ #include "arch/gcn3/gpu_decoder.hh" +#include "arch/gcn3/gpu_mem_helpers.hh" #include "arch/gcn3/insts/gpu_static_inst.hh" #include "arch/gcn3/operand.hh" #include "debug/GPUExec.hh" @@ -174,47 +175,8 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - int req_size = N * sizeof(ScalarRegU32); - Addr vaddr = gpuDynInst->scalarAddr; - - /** - * the base address of the cache line where the the last byte of - * the request will be stored. - */ - Addr split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is greater - * than the address of the first byte then we have a misaligned - * access. 
- */ - bool misaligned_acc = split_addr > vaddr; - - RequestPtr req = std::make_shared(vaddr, req_size, 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - if (misaligned_acc) { - RequestPtr req1, req2; - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->numScalarReqs = 2; - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq); - PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq); - pkt1->dataStatic(gpuDynInst->scalar_data); - pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - } else { - gpuDynInst->numScalarReqs = 1; - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(gpuDynInst->scalar_data); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); - } + initMemReqScalarHelper(gpuDynInst, + MemCmd::ReadReq); } /** @@ -224,47 +186,8 @@ namespace Gcn3ISA void initMemWrite(GPUDynInstPtr gpuDynInst) { - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - int req_size = N * sizeof(ScalarRegU32); - Addr vaddr = gpuDynInst->scalarAddr; - - /** - * the base address of the cache line where the the last byte of - * the request will be stored. - */ - Addr split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is greater - * than the address of the first byte then we have a misaligned - * access. - */ - bool misaligned_acc = split_addr > vaddr; - - RequestPtr req = std::make_shared(vaddr, req_size, 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - if (misaligned_acc) { - RequestPtr req1, req2; - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->numScalarReqs = 2; - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq); - PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq); - pkt1->dataStatic(gpuDynInst->scalar_data); - pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - } else { - gpuDynInst->numScalarReqs = 1; - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(gpuDynInst->scalar_data); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); - } + initMemReqScalarHelper(gpuDynInst, + MemCmd::WriteReq); } void @@ -566,59 +489,22 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane]); - - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } template void initMemWrite(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; 
++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } void injectGlobalMemFence(GPUDynInstPtr gpuDynInst) { // create request and set flags - gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->resetEntireStatusVector(); + gpuDynInst->setStatusVector(0, 1); RequestPtr req = std::make_shared(0, 0, 0, gpuDynInst->computeUnit()-> masterId(), 0, @@ -771,133 +657,35 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit() - ->sendRequest(gpuDynInst, lane, pkt); - } - } + initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } template void initMemRead(GPUDynInstPtr gpuDynInst) { - int req_size = N * sizeof(VecElemU32); - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared(vaddr, req_size, - 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane * N]); - gpuDynInst->computeUnit() - ->sendRequest(gpuDynInst, lane, pkt); - } - } + initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } template void initMemWrite(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } template void initMemWrite(GPUDynInstPtr gpuDynInst) { - int req_size = N * sizeof(VecElemU32); - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared(vaddr, req_size, - 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane * N]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper(gpuDynInst, 
MemCmd::WriteReq); } template void initAtomicAccess(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId, - gpuDynInst->makeAtomicOpFunctor( - &(reinterpret_cast(gpuDynInst->a_data))[lane], - &(reinterpret_cast( - gpuDynInst->x_data))[lane])); - - gpuDynInst->setRequestFlags(req); - - PacketPtr pkt = new Packet(req, MemCmd::SwapReq); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane]); - - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); } void diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index feeb803e1..b0616d677 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -832,7 +832,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) gpuDynInst->wfSlotId, gpuDynInst->wfDynId, gpuDynInst->disassemble(), w->outstandingReqs, w->outstandingReqs - 1); - if (gpuDynInst->statusBitVector.none()) { + if (gpuDynInst->allLanesZero()) { // ask gm pipe to decrement request counters, instead of directly // performing here, to avoid asynchronous counter update and // instruction retirement (which may hurt waincnt effects) @@ -1078,7 +1078,6 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); gpuDynInst->tlbHitLevel[index] = hit_level; - // translation is done. Schedule the mem_req_event at the // appropriate cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = @@ -1116,9 +1115,9 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) } } else { if (pkt->cmd == MemCmd::MemSyncReq) { - gpuDynInst->statusBitVector = VectorMask(0); + gpuDynInst->resetEntireStatusVector(); } else { - gpuDynInst->statusBitVector &= (~(1ll << index)); + gpuDynInst->decrementStatusVector(index); } // New SenderState for the memory access @@ -1289,12 +1288,10 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) gpuDynInst->memStatusVector[paddr].pop_back(); gpuDynInst->pAddr = pkt->req->getPaddr(); - gpuDynInst->statusBitVector &= (~(1ULL << index)); - - DPRINTF(GPUMem, "bitvector is now %#x\n", - gpuDynInst->statusBitVector); + gpuDynInst->decrementStatusVector(index); + DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector()); - if (gpuDynInst->statusBitVector == VectorMask(0)) { + if (gpuDynInst->allLanesZero()) { auto iter = gpuDynInst->memStatusVector.begin(); auto end = gpuDynInst->memStatusVector.end(); diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 74b963b73..2a49522da 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -42,9 +42,10 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, InstSeqNum instSeqNum) : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(), - (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false), + (Addr)0), numScalarReqs(0), isSaveRestore(false), _staticInst(static_inst), _seqNum(instSeqNum) { + statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0); tlbHitLevel.assign(computeUnit()->wfSize(), -1); // vector instructions can have up to 4 
source/destination operands d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)]; diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 392b57d12..3d2fa0d3f 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,6 +39,8 @@ #include "base/amo.hh" #include "base/logging.hh" +#include "base/trace.hh" +#include "debug/GPUMem.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -307,13 +309,103 @@ class GPUDynInst : public GPUExecContext } } + // reset the number of pending memory requests for all lanes + void + resetEntireStatusVector() + { + assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg); + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + resetStatusVector(lane); + } + } + + // reset the number of pending memory requests for the inputted lane + void + resetStatusVector(int lane) + { + setStatusVector(lane, 0); + } + + // set the number of pending memory requests for the inputted lane + void + setStatusVector(int lane, int newVal) + { + // currently we can have up to 2 memory requests per lane (if the + // lane's request goes across multiple cache lines) + assert((newVal >= 0) && (newVal <= 2)); + statusVector[lane] = newVal; + } + + // subtracts the number of pending memory requests for the inputted lane + // by 1 + void + decrementStatusVector(int lane) + { + // this lane may have multiple requests, so only subtract one for + // this request + assert(statusVector[lane] >= 1); + statusVector[lane]--; + } + + // return the current number of pending memory requests for the inputted + // lane + int + getLaneStatus(int lane) const + { + return statusVector[lane]; + } + + // returns true if all memory requests from all lanes have been received, + // else returns false + bool + allLanesZero() const + { + // local variables + bool allZero = true; + + // iterate over all lanes, checking the number of pending memory + // requests they have + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + // if any lane still has pending requests, return false + if (statusVector[lane] > 0) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending " + "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane, + statusVector[lane], addr[lane]); + allZero = false; + } + } + + if (allZero) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending" + " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]); + } + return allZero; + } + + // returns a string representing the current state of the statusVector + std::string + printStatusVector() const + { + std::string statusVec_str = "["; + + // iterate over all lanes, adding the current number of pending + // requests for this lane to the string + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + statusVec_str += std::to_string(statusVector[lane]); + } + statusVec_str += "]"; + + return statusVec_str; + } + // Map returned packets and the addresses they satisfy with which lane they // were requested from typedef std::unordered_map> StatusVector; StatusVector memStatusVector; - // Track the status of memory requests per lane, a bit per lane - VectorMask statusBitVector; + // Track the status of memory requests per lane, an int per lane to allow + // unaligned accesses + std::vector statusVector; // for ld_v# or st_v# std::vector tlbHitLevel; diff --git a/src/mem/ruby/common/DataBlock.cc b/src/mem/ruby/common/DataBlock.cc index 
a4d7f4916..359f6bb0f 100644
--- a/src/mem/ruby/common/DataBlock.cc
+++ b/src/mem/ruby/common/DataBlock.cc
@@ -107,7 +107,6 @@ DataBlock::getDataMod(int offset)
 void
 DataBlock::setData(const uint8_t *data, int offset, int len)
 {
-    assert(offset + len <= RubySystem::getBlockSizeBytes());
     memcpy(&m_data[offset], data, len);
 }
 
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 83aaa1a50..92fed81dd 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -267,9 +267,6 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
             curTick() + rs->clockPeriod());
         return true;
     }
-
-    assert(getOffset(pkt->getAddr()) + pkt->getSize() <=
-           RubySystem::getBlockSizeBytes());
 }
 
 // Save the port in the sender state object to be used later to
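
The heart of the change is the split test in initMemReqHelper and
initMemReqScalarHelper above: compute the base address of the cache line that
holds the last byte of the access, and if that base is greater than the address
of the first byte, the access crosses a line boundary and is issued as two
requests via Request::splitOnVaddr(). The following stand-alone sketch shows
the same address arithmetic; it is not gem5 code, and the 64-byte line size and
example address are assumptions for illustration (the real code takes the line
size from the compute unit).

// Stand-alone sketch of the misaligned-access split used above.
// Assumptions for illustration only: 64-byte cache lines and the example
// address in main(); the real code gets the line size from the compute unit.
#include <cassert>
#include <cstdint>
#include <cstdio>

using Addr = std::uint64_t;

// Round addr down to a multiple of align (align must be a power of two),
// mirroring what roundDown() is used for in the patch.
static Addr roundDown(Addr addr, Addr align) { return addr & ~(align - 1); }

// Returns the number of pieces (1 or 2) a req_size-byte access at vaddr
// must be issued as, and fills in the address and size of each piece.
static int
splitIfMisaligned(Addr vaddr, int req_size, int block_size,
                  Addr piece_addr[2], int piece_size[2])
{
    // Base address of the cache line holding the last byte of the access.
    Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

    // A single access can cross at most one line boundary.
    assert(split_addr <= vaddr || split_addr - vaddr < block_size);

    if (split_addr > vaddr) {               // last byte lives in the next line
        piece_addr[0] = vaddr;
        piece_size[0] = split_addr - vaddr; // bytes that fit in the first line
        piece_addr[1] = split_addr;
        piece_size[1] = req_size - piece_size[0];
        return 2;
    }
    piece_addr[0] = vaddr;                  // aligned case: one request
    piece_size[0] = req_size;
    return 1;
}

int main()
{
    Addr a[2];
    int s[2];
    // An 8-byte access at 0x7c with 64-byte lines ends at 0x83, which is in
    // the line starting at 0x80, so it splits into two 4-byte pieces.
    int n = splitIfMisaligned(0x7c, 8, 64, a, s);
    for (int i = 0; i < n; ++i)
        std::printf("piece %d: addr %#llx, size %d\n", i,
                    static_cast<unsigned long long>(a[i]), s[i]);
    return 0;
}

In the patch itself, each piece gets its own Packet whose dataStatic pointer is
offset by the size of the first piece, so the two responses land in the correct
halves of the lane's data buffer.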
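
Because one lane can now have two requests in flight, per-lane completion
tracking can no longer be a single bit. The patch replaces
GPUDynInst::statusBitVector with a std::vector<int> plus the
resetEntireStatusVector / setStatusVector / decrementStatusVector /
allLanesZero helpers shown above. A minimal sketch of that bookkeeping follows;
the class name, lane count, and main() are illustrative assumptions, not gem5
code.

// Sketch of the per-lane outstanding-request counts; the class name, lane
// count, and main() are illustrative assumptions, not gem5 code.
#include <cassert>
#include <vector>

class LaneStatus
{
  public:
    explicit LaneStatus(int num_lanes) : status(num_lanes, 0) {}

    // 0 = inactive lane, 1 = one request, 2 = request split across two lines.
    void set(int lane, int pending)
    {
        assert(pending >= 0 && pending <= 2);
        status[lane] = pending;
    }

    // Called once per returning response for the lane.
    void decrement(int lane)
    {
        assert(status[lane] >= 1);
        --status[lane];
    }

    // The instruction may complete only when every lane has drained.
    bool allZero() const
    {
        for (int pending : status)
            if (pending > 0)
                return false;
        return true;
    }

  private:
    std::vector<int> status;
};

int main()
{
    LaneStatus s(4);
    s.set(0, 1);              // aligned access on lane 0
    s.set(1, 2);              // misaligned access on lane 1: two pieces
    s.decrement(0);
    s.decrement(1);
    assert(!s.allZero());     // lane 1 still waits for its second response
    s.decrement(1);
    assert(s.allZero());      // now the instruction can complete
    return 0;
}

The upper bound of 2 mirrors the assert in the patch's setStatusVector, which
assumes a single lane's access spans at most two cache lines.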