From c58cb8c9dbeef377da180f1fdaaa1c0eadf85550 Mon Sep 17 00:00:00 2001
From: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Date: Fri, 7 Jul 2017 14:13:11 +0100
Subject: [PATCH] cpu,mem: Add support for partial loads/stores and wide mem.
 accesses

This changeset adds support for partial (or masked) loads/stores, i.e.
loads/stores that can disable accesses to individual bytes within the
target address range.  In addition, this changeset extends the code to
crack memory accesses across most CPU models (TimingSimpleCPU still
TBD), so that arbitrarily wide memory accesses are supported.  These
changes are required for supporting ISAs with wide vectors.

Additional authors:
- Gabor Dozsa <gabor.dozsa@arm.com>
- Tiago Muck <tiago.muck@arm.com>

Change-Id: Ibad33541c258ad72925c0b1d5abc3e5e8bf92d92
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/13518
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
---
 src/cpu/base.hh                |   4 +-
 src/cpu/base_dyn_inst.hh       |  17 ++--
 src/cpu/checker/cpu.cc         | 120 +++++++++++++++++---------
 src/cpu/checker/cpu.hh         |  29 ++++++-
 src/cpu/exec_context.hh        |  10 ++-
 src/cpu/minor/dyn_inst.hh      |  17 +++-
 src/cpu/minor/exec_context.hh  |  22 +++--
 src/cpu/minor/execute.cc       |  10 ++-
 src/cpu/minor/lsq.cc           | 148 +++++++++++++++++++++------------
 src/cpu/minor/lsq.hh           |  11 ++-
 src/cpu/o3/cpu.hh              |   7 +-
 src/cpu/o3/lsq.hh              |  48 ++++++++---
 src/cpu/o3/lsq_impl.hh         | 129 +++++++++++++++-------------
 src/cpu/o3/lsq_unit_impl.hh    |   3 +-
 src/cpu/simple/atomic.cc       | 138 ++++++++++++++++++------------
 src/cpu/simple/atomic.hh       |  29 ++++++-
 src/cpu/simple/base.cc         |   6 +-
 src/cpu/simple/base.hh         |  14 +++-
 src/cpu/simple/exec_context.hh |  20 +++--
 src/cpu/simple/timing.cc       |  18 +++-
 src/cpu/simple/timing.hh       |  10 ++-
 src/cpu/simple_thread.cc       |   7 +-
 src/cpu/utils.hh               |  96 +++++++++++++++++++++
 src/mem/abstract_mem.cc        |   2 +-
 src/mem/cache/cache.cc         |   3 +-
 src/mem/packet.hh              |  35 +++++++-
 src/mem/request.hh             |  27 ++++++
 27 files changed, 711 insertions(+), 269 deletions(-)
 create mode 100644 src/cpu/utils.hh
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index f013a3e02..3d679f172 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -175,9 +175,9 @@ class BaseCPU : public ClockedObject
     uint32_t socketId() const { return _socketId; }
 
     /** Reads this CPU's unique data requestor ID */
-    MasterID dataMasterId() { return _dataMasterId; }
+    MasterID dataMasterId() const { return _dataMasterId; }
     /** Reads this CPU's unique instruction requestor ID */
-    MasterID instMasterId() { return _instMasterId; }
+    MasterID instMasterId() const { return _instMasterId; }
 
     /**
      * Get a port on this CPU. All CPUs have a data and
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index 4084241bd..22a32ec10 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -298,10 +298,12 @@ class BaseDynInst : public ExecContext, public RefCounted
         cpu->demapPage(vaddr, asn);
     }
 
-    Fault initiateMemRead(Addr addr, unsigned size, Request::Flags flags);
+    Fault initiateMemRead(Addr addr, unsigned size, Request::Flags flags,
+            const std::vector<bool>& byteEnable = std::vector<bool>());
 
     Fault writeMem(uint8_t *data, unsigned size, Addr addr,
-                   Request::Flags flags, uint64_t *res);
+                   Request::Flags flags, uint64_t *res,
+                   const std::vector<bool>& byteEnable = std::vector<bool>());
 
     Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
                          AtomicOpFunctor *amo_op);
@@ -918,21 +920,24 @@ class BaseDynInst : public ExecContext, public RefCounted
 template<class Impl>
 Fault
 BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size,
-                                   Request::Flags flags)
+                                   Request::Flags flags,
+                                   const std::vector<bool>& byteEnable)
 {
     return cpu->pushRequest(
             dynamic_cast<typename DynInstPtr::PtrType>(this),
-            /* ld */ true, nullptr, size, addr, flags, nullptr);
+            /* ld */ true, nullptr, size, addr, flags, nullptr, nullptr,
+            byteEnable);
 }
 
 template<class Impl>
 Fault
 BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr,
-                            Request::Flags flags, uint64_t *res)
+                            Request::Flags flags, uint64_t *res,
+                            const std::vector<bool>& byteEnable)
 {
     return cpu->pushRequest(
             dynamic_cast<typename DynInstPtr::PtrType>(this),
-            /* st */ false, data, size, addr, flags, res);
+            /* st */ false, data, size, addr, flags, res, nullptr, byteEnable);
 }
 
 template<class Impl>
diff --git a/src/cpu/checker/cpu.cc b/src/cpu/checker/cpu.cc
index 7f8eada4c..cca6d6b12 100644
--- a/src/cpu/checker/cpu.cc
+++ b/src/cpu/checker/cpu.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011,2013,2017 ARM Limited
+ * Copyright (c) 2011,2013,2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -52,6 +52,7 @@
 #include "cpu/simple_thread.hh"
 #include "cpu/static_inst.hh"
 #include "cpu/thread_context.hh"
+#include "cpu/utils.hh"
 #include "params/CheckerCPU.hh"
 #include "sim/full_system.hh"
 
@@ -139,31 +140,68 @@ CheckerCPU::unserialize(CheckpointIn &cp)
 {
 }
 
+RequestPtr
+CheckerCPU::genMemFragmentRequest(Addr frag_addr, int size,
+                                  Request::Flags flags,
+                                  const std::vector<bool>& byte_enable,
+                                  int& frag_size, int& size_left) const
+{
+    frag_size = std::min(
+        cacheLineSize() - addrBlockOffset(frag_addr, cacheLineSize()),
+        (Addr) size_left);
+    size_left -= frag_size;
+
+    RequestPtr mem_req;
+
+    if (!byte_enable.empty()) {
+        // Set up byte-enable mask for the current fragment
+        auto it_start = byte_enable.cbegin() + (size - (frag_size +
+                                                        size_left));
+        auto it_end = byte_enable.cbegin() + (size - size_left);
+        if (isAnyActiveElement(it_start, it_end)) {
+            mem_req = std::make_shared<Request>(0, frag_addr, frag_size,
+                    flags, masterId, thread->pcState().instAddr(),
+                    tc->contextId());
+            mem_req->setByteEnable(std::vector<bool>(it_start, it_end));
+        }
+    } else {
+        mem_req = std::make_shared<Request>(0, frag_addr, frag_size,
+                    flags, masterId, thread->pcState().instAddr(),
+                    tc->contextId());
+    }
+
+    return mem_req;
+}
+
 Fault
 CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size,
-                    Request::Flags flags)
+                    Request::Flags flags,
+                    const std::vector<bool>& byteEnable)
 {
     Fault fault = NoFault;
-    int fullSize = size;
-    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
     bool checked_flags = false;
     bool flags_match = true;
     Addr pAddr = 0x0;
 
-
-    if (secondAddr > addr)
-       size = secondAddr - addr;
+    Addr frag_addr = addr;
+    int frag_size = 0;
+    int size_left = size;
+    bool predicate;
 
     // Need to account for multiple accesses like the Atomic and TimingSimple
     while (1) {
-        auto mem_req = std::make_shared<Request>(
-            0, addr, size, flags, masterId,
-            thread->pcState().instAddr(), tc->contextId());
+        RequestPtr mem_req = genMemFragmentRequest(frag_addr, size, flags,
+                                                   byteEnable, frag_size,
+                                                   size_left);
+
+        predicate = (mem_req != nullptr);
 
         // translate to physical address
-        fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Read);
+        if (predicate) {
+            fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Read);
+        }
 
-        if (!checked_flags && fault == NoFault && unverifiedReq) {
+        if (predicate && !checked_flags && fault == NoFault && unverifiedReq) {
             flags_match = checkFlags(unverifiedReq, mem_req->getVaddr(),
                                      mem_req->getPaddr(), mem_req->getFlags());
             pAddr = mem_req->getPaddr();
@@ -171,7 +209,7 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size,
         }
 
         // Now do the access
-        if (fault == NoFault &&
+        if (predicate && fault == NoFault &&
             !mem_req->getFlags().isSet(Request::NO_ACCESS)) {
             PacketPtr pkt = Packet::createRead(mem_req);
 
@@ -182,7 +220,7 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size,
                 dcachePort->sendFunctional(pkt);
             } else {
                 // Assume the data is correct if it's an uncached access
-                memcpy(data, unverifiedMemData, size);
+                memcpy(data, unverifiedMemData, frag_size);
             }
 
             delete pkt;
@@ -196,22 +234,21 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size,
         }
 
         //If we don't need to access a second cache line, stop now.
-        if (secondAddr <= addr)
+        if (size_left == 0)
         {
             break;
         }
 
         // Setup for accessing next cache line
-        data += size;
-        unverifiedMemData += size;
-        size = addr + fullSize - secondAddr;
-        addr = secondAddr;
+        frag_addr += frag_size;
+        data += frag_size;
+        unverifiedMemData += frag_size;
     }
 
     if (!flags_match) {
         warn("%lli: Flags do not match CPU:%#x %#x %#x Checker:%#x %#x %#x\n",
              curTick(), unverifiedReq->getVaddr(), unverifiedReq->getPaddr(),
-             unverifiedReq->getFlags(), addr, pAddr, flags);
+             unverifiedReq->getFlags(), frag_addr, pAddr, flags);
         handleError();
     }
 
@@ -220,31 +257,35 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size,
 
 Fault
 CheckerCPU::writeMem(uint8_t *data, unsigned size,
-                     Addr addr, Request::Flags flags, uint64_t *res)
+                     Addr addr, Request::Flags flags, uint64_t *res,
+                     const std::vector<bool>& byteEnable)
 {
+    assert(byteEnable.empty() || byteEnable.size() == size);
+
     Fault fault = NoFault;
     bool checked_flags = false;
     bool flags_match = true;
     Addr pAddr = 0x0;
     static uint8_t zero_data[64] = {};
 
-    int fullSize = size;
-
-    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
-
-    if (secondAddr > addr)
-        size = secondAddr - addr;
+    Addr frag_addr = addr;
+    int frag_size = 0;
+    int size_left = size;
+    bool predicate;
 
     // Need to account for a multiple access like Atomic and Timing CPUs
     while (1) {
-        auto mem_req = std::make_shared<Request>(
-            0, addr, size, flags, masterId,
-            thread->pcState().instAddr(), tc->contextId());
+        RequestPtr mem_req = genMemFragmentRequest(frag_addr, size, flags,
+                                                   byteEnable, frag_size,
+                                                   size_left);
 
-        // translate to physical address
-        fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Write);
+        predicate = (mem_req != nullptr);
+
+        if (predicate) {
+            fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Write);
+        }
 
-        if (!checked_flags && fault == NoFault && unverifiedReq) {
+        if (predicate && !checked_flags && fault == NoFault && unverifiedReq) {
            flags_match = checkFlags(unverifiedReq, mem_req->getVaddr(),
                                     mem_req->getPaddr(), mem_req->getFlags());
            pAddr = mem_req->getPaddr();
@@ -261,7 +302,7 @@ CheckerCPU::writeMem(uint8_t *data, unsigned size,
         bool was_prefetch = mem_req->isPrefetch();
 
         //If we don't need to access a second cache line, stop now.
-        if (fault != NoFault || secondAddr <= addr)
+        if (fault != NoFault || size_left == 0)
         {
             if (fault != NoFault && was_prefetch) {
               fault = NoFault;
@@ -269,16 +310,13 @@ CheckerCPU::writeMem(uint8_t *data, unsigned size,
             break;
         }
 
-        //Update size and access address
-        size = addr + fullSize - secondAddr;
-        //And access the right address.
-        addr = secondAddr;
+        frag_addr += frag_size;
    }
 
    if (!flags_match) {
        warn("%lli: Flags do not match CPU:%#x %#x Checker:%#x %#x %#x\n",
             curTick(), unverifiedReq->getVaddr(), unverifiedReq->getPaddr(),
-            unverifiedReq->getFlags(), addr, pAddr, flags);
+            unverifiedReq->getFlags(), frag_addr, pAddr, flags);
        handleError();
    }
 
@@ -304,12 +342,12 @@ CheckerCPU::writeMem(uint8_t *data, unsigned size,
    // const set of zeros.
    if (flags & Request::STORE_NO_DATA) {
        assert(!data);
-       assert(sizeof(zero_data) <= fullSize);
+       assert(sizeof(zero_data) <= size);
        data = zero_data;
    }
 
    if (unverifiedReq && unverifiedMemData &&
-       memcmp(data, unverifiedMemData, fullSize) && extraData) {
+       memcmp(data, unverifiedMemData, size) && extraData) {
            warn("%lli: Store value does not match value sent to memory! "
                   "data: %#x inst_data: %#x", curTick(), data,
                   unverifiedMemData);
diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh
index 8c3000005..66632b720 100644
--- a/src/cpu/checker/cpu.hh
+++ b/src/cpu/checker/cpu.hh
@@ -531,11 +531,36 @@ class CheckerCPU : public BaseCPU, public ExecContext
         this->dtb->demapPage(vaddr, asn);
     }
 
+    /**
+     * Helper function used to generate the request for a single fragment of a
+     * memory access.
+     *
+     * Takes care of setting up the appropriate byte-enable mask for the
+     * fragment, given the mask for the entire memory access.
+     *
+     * @param frag_addr Start address of the fragment.
+     * @param size Total size of the memory access in bytes.
+     * @param flags Request flags.
+     * @param byte_enable Byte-enable mask for the entire memory access.
+     * @param[out] frag_size Fragment size.
+     * @param[in,out] size_left Size left to be processed in the memory access.
+     * @return Pointer to the allocated Request, nullptr if the byte-enable
+     * mask is all-false for the fragment.
+     */
+    RequestPtr genMemFragmentRequest(Addr frag_addr, int size,
+                                     Request::Flags flags,
+                                     const std::vector<bool>& byte_enable,
+                                     int& frag_size, int& size_left) const;
+
     Fault readMem(Addr addr, uint8_t *data, unsigned size,
-                  Request::Flags flags) override;
+                  Request::Flags flags,
+                  const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault writeMem(uint8_t *data, unsigned size, Addr addr,
-                   Request::Flags flags, uint64_t *res) override;
+                   Request::Flags flags, uint64_t *res,
+                   const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault amoMem(Addr addr, uint8_t* data, unsigned size,
                  Request::Flags flags, AtomicOpFunctor *amo_op) override
diff --git a/src/cpu/exec_context.hh b/src/cpu/exec_context.hh
index 4cad9e3e1..b294387e2 100644
--- a/src/cpu/exec_context.hh
+++ b/src/cpu/exec_context.hh
@@ -235,7 +235,8 @@ class ExecContext {
      * should never be called).
      */
     virtual Fault readMem(Addr addr, uint8_t *data, unsigned int size,
-                          Request::Flags flags)
+            Request::Flags flags,
+            const std::vector<bool>& byteEnable = std::vector<bool>())
     {
         panic("ExecContext::readMem() should be overridden\n");
     }
@@ -248,7 +249,8 @@ class ExecContext {
      * should never be called).
      */
     virtual Fault initiateMemRead(Addr addr, unsigned int size,
-                                  Request::Flags flags)
+            Request::Flags flags,
+            const std::vector<bool>& byteEnable = std::vector<bool>())
     {
         panic("ExecContext::initiateMemRead() should be overridden\n");
     }
@@ -258,7 +260,9 @@ class ExecContext {
      * For timing-mode contexts, initiate a timing memory write operation.
      */
     virtual Fault writeMem(uint8_t *data, unsigned int size, Addr addr,
-                           Request::Flags flags, uint64_t *res) = 0;
+                           Request::Flags flags, uint64_t *res,
+                           const std::vector<bool>& byteEnable =
+                               std::vector<bool>()) = 0;
 
     /**
      * For atomic-mode contexts, perform an atomic AMO (a.k.a., Atomic
diff --git a/src/cpu/minor/dyn_inst.hh b/src/cpu/minor/dyn_inst.hh
index b2decb39b..0a8ff8acf 100644
--- a/src/cpu/minor/dyn_inst.hh
+++ b/src/cpu/minor/dyn_inst.hh
@@ -202,6 +202,13 @@ class MinorDynInst : public RefCounted
      *  to allow other instructions to fill the fetch delay */
     bool canEarlyIssue;
 
+    /** Flag controlling conditional execution of the instruction */
+    bool predicate;
+
+    /** Flag controlling conditional execution of the memory access associated
+     *  with the instruction (only meaningful for loads/stores) */
+    bool memAccPredicate;
+
     /** execSeqNum of the latest inst on which this inst depends.
      *  This can be used as a sanity check for dependency ordering
      *  where slightly out of order execution is required (notably
@@ -227,7 +234,7 @@ class MinorDynInst : public RefCounted
         pc(TheISA::PCState(0)), fault(fault_),
         triedToPredict(false), predictedTaken(false),
         fuIndex(0), inLSQ(false), inStoreBuffer(false),
-        canEarlyIssue(false),
+        canEarlyIssue(false), predicate(true), memAccPredicate(true),
         instToWaitFor(0), extraCommitDelay(Cycles(0)),
         extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0))
     { }
@@ -266,6 +273,14 @@ class MinorDynInst : public RefCounted
     /** ReportIF interface */
     void reportData(std::ostream &os) const;
 
+    bool readPredicate() const { return predicate; }
+
+    void setPredicate(bool val) { predicate = val; }
+
+    bool readMemAccPredicate() const { return memAccPredicate; }
+
+    void setMemAccPredicate(bool val) { memAccPredicate = val; }
+
     ~MinorDynInst();
 };
 
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index b39bbac3f..9f6fce4cd 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -96,28 +96,40 @@ class ExecContext : public ::ExecContext
     {
         DPRINTF(MinorExecute, "ExecContext setting PC: %s\n", inst->pc);
         pcState(inst->pc);
-        setPredicate(true);
+        setPredicate(inst->readPredicate());
+        setMemAccPredicate(inst->readMemAccPredicate());
         thread.setIntReg(TheISA::ZeroReg, 0);
 #if THE_ISA == ALPHA_ISA
         thread.setFloatReg(TheISA::ZeroReg, 0);
 #endif
     }
 
+    ~ExecContext()
+    {
+        inst->setPredicate(readPredicate());
+        inst->setMemAccPredicate(readMemAccPredicate());
+    }
+
     Fault
     initiateMemRead(Addr addr, unsigned int size,
-                    Request::Flags flags) override
+                    Request::Flags flags,
+                    const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
         execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
-            size, addr, flags, NULL, nullptr);
+            size, addr, flags, nullptr, nullptr, byteEnable);
         return NoFault;
     }
 
     Fault
     writeMem(uint8_t *data, unsigned int size, Addr addr,
-             Request::Flags flags, uint64_t *res) override
+             Request::Flags flags, uint64_t *res,
+             const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
+        assert(byteEnable.empty() || byteEnable.size() == size);
         execute.getLSQ().pushRequest(inst, false /* store */, data,
-            size, addr, flags, res, nullptr);
+            size, addr, flags, res, nullptr, byteEnable);
         return NoFault;
     }
 
diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index 47f3cbc68..527eb2bc0 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014 ARM Limited
+ * Copyright (c) 2013-2014,2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -364,6 +364,8 @@ Execute::handleMemResponse(MinorDynInstPtr inst,
         DPRINTF(MinorMem, "Completing failed request inst: %s\n",
             *inst);
         use_context_predicate = false;
+        if (!context.readMemAccPredicate())
+            inst->staticInst->completeAcc(nullptr, &context, inst->traceData);
     } else if (packet->isError()) {
         DPRINTF(MinorMem, "Trying to commit error response: %s\n",
             *inst);
@@ -481,6 +483,10 @@ Execute::executeMemRefInst(MinorDynInstPtr inst, BranchData &branch,
         } else {
             /* Only set this if the instruction passed its
              * predicate */
+            if (!context.readMemAccPredicate()) {
+                DPRINTF(MinorMem, "No memory access for inst: %s\n", *inst);
+                assert(context.readPredicate());
+            }
             passed_predicate = context.readPredicate();
 
             /* Set predicate in tracing */
@@ -928,7 +934,7 @@ Execute::commitInst(MinorDynInstPtr inst, bool early_memory_issue,
                  *  until it gets to the head of inFlightInsts */
                 inst->canEarlyIssue = false;
                 /* Not completed as we'll come here again to pick up
-                *  the fault when we get to the end of the FU */
+                 * the fault when we get to the end of the FU */
                 completed_inst = false;
             } else {
                 DPRINTF(MinorExecute, "Fault in execute: %s\n",
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index 6fe6c3738..1d9f17e8d 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014,2017 ARM Limited
+ * Copyright (c) 2013-2014,2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -49,27 +49,13 @@
 #include "cpu/minor/exec_context.hh"
 #include "cpu/minor/execute.hh"
 #include "cpu/minor/pipeline.hh"
+#include "cpu/utils.hh"
 #include "debug/Activity.hh"
 #include "debug/MinorMem.hh"
 
 namespace Minor
 {
 
-/** Returns the offset of addr into an aligned a block of size block_size */
-static Addr
-addrBlockOffset(Addr addr, unsigned int block_size)
-{
-    return addr & (block_size - 1);
-}
-
-/** Returns true if the given [addr .. addr+size-1] transfer needs to be
- *  fragmented across a block size of block_size */
-static bool
-transferNeedsBurst(Addr addr, unsigned int size, unsigned int block_size)
-{
-    return (addrBlockOffset(addr, block_size) + size) > block_size;
-}
-
 LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
     PacketDataPtr data_, uint64_t *res_) :
     SenderState(),
@@ -88,6 +74,13 @@ LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
     request = std::make_shared<Request>();
 }
 
+void
+LSQ::LSQRequest::disableMemAccess()
+{
+    port.cpu.threads[inst->id.threadId]->setMemAccPredicate(false);
+    DPRINTFS(MinorMem, (&port), "Disable mem access for inst:%s\n", *inst);
+}
+
 LSQ::AddrRangeCoverage
 LSQ::LSQRequest::containsAddrRangeOf(
     Addr req1_addr, unsigned int req1_size,
@@ -256,16 +249,23 @@ LSQ::SingleDataRequest::startAddrTranslation()
     ThreadContext *thread = port.cpu.getContext(
         inst->id.threadId);
 
-    port.numAccessesInDTLB++;
+    const auto &byteEnable = request->getByteEnable();
+    if (byteEnable.size() == 0 ||
+        isAnyActiveElement(byteEnable.cbegin(), byteEnable.cend())) {
+        port.numAccessesInDTLB++;
 
-    setState(LSQ::LSQRequest::InTranslation);
+        setState(LSQ::LSQRequest::InTranslation);
 
-    DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n");
-    /* Submit the translation request.  The response will come through
-     *  finish/markDelayed on the LSQRequest as it bears the Translation
-     *  interface */
-    thread->getDTBPtr()->translateTiming(
-        request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write));
+        DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n");
+        /* Submit the translation request.  The response will come through
+         *  finish/markDelayed on the LSQRequest as it bears the Translation
+         *  interface */
+        thread->getDTBPtr()->translateTiming(
+            request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write));
+    } else {
+        disableMemAccess();
+        setState(LSQ::LSQRequest::Complete);
+    }
 }
 
 void
@@ -357,6 +357,8 @@ LSQ::SplitDataRequest::makeFragmentRequests()
     unsigned int fragment_size;
     Addr fragment_addr;
 
+    std::vector<bool> fragment_write_byte_en;
+
     /* Assume that this transfer is across potentially many block snap
      * boundaries:
      *
@@ -401,6 +403,9 @@ LSQ::SplitDataRequest::makeFragmentRequests()
     /* Just past the last address in the request */
     Addr end_addr = base_addr + whole_size;
 
+    auto& byte_enable = request->getByteEnable();
+    unsigned int num_disabled_fragments = 0;
+
     for (unsigned int fragment_index = 0; fragment_index < numFragments;
          fragment_index++)
     {
@@ -421,32 +426,58 @@ LSQ::SplitDataRequest::makeFragmentRequests()
         }
 
         RequestPtr fragment = std::make_shared<Request>();
+        bool disabled_fragment = false;
 
         fragment->setContext(request->contextId());
-        fragment->setVirt(0 /* asid */,
-            fragment_addr, fragment_size, request->getFlags(),
-            request->masterId(),
-            request->getPC());
+        if (byte_enable.empty()) {
+            fragment->setVirt(0 /* asid */,
+                fragment_addr, fragment_size, request->getFlags(),
+                request->masterId(),
+                request->getPC());
+        } else {
+            // Set up byte-enable mask for the current fragment
+            auto it_start = byte_enable.begin() +
+                (fragment_addr - base_addr);
+            auto it_end = byte_enable.begin() +
+                (fragment_addr - base_addr) + fragment_size;
+            if (isAnyActiveElement(it_start, it_end)) {
+                fragment->setVirt(0 /* asid */,
+                    fragment_addr, fragment_size, request->getFlags(),
+                    request->masterId(),
+                    request->getPC());
+                fragment->setByteEnable(std::vector<bool>(it_start, it_end));
+            } else {
+                disabled_fragment = true;
+            }
+        }
 
-        DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x size: %d"
-            " (whole request addr: 0x%x size: %d) %s\n",
-            fragment_addr, fragment_size, base_addr, whole_size,
-            (is_last_fragment ? "last fragment" : ""));
+        if (!disabled_fragment) {
+            DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x"
+                " size: %d (whole request addr: 0x%x size: %d) %s\n",
+                fragment_addr, fragment_size, base_addr, whole_size,
+                (is_last_fragment ? "last fragment" : ""));
 
-        fragment_addr += fragment_size;
+            fragmentRequests.push_back(fragment);
+        } else {
+            num_disabled_fragments++;
+        }
 
-        fragmentRequests.push_back(fragment);
+        fragment_addr += fragment_size;
     }
+    assert(numFragments >= num_disabled_fragments);
+    numFragments -= num_disabled_fragments;
 }
 
 void
 LSQ::SplitDataRequest::makeFragmentPackets()
 {
+    assert(numTranslatedFragments > 0);
     Addr base_addr = request->getVaddr();
 
     DPRINTFS(MinorMem, (&port), "Making packets for request: %s\n", *inst);
 
-    for (unsigned int fragment_index = 0; fragment_index < numFragments;
+    for (unsigned int fragment_index = 0;
+         fragment_index < numTranslatedFragments;
          fragment_index++)
     {
         RequestPtr fragment = fragmentRequests[fragment_index];
@@ -490,28 +521,32 @@ LSQ::SplitDataRequest::makeFragmentPackets()
 void
 LSQ::SplitDataRequest::startAddrTranslation()
 {
-    setState(LSQ::LSQRequest::InTranslation);
-
     makeFragmentRequests();
 
-    numInTranslationFragments = 0;
-    numTranslatedFragments = 0;
+    if (numFragments > 0) {
+        setState(LSQ::LSQRequest::InTranslation);
+        numInTranslationFragments = 0;
+        numTranslatedFragments = 0;
 
-    /* @todo, just do these in sequence for now with
-     * a loop of:
-     * do {
-     *  sendNextFragmentToTranslation ; translateTiming ; finish
-     * } while (numTranslatedFragments != numFragments);
-     */
+        /* @todo, just do these in sequence for now with
+         * a loop of:
+         * do {
+         *  sendNextFragmentToTranslation ; translateTiming ; finish
+         * } while (numTranslatedFragments != numFragments);
+         */
 
-    /* Do first translation */
-    sendNextFragmentToTranslation();
+        /* Do first translation */
+        sendNextFragmentToTranslation();
+    } else {
+        disableMemAccess();
+        setState(LSQ::LSQRequest::Complete);
+    }
 }
 
 PacketPtr
 LSQ::SplitDataRequest::getHeadPacket()
 {
-    assert(numIssuedFragments < numFragments);
+    assert(numIssuedFragments < numTranslatedFragments);
 
     return fragmentPackets[numIssuedFragments];
 }
@@ -519,7 +554,7 @@ LSQ::SplitDataRequest::getHeadPacket()
 void
 LSQ::SplitDataRequest::stepToNextPacket()
 {
-    assert(numIssuedFragments < numFragments);
+    assert(numIssuedFragments < numTranslatedFragments);
 
     numIssuedFragments++;
 }
@@ -527,14 +562,13 @@ LSQ::SplitDataRequest::stepToNextPacket()
 void
 LSQ::SplitDataRequest::retireResponse(PacketPtr response)
 {
-    assert(numRetiredFragments < numFragments);
+    assert(numRetiredFragments < numTranslatedFragments);
 
     DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d"
-        " offset: 0x%x (retired fragment num: %d) %s\n",
+        " offset: 0x%x (retired fragment num: %d)\n",
         response->req->getVaddr(), response->req->getSize(),
         request->getVaddr() - response->req->getVaddr(),
-        numRetiredFragments,
-        (fault == NoFault ? "" : fault->name()));
+        numRetiredFragments);
 
     numRetiredFragments++;
 
@@ -573,7 +607,7 @@ LSQ::SplitDataRequest::retireResponse(PacketPtr response)
             packet->makeResponse();
     }
 
-    if (numRetiredFragments == numFragments)
+    if (numRetiredFragments == numTranslatedFragments)
         setState(Complete);
 
     if (!skipped && isComplete()) {
@@ -1477,7 +1511,8 @@ LSQ::needsToTick()
 void
 LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                  unsigned int size, Addr addr, Request::Flags flags,
-                 uint64_t *res, AtomicOpFunctor *amo_op)
+                 uint64_t *res, AtomicOpFunctor *amo_op,
+                 const std::vector<bool>& byteEnable)
 {
     bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
 
@@ -1533,6 +1568,9 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
         addr, size, flags, cpu.dataMasterId(),
         /* I've no idea why we need the PC, but give it */
         inst->pc.instAddr(), amo_op);
+    if (!byteEnable.empty()) {
+        request->request->setByteEnable(byteEnable);
+    }
 
     requests.push(request);
     request->startAddrTranslation();
diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh
index 11fa8774f..23b47c53c 100644
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014 ARM Limited
+ * Copyright (c) 2013-2014, 2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -188,6 +188,8 @@ class LSQ : public Named
         /** BaseTLB::Translation interface */
         void markDelayed() { }
 
+        void disableMemAccess();
+
       public:
         LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
             PacketDataPtr data_ = NULL, uint64_t *res_ = NULL);
@@ -441,7 +443,8 @@ class LSQ : public Named
         { return numIssuedFragments != numRetiredFragments; }
 
         /** Have we stepped past the end of fragmentPackets? */
-        bool sentAllPackets() { return numIssuedFragments == numFragments; }
+        bool sentAllPackets()
+        { return numIssuedFragments == numTranslatedFragments; }
 
         /** For loads, paste the response data into the main
          *  response packet */
@@ -700,7 +703,9 @@ class LSQ : public Named
      *  the LSQ */
     void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                      unsigned int size, Addr addr, Request::Flags flags,
-                     uint64_t *res, AtomicOpFunctor *amo_op);
+                     uint64_t *res, AtomicOpFunctor *amo_op,
+                     const std::vector<bool>& byteEnable =
+                         std::vector<bool>());
 
     /** Push a predicate failed-representing request into the queues just
      *  to maintain commit order */
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index c754fe8cf..db8fca20a 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -787,10 +787,13 @@ class FullO3CPU : public BaseO3CPU
     /** CPU pushRequest function, forwards request to LSQ. */
     Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
                       unsigned int size, Addr addr, Request::Flags flags,
-                      uint64_t *res, AtomicOpFunctor *amo_op = nullptr)
+                      uint64_t *res, AtomicOpFunctor *amo_op = nullptr,
+                      const std::vector<bool>& byteEnable =
+                          std::vector<bool>())
+
     {
         return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr,
-                flags, res, amo_op);
+                flags, res, amo_op, byteEnable);
     }
 
     /** CPU read function, forwards read to LSQ. */
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index a6037b7f4..84f1411a5 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -50,6 +50,7 @@
 #include "arch/generic/tlb.hh"
 #include "cpu/inst_seq.hh"
 #include "cpu/o3/lsq_unit.hh"
+#include "cpu/utils.hh"
 #include "enums/SMTQueuePolicy.hh"
 #include "mem/port.hh"
 #include "sim/sim_object.hh"
@@ -251,6 +252,7 @@ class LSQ
         const Addr _addr;
         const uint32_t _size;
         const Request::Flags _flags;
+        std::vector<bool> _byteEnable;
         uint32_t _numOutstandingPackets;
         AtomicOpFunctor *_amo_op;
       protected:
@@ -351,6 +353,28 @@ class LSQ
             }
         }
 
+        /** Helper function used to add a (sub)request, given its address
+         * `addr`, size `size` and byte-enable mask `byteEnable`.
+         *
+         * The request is only added if the mask is empty or if there is at
+         * least an active element in it.
+         */
+        void
+        addRequest(Addr addr, unsigned size,
+                   const std::vector<bool>& byteEnable)
+        {
+            if (byteEnable.empty() ||
+                isAnyActiveElement(byteEnable.begin(), byteEnable.end())) {
+                auto request = std::make_shared<Request>(_inst->getASID(),
+                        addr, size, _flags, _inst->masterId(),
+                        _inst->instAddr(), _inst->contextId());
+                if (!byteEnable.empty()) {
+                    request->setByteEnable(byteEnable);
+                }
+                _requests.push_back(request);
+            }
+        }
+
         /** Destructor.
          * The LSQRequest owns the request. If the packet has already been
          * sent, the sender state will be deleted upon receiving the reply.
@@ -609,11 +633,17 @@ class LSQ
          * declaration of the names in the parent class. */
         using Flag = typename LSQRequest::Flag;
         using State = typename LSQRequest::State;
+        using LSQRequest::_addr;
         using LSQRequest::_fault;
+        using LSQRequest::_flags;
+        using LSQRequest::_size;
+        using LSQRequest::_byteEnable;
+        using LSQRequest::_requests;
         using LSQRequest::_inst;
         using LSQRequest::_packets;
         using LSQRequest::_port;
         using LSQRequest::_res;
+        using LSQRequest::_taskId;
         using LSQRequest::_senderState;
         using LSQRequest::_state;
         using LSQRequest::flags;
@@ -635,14 +665,8 @@ class LSQ
                           uint64_t* res = nullptr,
                           AtomicOpFunctor* amo_op = nullptr) :
             LSQRequest(port, inst, isLoad, addr, size, flags_, data, res,
-                       amo_op)
-        {
-            LSQRequest::_requests.push_back(
-                    std::make_shared<Request>(inst->getASID(), addr, size,
-                    flags_, inst->masterId(), inst->instAddr(),
-                    inst->contextId(), amo_op));
-            LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum);
-        }
+                       amo_op) {}
+
         inline virtual ~SingleDataRequest() {}
         virtual void initiateTranslation();
         virtual void finish(const Fault &fault, const RequestPtr &req,
@@ -671,6 +695,7 @@ class LSQ
         using LSQRequest::_port;
         using LSQRequest::_requests;
         using LSQRequest::_res;
+        using LSQRequest::_byteEnable;
         using LSQRequest::_senderState;
         using LSQRequest::_size;
         using LSQRequest::_state;
@@ -691,14 +716,14 @@ class LSQ
         RequestPtr mainReq;
         PacketPtr _mainPacket;
 
-
       public:
         SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
                          const Addr& addr, const uint32_t& size,
                          const Request::Flags & flags_,
                          PacketDataPtr data = nullptr,
                          uint64_t* res = nullptr) :
-            LSQRequest(port, inst, isLoad, addr, size, flags_, data, res),
+            LSQRequest(port, inst, isLoad, addr, size, flags_, data, res,
+                       nullptr),
             numFragments(0),
             numReceivedPackets(0),
             mainReq(nullptr),
@@ -949,7 +974,8 @@ class LSQ
 
     Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
                       unsigned int size, Addr addr, Request::Flags flags,
-                      uint64_t *res, AtomicOpFunctor *amo_op);
+                      uint64_t *res, AtomicOpFunctor *amo_op,
+                      const std::vector<bool>& byteEnable);
 
     /** The CPU pointer. */
     O3CPU *cpu;
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 732712029..70621a523 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -681,29 +681,12 @@ LSQ<Impl>::dumpInsts() const
     }
 }
 
-static Addr
-addrBlockOffset(Addr addr, unsigned int block_size)
-{
-    return addr & (block_size - 1);
-}
-
-static Addr
-addrBlockAlign(Addr addr, uint64_t block_size)
-{
-    return addr & ~(block_size - 1);
-}
-
-static bool
-transferNeedsBurst(Addr addr, uint64_t size, uint64_t block_size)
-{
-    return (addrBlockOffset(addr, block_size) + size) > block_size;
-}
-
 template<class Impl>
 Fault
 LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
                        unsigned int size, Addr addr, Request::Flags flags,
-                       uint64_t *res, AtomicOpFunctor *amo_op)
+                       uint64_t *res, AtomicOpFunctor *amo_op,
+                       const std::vector<bool>& byteEnable)
 {
     // This comming request can be either load, store or atomic.
     // Atomic request has a corresponding pointer to its atomic memory
@@ -735,6 +718,9 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
                     size, flags, data, res, amo_op);
         }
         assert(req);
+        if (!byteEnable.empty()) {
+            req->_byteEnable = byteEnable;
+        }
         inst->setRequest();
         req->taskId(cpu->taskId());
 
@@ -756,6 +742,7 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
             else
                 inst->getFault() = cpu->write(req, data, inst->sqIdx);
         } else if (isLoad) {
+            inst->setMemAccPredicate(false);
             // Commit will have to clean up whatever happened.  Set this
             // instruction as executed.
             inst->setExecuted();
@@ -848,14 +835,21 @@ template<class Impl>
 void
 LSQ<Impl>::SingleDataRequest::initiateTranslation()
 {
-    _inst->translationStarted(true);
-    setState(State::Translation);
-    flags.set(Flag::TranslationStarted);
+    assert(_requests.size() == 0);
 
-    _inst->savedReq = this;
-    sendFragmentToTranslation(0);
+    this->addRequest(_addr, _size, _byteEnable);
 
-    if (isTranslationComplete()) {
+    if (_requests.size() > 0) {
+        _requests.back()->setReqInstSeqNum(_inst->seqNum);
+        _requests.back()->taskId(_taskId);
+        _inst->translationStarted(true);
+        setState(State::Translation);
+        flags.set(Flag::TranslationStarted);
+
+        _inst->savedReq = this;
+        sendFragmentToTranslation(0);
+    } else {
+        _inst->setMemAccPredicate(false);
     }
 }
 
@@ -877,11 +871,7 @@ template<class Impl>
 void
 LSQ<Impl>::SplitDataRequest::initiateTranslation()
 {
-    _inst->translationStarted(true);
-    setState(State::Translation);
-    flags.set(Flag::TranslationStarted);
-
-    unsigned int cacheLineSize = _port.cacheLineSize();
+    auto cacheLineSize = _port.cacheLineSize();
     Addr base_addr = _addr;
     Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize);
     Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize);
@@ -890,6 +880,9 @@ LSQ<Impl>::SplitDataRequest::initiateTranslation()
     mainReq = std::make_shared<Request>(_inst->getASID(), base_addr,
                 _size, _flags, _inst->masterId(),
                 _inst->instAddr(), _inst->contextId());
+    if (!_byteEnable.empty()) {
+        mainReq->setByteEnable(_byteEnable);
+    }
 
     // Paddr is not used in mainReq. However, we will accumulate the flags
     // from the sub requests into mainReq by calling setFlags() in finish().
@@ -898,39 +891,63 @@ LSQ<Impl>::SplitDataRequest::initiateTranslation()
     mainReq->setPaddr(0);
 
     /* Get the pre-fix, possibly unaligned. */
-    _requests.push_back(std::make_shared<Request>(_inst->getASID(), base_addr,
-                next_addr - base_addr, _flags, _inst->masterId(),
-                _inst->instAddr(), _inst->contextId()));
+    if (_byteEnable.empty()) {
+        this->addRequest(base_addr, next_addr - base_addr, _byteEnable);
+    } else {
+        auto it_start = _byteEnable.begin();
+        auto it_end = _byteEnable.begin() + (next_addr - base_addr);
+        this->addRequest(base_addr, next_addr - base_addr,
+                         std::vector<bool>(it_start, it_end));
+    }
     size_so_far = next_addr - base_addr;
 
     /* We are block aligned now, reading whole blocks. */
     base_addr = next_addr;
     while (base_addr != final_addr) {
-        _requests.push_back(std::make_shared<Request>(_inst->getASID(),
-                    base_addr, cacheLineSize, _flags, _inst->masterId(),
-                    _inst->instAddr(), _inst->contextId()));
+        if (_byteEnable.empty()) {
+            this->addRequest(base_addr, cacheLineSize, _byteEnable);
+        } else {
+            auto it_start = _byteEnable.begin() + size_so_far;
+            auto it_end = _byteEnable.begin() + size_so_far + cacheLineSize;
+            this->addRequest(base_addr, cacheLineSize,
+                             std::vector<bool>(it_start, it_end));
+        }
         size_so_far += cacheLineSize;
         base_addr += cacheLineSize;
     }
 
     /* Deal with the tail. */
     if (size_so_far < _size) {
-        _requests.push_back(std::make_shared<Request>(_inst->getASID(),
-                    base_addr, _size - size_so_far, _flags, _inst->masterId(),
-                    _inst->instAddr(), _inst->contextId()));
+        if (_byteEnable.empty()) {
+            this->addRequest(base_addr, _size - size_so_far, _byteEnable);
+        } else {
+            auto it_start = _byteEnable.begin() + size_so_far;
+            auto it_end = _byteEnable.end();
+            this->addRequest(base_addr, _size - size_so_far,
+                             std::vector<bool>(it_start, it_end));
+        }
     }
 
-    /* Setup the requests and send them to translation. */
-    for (auto& r: _requests) {
-        r->setReqInstSeqNum(_inst->seqNum);
-        r->taskId(_taskId);
-    }
-    this->_inst->savedReq = this;
-    numInTranslationFragments = 0;
-    numTranslatedFragments = 0;
+    if (_requests.size() > 0) {
+        /* Setup the requests and send them to translation. */
+        for (auto& r: _requests) {
+            r->setReqInstSeqNum(_inst->seqNum);
+            r->taskId(_taskId);
+        }
 
-    for (uint32_t i = 0; i < _requests.size(); i++) {
-        sendFragmentToTranslation(i);
+        _inst->translationStarted(true);
+        setState(State::Translation);
+        flags.set(Flag::TranslationStarted);
+        this->_inst->savedReq = this;
+        numInTranslationFragments = 0;
+        numTranslatedFragments = 0;
+        _fault.resize(_requests.size());
+
+        for (uint32_t i = 0; i < _requests.size(); i++) {
+            sendFragmentToTranslation(i);
+        }
+    } else {
+        _inst->setMemAccPredicate(false);
     }
 }
 
@@ -968,8 +985,6 @@ LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt)
     while (pktIdx < _packets.size() && pkt != _packets[pktIdx])
         pktIdx++;
     assert(pktIdx < _packets.size());
-    assert(pkt->req == _requests[pktIdx]);
-    assert(pkt == _packets[pktIdx]);
     numReceivedPackets++;
     state->outstanding--;
     if (numReceivedPackets == _packets.size()) {
@@ -1012,16 +1027,19 @@ void
 LSQ<Impl>::SplitDataRequest::buildPackets()
 {
     /* Extra data?? */
-    ptrdiff_t offset = 0;
+    Addr base_address = _addr;
+
     if (_packets.size() == 0) {
         /* New stuff */
         if (isLoad()) {
             _mainPacket = Packet::createRead(mainReq);
             _mainPacket->dataStatic(_inst->memData);
         }
-        for (auto& r: _requests) {
+        for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) {
+            RequestPtr r = _requests[i];
             PacketPtr pkt = isLoad() ? Packet::createRead(r)
-                                    : Packet::createWrite(r);
+                                     : Packet::createWrite(r);
+            ptrdiff_t offset = r->getVaddr() - base_address;
             if (isLoad()) {
                 pkt->dataStatic(_inst->memData + offset);
             } else {
@@ -1031,12 +1049,11 @@ LSQ<Impl>::SplitDataRequest::buildPackets()
                         r->getSize());
                 pkt->dataDynamic(req_data);
             }
-            offset += r->getSize();
             pkt->senderState = _senderState;
             _packets.push_back(pkt);
         }
     }
-    assert(_packets.size() == _requests.size());
+    assert(_packets.size() > 0);
 }
 
 template<class Impl>
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 9323e8634..21bed99fa 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -542,8 +542,7 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
 
     load_fault = inst->initiateAcc();
 
-    if (!inst->readMemAccPredicate()) {
-        assert(load_fault == NoFault);
+    if (load_fault == NoFault && !inst->readMemAccPredicate()) {
         assert(inst->readPredicate());
         inst->setExecuted();
         inst->completeAcc(nullptr);
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index caf2427ef..c5b024532 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -49,6 +49,7 @@
 #include "base/output.hh"
 #include "config/the_isa.hh"
 #include "cpu/exetrace.hh"
+#include "cpu/utils.hh"
 #include "debug/Drain.hh"
 #include "debug/ExecFaulting.hh"
 #include "debug/SimpleCPU.hh"
@@ -333,9 +334,43 @@ AtomicSimpleCPU::AtomicCPUDPort::recvFunctionalSnoop(PacketPtr pkt)
     }
 }
 
+bool
+AtomicSimpleCPU::genMemFragmentRequest(const RequestPtr& req, Addr frag_addr,
+                                       int size, Request::Flags flags,
+                                       const std::vector<bool>& byte_enable,
+                                       int& frag_size, int& size_left) const
+{
+    bool predicate = true;
+    Addr inst_addr = threadInfo[curThread]->thread->pcState().instAddr();
+
+    frag_size = std::min(
+        cacheLineSize() - addrBlockOffset(frag_addr, cacheLineSize()),
+        (Addr) size_left);
+    size_left -= frag_size;
+
+    if (!byte_enable.empty()) {
+        // Set up byte-enable mask for the current fragment
+        auto it_start = byte_enable.begin() + (size - (frag_size + size_left));
+        auto it_end = byte_enable.begin() + (size - size_left);
+        if (isAnyActiveElement(it_start, it_end)) {
+            req->setVirt(0, frag_addr, frag_size, flags, dataMasterId(),
+                         inst_addr);
+            req->setByteEnable(std::vector<bool>(it_start, it_end));
+        } else {
+            predicate = false;
+        }
+    } else {
+        req->setVirt(0, frag_addr, frag_size, flags, dataMasterId(),
+                     inst_addr);
+    }
+
+    return predicate;
+}
+
 Fault
 AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size,
-                         Request::Flags flags)
+                         Request::Flags flags,
+                         const std::vector<bool>& byteEnable)
 {
     SimpleExecContext& t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -346,28 +381,29 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size,
     if (traceData)
         traceData->setMem(addr, size, flags);
 
-    //The size of the data we're trying to read.
-    int fullSize = size;
-
-    //The address of the second part of this access if it needs to be split
-    //across a cache line boundary.
-    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
-
-    if (secondAddr > addr)
-        size = secondAddr - addr;
-
     dcache_latency = 0;
 
     req->taskId(taskId());
+
+    Addr frag_addr = addr;
+    int frag_size = 0;
+    int size_left = size;
+    bool predicate;
+    Fault fault = NoFault;
+
     while (1) {
-        req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr());
+        predicate = genMemFragmentRequest(req, frag_addr, size, flags,
+                                          byteEnable, frag_size, size_left);
 
         // translate to physical address
-        Fault fault = thread->dtb->translateAtomic(req, thread->getTC(),
-                                                          BaseTLB::Read);
+        if (predicate) {
+            fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                                 BaseTLB::Read);
+        }
 
         // Now do the access.
-        if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) {
+        if (predicate && fault == NoFault &&
+            !req->getFlags().isSet(Request::NO_ACCESS)) {
             Packet pkt(req, Packet::makeReadCmd(req));
             pkt.dataStatic(data);
 
@@ -394,33 +430,29 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size,
             }
         }
 
-        //If we don't need to access a second cache line, stop now.
-        if (secondAddr <= addr)
-        {
+        // If we don't need to access further cache lines, stop now.
+        if (size_left == 0) {
             if (req->isLockedRMW() && fault == NoFault) {
                 assert(!locked);
                 locked = true;
             }
-
             return fault;
         }
 
         /*
-         * Set up for accessing the second cache line.
+         * Set up for accessing the next cache line.
          */
+        frag_addr += frag_size;
 
         //Move the pointer we're reading into to the correct location.
-        data += size;
-        //Adjust the size to get the remaining bytes.
-        size = addr + fullSize - secondAddr;
-        //And access the right address.
-        addr = secondAddr;
+        data += frag_size;
     }
 }
 
 Fault
 AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
-                          Request::Flags flags, uint64_t *res)
+                          Request::Flags flags, uint64_t *res,
+                          const std::vector<bool>& byteEnable)
 {
     SimpleExecContext& t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -439,32 +471,37 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
     if (traceData)
         traceData->setMem(addr, size, flags);
 
-    //The size of the data we're trying to read.
-    int fullSize = size;
-
-    //The address of the second part of this access if it needs to be split
-    //across a cache line boundary.
-    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
-
-    if (secondAddr > addr)
-        size = secondAddr - addr;
-
     dcache_latency = 0;
 
     req->taskId(taskId());
+
+    Addr frag_addr = addr;
+    int frag_size = 0;
+    int size_left = size;
+    int curr_frag_id = 0;
+    bool predicate;
+    Fault fault = NoFault;
+
     while (1) {
-        req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr());
+        predicate = genMemFragmentRequest(req, frag_addr, size, flags,
+                                          byteEnable, frag_size, size_left);
 
         // translate to physical address
-        Fault fault = thread->dtb->translateAtomic(req, thread->getTC(), BaseTLB::Write);
+        if (predicate)
+            fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                                 BaseTLB::Write);
 
         // Now do the access.
-        if (fault == NoFault) {
+        if (predicate && fault == NoFault) {
             bool do_access = true;  // flag to suppress cache access
 
             if (req->isLLSC()) {
-                do_access = TheISA::handleLockedWrite(thread, req, dcachePort.cacheBlockMask);
+                assert(curr_frag_id == 0);
+                do_access =
+                    TheISA::handleLockedWrite(thread, req,
+                                              dcachePort.cacheBlockMask);
             } else if (req->isSwap()) {
+                assert(curr_frag_id == 0);
                 if (req->isCondSwap()) {
                     assert(res);
                     req->setExtraData(*res);
@@ -488,8 +525,8 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
                 assert(!pkt.isError());
 
                 if (req->isSwap()) {
-                    assert(res);
-                    memcpy(res, pkt.getConstPtr<uint8_t>(), fullSize);
+                    assert(res && curr_frag_id == 0);
+                    memcpy(res, pkt.getConstPtr<uint8_t>(), size);
                 }
             }
 
@@ -500,14 +537,14 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
 
         //If there's a fault or we don't need to access a second cache line,
         //stop now.
-        if (fault != NoFault || secondAddr <= addr)
+        if (fault != NoFault || size_left == 0)
         {
             if (req->isLockedRMW() && fault == NoFault) {
-                assert(locked);
+                assert(byteEnable.empty());
+                assert(locked && curr_frag_id == 0);
                 locked = false;
             }
 
-
             if (fault != NoFault && req->isPrefetch()) {
                 return NoFault;
             } else {
@@ -516,15 +553,14 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
         }
 
         /*
-         * Set up for accessing the second cache line.
+         * Set up for accessing the next cache line.
          */
+        frag_addr += frag_size;
 
         //Move the pointer we're reading into to the correct location.
-        data += size;
-        //Adjust the size to get the remaining bytes.
-        size = addr + fullSize - secondAddr;
-        //And access the right address.
-        addr = secondAddr;
+        data += frag_size;
+
+        curr_frag_id++;
     }
 }
 
diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh
index 84f379121..100306981 100644
--- a/src/cpu/simple/atomic.hh
+++ b/src/cpu/simple/atomic.hh
@@ -195,11 +195,36 @@ class AtomicSimpleCPU : public BaseSimpleCPU
     void activateContext(ThreadID thread_num) override;
     void suspendContext(ThreadID thread_num) override;
 
+    /**
+     * Helper function used to set up the request for a single fragment of a
+     * memory access.
+     *
+     * Takes care of setting up the appropriate byte-enable mask for the
+     * fragment, given the mask for the entire memory access.
+     *
+     * @param req Pointer to the Request object to populate.
+     * @param frag_addr Start address of the fragment.
+     * @param size Total size of the memory access in bytes.
+     * @param flags Request flags.
+     * @param byte_enable Byte-enable mask for the entire memory access.
+     * @param[out] frag_size Fragment size.
+     * @param[in,out] size_left Size left to be processed in the memory access.
+     * @return True if the byte-enable mask for the fragment is not all-false.
+     */
+    bool genMemFragmentRequest(const RequestPtr& req, Addr frag_addr,
+                               int size, Request::Flags flags,
+                               const std::vector<bool>& byte_enable,
+                               int& frag_size, int& size_left) const;
+
     Fault readMem(Addr addr, uint8_t *data, unsigned size,
-                  Request::Flags flags) override;
+                  Request::Flags flags,
+                  const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault writeMem(uint8_t *data, unsigned size,
-                   Addr addr, Request::Flags flags, uint64_t *res) override;
+                   Addr addr, Request::Flags flags, uint64_t *res,
+                   const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault amoMem(Addr addr, uint8_t* data, unsigned size,
                  Request::Flags flags, AtomicOpFunctor *amo_op) override;
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index 298ba9f9e..816add707 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2012, 2015, 2017 ARM Limited
+ * Copyright (c) 2010-2012, 2015, 2017, 2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -494,6 +494,10 @@ BaseSimpleCPU::preExecute()
     thread->setFloatReg(ZeroReg, 0);
 #endif // ALPHA_ISA
 
+    // resets predicates
+    t_info.setPredicate(true);
+    t_info.setMemAccPredicate(true);
+
     // check for instruction-count-based events
     comInstEventQueue[curThread]->serviceEvents(t_info.numInst);
     system->instEventQueue.serviceEvents(system->totalNumInsts);
diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh
index 8060b07ad..5404e5df8 100644
--- a/src/cpu/simple/base.hh
+++ b/src/cpu/simple/base.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012,2015 ARM Limited
+ * Copyright (c) 2011-2012,2015,2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -143,15 +143,21 @@ class BaseSimpleCPU : public BaseCPU
     void startup() override;
 
     virtual Fault readMem(Addr addr, uint8_t* data, unsigned size,
-                          Request::Flags flags)
+                          Request::Flags flags,
+                          const std::vector<bool>& byteEnable =
+                              std::vector<bool>())
     { panic("readMem() is not implemented\n"); }
 
     virtual Fault initiateMemRead(Addr addr, unsigned size,
-                                  Request::Flags flags)
+                                  Request::Flags flags,
+                                  const std::vector<bool>& byteEnable =
+                                      std::vector<bool>())
     { panic("initiateMemRead() is not implemented\n"); }
 
     virtual Fault writeMem(uint8_t* data, unsigned size, Addr addr,
-                           Request::Flags flags, uint64_t* res)
+                           Request::Flags flags, uint64_t* res,
+                           const std::vector<bool>& byteEnable =
+                               std::vector<bool>())
     { panic("writeMem() is not implemented\n"); }
 
     virtual Fault amoMem(Addr addr, uint8_t* data, unsigned size,
diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh
index be7a863c5..de98d6efd 100644
--- a/src/cpu/simple/exec_context.hh
+++ b/src/cpu/simple/exec_context.hh
@@ -434,26 +434,32 @@ class SimpleExecContext : public ExecContext {
         thread->pcState(val);
     }
 
-
     Fault
     readMem(Addr addr, uint8_t *data, unsigned int size,
-            Request::Flags flags) override
+            Request::Flags flags,
+            const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
-        return cpu->readMem(addr, data, size, flags);
+        return cpu->readMem(addr, data, size, flags, byteEnable);
     }
 
     Fault
     initiateMemRead(Addr addr, unsigned int size,
-                    Request::Flags flags) override
+                    Request::Flags flags,
+                    const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
-        return cpu->initiateMemRead(addr, size, flags);
+        return cpu->initiateMemRead(addr, size, flags, byteEnable);
     }
 
     Fault
     writeMem(uint8_t *data, unsigned int size, Addr addr,
-             Request::Flags flags, uint64_t *res) override
+             Request::Flags flags, uint64_t *res,
+             const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
-        return cpu->writeMem(data, size, addr, flags, res);
+        assert(byteEnable.empty() || byteEnable.size() == size);
+        return cpu->writeMem(data, size, addr, flags, res, byteEnable);
     }
 
     Fault amoMem(Addr addr, uint8_t *data, unsigned int size,
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index 637308a96..454259099 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -1,6 +1,6 @@
 /*
  * Copyright 2014 Google, Inc.
- * Copyright (c) 2010-2013,2015,2017 ARM Limited
+ * Copyright (c) 2010-2013,2015,2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -417,7 +417,8 @@ TimingSimpleCPU::buildSplitPacket(PacketPtr &pkt1, PacketPtr &pkt2,
 
 Fault
 TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size,
-                                 Request::Flags flags)
+                                 Request::Flags flags,
+                                 const std::vector<bool>& byteEnable)
 {
     SimpleExecContext &t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -434,6 +435,9 @@ TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size,
     RequestPtr req = std::make_shared<Request>(
         asid, addr, size, flags, dataMasterId(), pc,
         thread->contextId());
+    if (!byteEnable.empty()) {
+        req->setByteEnable(byteEnable);
+    }
 
     req->taskId(taskId());
 
@@ -491,7 +495,8 @@ TimingSimpleCPU::handleWritePacket()
 
 Fault
 TimingSimpleCPU::writeMem(uint8_t *data, unsigned size,
-                          Addr addr, Request::Flags flags, uint64_t *res)
+                          Addr addr, Request::Flags flags, uint64_t *res,
+                          const std::vector<bool>& byteEnable)
 {
     SimpleExecContext &t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -516,6 +521,9 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size,
     RequestPtr req = std::make_shared<Request>(
         asid, addr, size, flags, dataMasterId(), pc,
         thread->contextId());
+    if (!byteEnable.empty()) {
+        req->setByteEnable(byteEnable);
+    }
 
     req->taskId(taskId());
 
@@ -523,6 +531,10 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size,
     assert(split_addr <= addr || split_addr - addr < block_size);
 
     _status = DTBWaitResponse;
+
+    // TODO: TimingSimpleCPU doesn't support arbitrarily long multi-line mem.
+    // accesses yet
+
     if (split_addr > addr) {
         RequestPtr req1, req2;
         assert(!req->isLLSC() && !req->isSwap());
diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh
index ce0a4dbfc..a49822fc1 100644
--- a/src/cpu/simple/timing.hh
+++ b/src/cpu/simple/timing.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013,2015 ARM Limited
+ * Copyright (c) 2012-2013,2015,2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -283,10 +283,14 @@ class TimingSimpleCPU : public BaseSimpleCPU
     void suspendContext(ThreadID thread_num) override;
 
     Fault initiateMemRead(Addr addr, unsigned size,
-                          Request::Flags flags) override;
+            Request::Flags flags,
+            const std::vector<bool>& byteEnable =std::vector<bool>())
+        override;
 
     Fault writeMem(uint8_t *data, unsigned size,
-                   Addr addr, Request::Flags flags, uint64_t *res) override;
+                   Addr addr, Request::Flags flags, uint64_t *res,
+                   const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
                          AtomicOpFunctor *amo_op) override;
diff --git a/src/cpu/simple_thread.cc b/src/cpu/simple_thread.cc
index 9067e877b..0936e41ad 100644
--- a/src/cpu/simple_thread.cc
+++ b/src/cpu/simple_thread.cc
@@ -77,7 +77,7 @@ SimpleThread::SimpleThread(BaseCPU *_cpu, int _thread_num, System *_sys,
                            Process *_process, BaseTLB *_itb,
                            BaseTLB *_dtb, TheISA::ISA *_isa)
     : ThreadState(_cpu, _thread_num, _process), isa(_isa),
-      predicate(false), system(_sys),
+      predicate(true), memAccPredicate(true), system(_sys),
       itb(_itb), dtb(_dtb), decoder(TheISA::Decoder(_isa))
 {
     clearArchRegs();
@@ -87,8 +87,9 @@ SimpleThread::SimpleThread(BaseCPU *_cpu, int _thread_num, System *_sys,
 SimpleThread::SimpleThread(BaseCPU *_cpu, int _thread_num, System *_sys,
                            BaseTLB *_itb, BaseTLB *_dtb,
                            TheISA::ISA *_isa, bool use_kernel_stats)
-    : ThreadState(_cpu, _thread_num, NULL), isa(_isa), system(_sys), itb(_itb),
-      dtb(_dtb), decoder(TheISA::Decoder(_isa))
+    : ThreadState(_cpu, _thread_num, NULL), isa(_isa),
+      predicate(true), memAccPredicate(true), system(_sys),
+      itb(_itb), dtb(_dtb), decoder(TheISA::Decoder(_isa))
 {
     quiesceEvent = new EndQuiesceEvent(this);
 
diff --git a/src/cpu/utils.hh b/src/cpu/utils.hh
new file mode 100644
index 000000000..4c1318174
--- /dev/null
+++ b/src/cpu/utils.hh
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Andrew Bardsley
+ */
+
+#ifndef __CPU_UTILS_HH__
+#define __CPU_UTILS_HH__
+
+#include "base/types.hh"
+
+/**
+ * Calculates the offset of a given address wrt aligned fixed-size blocks.
+ * @param addr Input address.
+ * @param block_size Block size in bytes.
+ * @return Offset of the given address in bytes.
+ */
+inline Addr
+addrBlockOffset(Addr addr, Addr block_size)
+{
+    return addr & (block_size - 1);
+}
+
+/**
+ * Returns the address of the closest aligned fixed-size block to the given
+ * address.
+ * @param addr Input address.
+ * @param block_size Block size in bytes.
+ * @return Address of the closest aligned block.
+ */
+inline Addr
+addrBlockAlign(Addr addr, Addr block_size)
+{
+    return addr & ~(block_size - 1);
+}
+
+/**
+ * Returns true if the given memory access (address, size) needs to be
+ * fragmented across aligned fixed-size blocks.
+ * @param addr Address of the memory access.
+ * @param size Size of the memory access.
+ * @param block_size Block size in bytes.
+ * @return True if the memory access needs to be fragmented.
+ */
+inline bool
+transferNeedsBurst(Addr addr, unsigned int size, unsigned int block_size)
+{
+    return (addrBlockOffset(addr, block_size) + size) > block_size;
+}
+
+/**
+ * Test if there is any active element in an enablement range.
+ */
+inline bool
+isAnyActiveElement(const std::vector<bool>::const_iterator& it_start,
+                   const std::vector<bool>::const_iterator& it_end)
+{
+    auto it_tmp = it_start;
+    for (;it_tmp != it_end && !(*it_tmp); ++it_tmp);
+    return (it_tmp != it_end);
+}
+
+#endif // __CPU_UTILS_HH__
diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc
index f7b02ce17..a998530fd 100644
--- a/src/mem/abstract_mem.cc
+++ b/src/mem/abstract_mem.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2012,2017 ARM Limited
+ * Copyright (c) 2010-2012,2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index 494a998a5..b72ff4261 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -297,7 +297,8 @@ Cache::promoteWholeLineWrites(PacketPtr pkt)
 {
     // Cache line clearing instructions
     if (doFastWrites && (pkt->cmd == MemCmd::WriteReq) &&
-        (pkt->getSize() == blkSize) && (pkt->getOffset(blkSize) == 0)) {
+        (pkt->getSize() == blkSize) && (pkt->getOffset(blkSize) == 0) &&
+        !pkt->isMaskedWrite()) {
         pkt->cmd = MemCmd::WriteLineReq;
         DPRINTF(Cache, "packet promoted from Write to WriteLineReq\n");
     }
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 93b3ad5de..130cc41ad 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1092,6 +1092,7 @@ class Packet : public Printable
     getPtr()
     {
         assert(flags.isSet(STATIC_DATA|DYNAMIC_DATA));
+        assert(!isMaskedWrite());
         return (T*)data;
     }
 
@@ -1180,10 +1181,11 @@ class Packet : public Printable
         // same pointer from source to destination and back
         assert(p != getPtr<uint8_t>() || flags.isSet(STATIC_DATA));
 
-        if (p != getPtr<uint8_t>())
+        if (p != getPtr<uint8_t>()) {
             // for packet with allocated dynamic data, we copy data from
             // one to the other, e.g. a forwarded response to a response
             std::memcpy(getPtr<uint8_t>(), p, getSize());
+        }
     }
 
     /**
@@ -1203,7 +1205,19 @@ class Packet : public Printable
     void
     writeData(uint8_t *p) const
     {
-        std::memcpy(p, getConstPtr<uint8_t>(), getSize());
+        if (!isMaskedWrite()) {
+            std::memcpy(p, getConstPtr<uint8_t>(), getSize());
+        } else {
+            assert(req->getByteEnable().size() == getSize());
+            // Write only the enabled bytes
+            const uint8_t *base = getConstPtr<uint8_t>();
+            for (int i = 0; i < getSize(); i++) {
+                if (req->getByteEnable()[i]) {
+                    p[i] = *(base + i);
+                }
+                // Disabled bytes stay untouched
+            }
+        }
     }
 
     /**
@@ -1268,6 +1282,17 @@ class Packet : public Printable
     bool
     trySatisfyFunctional(PacketPtr other)
     {
+        if (other->isMaskedWrite()) {
+            // Do not forward data if overlapping with a masked write
+            if (_isSecure == other->isSecure() &&
+                getAddr() <= (other->getAddr() + other->getSize() - 1) &&
+                other->getAddr() <= (getAddr() + getSize() - 1)) {
+                warn("Trying to check against a masked write, skipping."
+                     " (addr: 0x%x, other addr: 0x%x)", getAddr(),
+                     other->getAddr());
+            }
+            return false;
+        }
         // all packets that are carrying a payload should have a valid
         // data pointer
         return trySatisfyFunctional(other, other->getAddr(), other->isSecure(),
@@ -1296,6 +1321,12 @@ class Packet : public Printable
         return cmd == MemCmd::CleanEvict || cmd == MemCmd::WritebackClean;
     }
 
+    bool
+    isMaskedWrite() const
+    {
+        return (cmd == MemCmd::WriteReq && !req->getByteEnable().empty());
+    }
+
     /**
      * Check a functional request against a memory value represented
      * by a base/size pair and an associated data array. If the
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 2a53c21a4..324ae382e 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -320,6 +320,9 @@ class Request
      */
     unsigned _size;
 
+    /** Byte-enable mask for writes. */
+    std::vector<bool> _byteEnable;
+
     /** The requestor ID which is unique in the system for all ports
      * that are capable of issuing a transaction
      */
@@ -567,6 +570,9 @@ class Request
      * Generate two requests as if this request had been split into two
      * pieces. The original request can't have been translated already.
      */
+    // TODO: this function is still required by TimingSimpleCPU - should be
+    // removed once TimingSimpleCPU will support arbitrarily long multi-line
+    // mem. accesses
     void splitOnVaddr(Addr split_addr, RequestPtr &req1, RequestPtr &req2)
     {
         assert(privateFlags.isSet(VALID_VADDR));
@@ -577,6 +583,14 @@ class Request
         req1->_size = split_addr - _vaddr;
         req2->_vaddr = split_addr;
         req2->_size = _size - req1->_size;
+        if (!_byteEnable.empty()) {
+            req1->_byteEnable = std::vector<bool>(
+                _byteEnable.begin(),
+                _byteEnable.begin() + req1->_size);
+            req2->_byteEnable = std::vector<bool>(
+                _byteEnable.begin() + req1->_size,
+                _byteEnable.end());
+        }
     }
 
     /**
@@ -628,6 +642,19 @@ class Request
         return _size;
     }
 
+    const std::vector<bool>&
+    getByteEnable() const
+    {
+        return _byteEnable;
+    }
+
+    void
+    setByteEnable(const std::vector<bool>& be)
+    {
+        assert(be.empty() || be.size() == _size);
+        _byteEnable = be;
+    }
+
     /** Accessor for time. */
     Tick
     time() const
-- 
2.30.2