From: Mitch Hayenga <mitch.hayenga@arm.com>
Date: Wed, 3 Sep 2014 11:42:52 +0000 (-0400)
Subject: arm: Make memory ops work on 64bit/128-bit quantities
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8f95144e161ef7bdb264eb572108a98f215785c0;p=gem5.git

arm: Make memory ops work on 64bit/128-bit quantities

Multiple instructions assume that only 32-bit load operations are available.
This patch increases load sizes to 64-bit or 128-bit for many load-pair and
load-multiple instructions.
---

diff --git a/src/arch/arm/insts/macromem.cc b/src/arch/arm/insts/macromem.cc
index 65cd2c3b7..1ea968328 100644
--- a/src/arch/arm/insts/macromem.cc
+++ b/src/arch/arm/insts/macromem.cc
@@ -61,14 +61,29 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst,
 {
     uint32_t regs = reglist;
     uint32_t ones = number_of_ones(reglist);
-    // Remember that writeback adds a uop or two and the temp register adds one
-    numMicroops = ones + (writeback ? (load ? 2 : 1) : 0) + 1;
+    uint32_t mem_ops = ones;
 
-    // It's technically legal to do a lot of nothing
-    if (!ones)
+    // Copy the base address register if we overwrite it, or if this instruction
+    // is basically a no-op (we have to do something)
+    bool copy_base =  (bits(reglist, rn) && load) || !ones;
+    bool force_user = user & !bits(reglist, 15);
+    bool exception_ret = user & bits(reglist, 15);
+    bool pc_temp = load && writeback && bits(reglist, 15);
+
+    if (!ones) {
         numMicroops = 1;
+    } else if (load) {
+        numMicroops = ((ones + 1) / 2)
+                    + ((ones % 2 == 0 && exception_ret) ? 1 : 0)
+                    + (copy_base ? 1 : 0)
+                    + (writeback? 1 : 0)
+                    + (pc_temp ? 1 : 0);
+    } else {
+        numMicroops = ones + (writeback ? 1 : 0);
+    }
 
     microOps = new StaticInstPtr[numMicroops];
+
     uint32_t addr = 0;
 
     if (!up)
@@ -81,94 +96,129 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst,
 
     // Add 0 to Rn and stick it in ureg0.
     // This is equivalent to a move.
-    *uop = new MicroAddiUop(machInst, INTREG_UREG0, rn, 0);
+    if (copy_base)
+        *uop++ = new MicroAddiUop(machInst, INTREG_UREG0, rn, 0);
 
     unsigned reg = 0;
-    unsigned regIdx = 0;
-    bool force_user = user & !bits(reglist, 15);
-    bool exception_ret = user & bits(reglist, 15);
+    while (mem_ops != 0) {
+        // Do load operations in pairs if possible
+        if (load && mem_ops >= 2 &&
+            !(mem_ops == 2 && bits(regs,INTREG_PC) && exception_ret)) {
+            // 64-bit memory operation
+            // Find 2 set register bits (clear them after finding)
+            unsigned reg_idx1;
+            unsigned reg_idx2;
+
+            // Find the first register
+            while (!bits(regs, reg)) reg++;
+            replaceBits(regs, reg, 0);
+            reg_idx1 = force_user ? intRegInMode(MODE_USER, reg) : reg;
+
+            // Find the second register
+            while (!bits(regs, reg)) reg++;
+            replaceBits(regs, reg, 0);
+            reg_idx2 = force_user ? intRegInMode(MODE_USER, reg) : reg;
+
+            // Load into temp reg if necessary
+            if (reg_idx2 == INTREG_PC && pc_temp)
+                reg_idx2 = INTREG_UREG1;
+
+            // Actually load both registers from memory
+            *uop = new MicroLdr2Uop(machInst, reg_idx1, reg_idx2,
+                    copy_base ? INTREG_UREG0 : rn, up, addr);
+
+            if (!writeback && reg_idx2 == INTREG_PC) {
+                // No writeback if idx==pc, set appropriate flags
+                (*uop)->setFlag(StaticInst::IsControl);
+                (*uop)->setFlag(StaticInst::IsIndirectControl);
 
-    for (int i = 0; i < ones; i++) {
-        // Find the next register.
-        while (!bits(regs, reg))
-            reg++;
-        replaceBits(regs, reg, 0);
+                if (!(condCode == COND_AL || condCode == COND_UC))
+                    (*uop)->setFlag(StaticInst::IsCondControl);
+                else
+                    (*uop)->setFlag(StaticInst::IsUncondControl);
+            }
 
-        regIdx = reg;
-        if (force_user) {
-            regIdx = intRegInMode(MODE_USER, regIdx);
-        }
+            if (up) addr += 8;
+            else addr -= 8;
+            mem_ops -= 2;
+        } else {
+            // 32-bit memory operation
+            // Find register for operation
+            unsigned reg_idx;
+            while(!bits(regs, reg)) reg++;
+            replaceBits(regs, reg, 0);
+            reg_idx = force_user ? intRegInMode(MODE_USER, reg) : reg;
+
+            if (load) {
+                if (writeback && reg_idx == INTREG_PC) {
+                    // If this instruction changes the PC and performs a
+                    // writeback, ensure the pc load/branch is the last uop.
+                    // Load into a temp reg here.
+                    *uop = new MicroLdrUop(machInst, INTREG_UREG1,
+                            copy_base ? INTREG_UREG0 : rn, up, addr);
+                } else if (reg_idx == INTREG_PC && exception_ret) {
+                    // Special handling for exception return
+                    *uop = new MicroLdrRetUop(machInst, reg_idx,
+                            copy_base ? INTREG_UREG0 : rn, up, addr);
+                } else {
+                    // standard single load uop
+                    *uop = new MicroLdrUop(machInst, reg_idx,
+                            copy_base ? INTREG_UREG0 : rn, up, addr);
+                }
+
+                // Loading pc as last operation?  Set appropriate flags.
+                if (!writeback && reg_idx == INTREG_PC) {
+                    (*uop)->setFlag(StaticInst::IsControl);
+                    (*uop)->setFlag(StaticInst::IsIndirectControl);
 
-        if (load) {
-            if (writeback && i == ones - 1) {
-                // If it's a writeback and this is the last register
-                // do the load into a temporary register which we'll move
-                // into the final one later
-                *++uop = new MicroLdrUop(machInst, INTREG_UREG1, INTREG_UREG0,
-                        up, addr);
-            } else {
-                // Otherwise just do it normally
-                if (reg == INTREG_PC && exception_ret) {
-                    // This must be the exception return form of ldm.
-                    *++uop = new MicroLdrRetUop(machInst, regIdx,
-                                               INTREG_UREG0, up, addr);
                     if (!(condCode == COND_AL || condCode == COND_UC))
                         (*uop)->setFlag(StaticInst::IsCondControl);
                     else
                         (*uop)->setFlag(StaticInst::IsUncondControl);
-                } else {
-                    *++uop = new MicroLdrUop(machInst, regIdx,
-                                            INTREG_UREG0, up, addr);
-                    if (reg == INTREG_PC) {
-                        (*uop)->setFlag(StaticInst::IsControl);
-                        if (!(condCode == COND_AL || condCode == COND_UC))
-                            (*uop)->setFlag(StaticInst::IsCondControl);
-                        else
-                            (*uop)->setFlag(StaticInst::IsUncondControl);
-                        (*uop)->setFlag(StaticInst::IsIndirectControl);
-                    }
                 }
+            } else {
+                *uop = new MicroStrUop(machInst, reg_idx, rn, up, addr);
             }
-        } else {
-            *++uop = new MicroStrUop(machInst, regIdx, INTREG_UREG0, up, addr);
+
+            if (up) addr += 4;
+            else addr -= 4;
+            --mem_ops;
         }
 
-        if (up)
-            addr += 4;
-        else
-            addr -= 4;
+        // Load/store micro-op generated, go to next uop
+        ++uop;
     }
 
     if (writeback && ones) {
-        // put the register update after we're done all loading
+        // Perform writeback uop operation
         if (up)
-            *++uop = new MicroAddiUop(machInst, rn, rn, ones * 4);
+            *uop++ = new MicroAddiUop(machInst, rn, rn, ones * 4);
         else
-            *++uop = new MicroSubiUop(machInst, rn, rn, ones * 4);
+            *uop++ = new MicroSubiUop(machInst, rn, rn, ones * 4);
+
+        // Write PC after address writeback?
+        if (pc_temp) {
+            if (exception_ret) {
+                *uop = new MicroUopRegMovRet(machInst, 0, INTREG_UREG1);
+            } else {
+                *uop = new MicroUopRegMov(machInst, INTREG_PC, INTREG_UREG1);
+            }
+            (*uop)->setFlag(StaticInst::IsControl);
+            (*uop)->setFlag(StaticInst::IsIndirectControl);
 
-        // If this was a load move the last temporary value into place
-        // this way we can't take an exception after we update the base
-        // register.
-        if (load && reg == INTREG_PC && exception_ret) {
-            *++uop = new MicroUopRegMovRet(machInst, 0, INTREG_UREG1);
             if (!(condCode == COND_AL || condCode == COND_UC))
                 (*uop)->setFlag(StaticInst::IsCondControl);
             else
                 (*uop)->setFlag(StaticInst::IsUncondControl);
-        } else if (load) {
-            *++uop = new MicroUopRegMov(machInst, regIdx, INTREG_UREG1);
-            if (reg == INTREG_PC) {
-                (*uop)->setFlag(StaticInst::IsControl);
-                (*uop)->setFlag(StaticInst::IsCondControl);
-                (*uop)->setFlag(StaticInst::IsIndirectControl);
-                // This is created as a RAS POP
-                if (rn == INTREG_SP)
-                    (*uop)->setFlag(StaticInst::IsReturn);
 
-            }
+            if (rn == INTREG_SP)
+                (*uop)->setFlag(StaticInst::IsReturn);
+
+            ++uop;
         }
     }
 
+    --uop;
     (*uop)->setLastMicroop();
 
     /* Take the control flags from the last microop for the macroop */
@@ -176,16 +226,15 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst,
         setFlag(StaticInst::IsControl);
     if ((*uop)->isCondCtrl())
         setFlag(StaticInst::IsCondControl);
+    if ((*uop)->isUncondCtrl())
+        setFlag(StaticInst::IsUncondControl);
     if ((*uop)->isIndirectCtrl())
         setFlag(StaticInst::IsIndirectControl);
     if ((*uop)->isReturn())
         setFlag(StaticInst::IsReturn);
 
-    for (StaticInstPtr *curUop = microOps;
-            !(*curUop)->isLastMicroop(); curUop++) {
-        MicroOp * uopPtr = dynamic_cast<MicroOp *>(curUop->get());
-        assert(uopPtr);
-        uopPtr->setDelayedCommit();
+    for (StaticInstPtr *uop = microOps; !(*uop)->isLastMicroop(); uop++) {
+        (*uop)->setDelayedCommit();
     }
 }
 
@@ -196,95 +245,96 @@ PairMemOp::PairMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
                      IntRegIndex rn, IntRegIndex rt, IntRegIndex rt2) :
     PredMacroOp(mnem, machInst, __opClass)
 {
+    bool post = (mode == AddrMd_PostIndex);
     bool writeback = (mode != AddrMd_Offset);
-    numMicroops = 1 + (size / 4) + (writeback ? 1 : 0);
+
+    if (load) {
+        // Use integer rounding to round up loads of size 4
+        numMicroops = (post ? 0 : 1) + ((size + 4) / 8) + (writeback ? 1 : 0);
+    } else {
+        numMicroops = (post ? 0 : 1) + (size / 4) + (writeback ? 1 : 0);
+    }
     microOps = new StaticInstPtr[numMicroops];
 
     StaticInstPtr *uop = microOps;
 
-    bool post = (mode == AddrMd_PostIndex);
-
     rn = makeSP(rn);
 
-    *uop = new MicroAddXiSpAlignUop(machInst, INTREG_UREG0, rn, post ? 0 : imm);
+    if (!post) {
+        *uop++ = new MicroAddXiSpAlignUop(machInst, INTREG_UREG0, rn,
+                post ? 0 : imm);
+    }
 
     if (fp) {
         if (size == 16) {
             if (load) {
-                *++uop = new MicroLdrQBFpXImmUop(machInst, rt,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
-                *++uop = new MicroLdrQTFpXImmUop(machInst, rt,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
-                *++uop = new MicroLdrQBFpXImmUop(machInst, rt2,
-                        INTREG_UREG0, 16, noAlloc, exclusive, acrel);
-                *++uop = new MicroLdrQTFpXImmUop(machInst, rt2,
-                        INTREG_UREG0, 16, noAlloc, exclusive, acrel);
+                *uop++ = new MicroLdFp16Uop(machInst, rt,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                *uop++ = new MicroLdFp16Uop(machInst, rt2,
+                        post ? rn : INTREG_UREG0, 16, noAlloc, exclusive, acrel);
             } else {
-                *++uop = new MicroStrQBFpXImmUop(machInst, rt,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
-                *++uop = new MicroStrQTFpXImmUop(machInst, rt,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
-                *++uop = new MicroStrQBFpXImmUop(machInst, rt2,
-                        INTREG_UREG0, 16, noAlloc, exclusive, acrel);
-                *++uop = new MicroStrQTFpXImmUop(machInst, rt2,
-                        INTREG_UREG0, 16, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrQBFpXImmUop(machInst, rt,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrQTFpXImmUop(machInst, rt,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrQBFpXImmUop(machInst, rt2,
+                        post ? rn : INTREG_UREG0, 16, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrQTFpXImmUop(machInst, rt2,
+                        post ? rn : INTREG_UREG0, 16, noAlloc, exclusive, acrel);
             }
         } else if (size == 8) {
             if (load) {
-                *++uop = new MicroLdrFpXImmUop(machInst, rt,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
-                *++uop = new MicroLdrFpXImmUop(machInst, rt2,
-                        INTREG_UREG0, 8, noAlloc, exclusive, acrel);
+                *uop++ = new MicroLdPairFp8Uop(machInst, rt, rt2,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
             } else {
-                *++uop = new MicroStrFpXImmUop(machInst, rt,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
-                *++uop = new MicroStrFpXImmUop(machInst, rt2,
-                        INTREG_UREG0, 8, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrFpXImmUop(machInst, rt,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrFpXImmUop(machInst, rt2,
+                        post ? rn : INTREG_UREG0, 8, noAlloc, exclusive, acrel);
             }
         } else if (size == 4) {
             if (load) {
-                *++uop = new MicroLdrDFpXImmUop(machInst, rt, rt2,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                *uop++ = new MicroLdrDFpXImmUop(machInst, rt, rt2,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
             } else {
-                *++uop = new MicroStrDFpXImmUop(machInst, rt, rt2,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrDFpXImmUop(machInst, rt, rt2,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
             }
         }
     } else {
         if (size == 8) {
             if (load) {
-                *++uop = new MicroLdrXImmUop(machInst, rt, INTREG_UREG0,
-                        0, noAlloc, exclusive, acrel);
-                *++uop = new MicroLdrXImmUop(machInst, rt2, INTREG_UREG0,
-                        size, noAlloc, exclusive, acrel);
+                *uop++ = new MicroLdPairUop(machInst, rt, rt2,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
             } else {
-                *++uop = new MicroStrXImmUop(machInst, rt, INTREG_UREG0,
+                *uop++ = new MicroStrXImmUop(machInst, rt, post ? rn : INTREG_UREG0,
                         0, noAlloc, exclusive, acrel);
-                *++uop = new MicroStrXImmUop(machInst, rt2, INTREG_UREG0,
+                *uop++ = new MicroStrXImmUop(machInst, rt2, post ? rn : INTREG_UREG0,
                         size, noAlloc, exclusive, acrel);
             }
         } else if (size == 4) {
             if (load) {
                 if (signExt) {
-                    *++uop = new MicroLdrDSXImmUop(machInst, rt, rt2,
-                            INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                    *uop++ = new MicroLdrDSXImmUop(machInst, rt, rt2,
+                            post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
                 } else {
-                    *++uop = new MicroLdrDUXImmUop(machInst, rt, rt2,
-                            INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                    *uop++ = new MicroLdrDUXImmUop(machInst, rt, rt2,
+                            post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
                 }
             } else {
-                *++uop = new MicroStrDXImmUop(machInst, rt, rt2,
-                        INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+                *uop++ = new MicroStrDXImmUop(machInst, rt, rt2,
+                        post ? rn : INTREG_UREG0, 0, noAlloc, exclusive, acrel);
             }
         }
     }
 
     if (writeback) {
-        *++uop = new MicroAddXiUop(machInst, rn, INTREG_UREG0,
+        *uop++ = new MicroAddXiUop(machInst, rn, post ? rn : INTREG_UREG0,
                                    post ? imm : 0);
     }
 
-    (*uop)->setLastMicroop();
+    assert(uop == &microOps[numMicroops]);
+    (*--uop)->setLastMicroop();
 
     for (StaticInstPtr *curUop = microOps;
             !(*curUop)->isLastMicroop(); curUop++) {
@@ -297,18 +347,19 @@ BigFpMemImmOp::BigFpMemImmOp(const char *mnem, ExtMachInst machInst,
                              IntRegIndex base, int64_t imm) :
     PredMacroOp(mnem, machInst, __opClass)
 {
-    numMicroops = 2;
+    numMicroops = load ? 1 : 2;
     microOps = new StaticInstPtr[numMicroops];
 
+    StaticInstPtr *uop = microOps;
+
     if (load) {
-        microOps[0] = new MicroLdrQBFpXImmUop(machInst, dest, base, imm);
-        microOps[1] = new MicroLdrQTFpXImmUop(machInst, dest, base, imm);
+        *uop = new MicroLdFp16Uop(machInst, dest, base, imm);
     } else {
-        microOps[0] = new MicroStrQBFpXImmUop(machInst, dest, base, imm);
-        microOps[1] = new MicroStrQTFpXImmUop(machInst, dest, base, imm);
+        *uop = new MicroStrQBFpXImmUop(machInst, dest, base, imm);
+        (*uop)->setDelayedCommit();
+        *++uop = new MicroStrQTFpXImmUop(machInst, dest, base, imm);
     }
-    microOps[0]->setDelayedCommit();
-    microOps[1]->setLastMicroop();
+    (*uop)->setLastMicroop();
 }
 
 BigFpMemPostOp::BigFpMemPostOp(const char *mnem, ExtMachInst machInst,
@@ -316,21 +367,24 @@ BigFpMemPostOp::BigFpMemPostOp(const char *mnem, ExtMachInst machInst,
                                IntRegIndex base, int64_t imm) :
     PredMacroOp(mnem, machInst, __opClass)
 {
-    numMicroops = 3;
+    numMicroops = load ? 2 : 3;
     microOps = new StaticInstPtr[numMicroops];
 
+    StaticInstPtr *uop = microOps;
+
     if (load) {
-        microOps[0] = new MicroLdrQBFpXImmUop(machInst, dest, base, 0);
-        microOps[1] = new MicroLdrQTFpXImmUop(machInst, dest, base, 0);
+        *uop++ = new MicroLdFp16Uop(machInst, dest, base, 0);
     } else {
-        microOps[0] = new MicroStrQBFpXImmUop(machInst, dest, base, 0);
-        microOps[1] = new MicroStrQTFpXImmUop(machInst, dest, base, 0);
+        *uop++= new MicroStrQBFpXImmUop(machInst, dest, base, 0);
+        *uop++ = new MicroStrQTFpXImmUop(machInst, dest, base, 0);
     }
-    microOps[2] = new MicroAddXiUop(machInst, base, base, imm);
+    *uop = new MicroAddXiUop(machInst, base, base, imm);
+    (*uop)->setLastMicroop();
 
-    microOps[0]->setDelayedCommit();
-    microOps[1]->setDelayedCommit();
-    microOps[2]->setLastMicroop();
+    for (StaticInstPtr *curUop = microOps;
+            !(*curUop)->isLastMicroop(); curUop++) {
+        (*curUop)->setDelayedCommit();
+    }
 }
 
 BigFpMemPreOp::BigFpMemPreOp(const char *mnem, ExtMachInst machInst,
@@ -338,21 +392,24 @@ BigFpMemPreOp::BigFpMemPreOp(const char *mnem, ExtMachInst machInst,
                              IntRegIndex base, int64_t imm) :
     PredMacroOp(mnem, machInst, __opClass)
 {
-    numMicroops = 3;
+    numMicroops = load ? 2 : 3;
     microOps = new StaticInstPtr[numMicroops];
 
+    StaticInstPtr *uop = microOps;
+
     if (load) {
-        microOps[0] = new MicroLdrQBFpXImmUop(machInst, dest, base, imm);
-        microOps[1] = new MicroLdrQTFpXImmUop(machInst, dest, base, imm);
+        *uop++ = new MicroLdFp16Uop(machInst, dest, base, imm);
     } else {
-        microOps[0] = new MicroStrQBFpXImmUop(machInst, dest, base, imm);
-        microOps[1] = new MicroStrQTFpXImmUop(machInst, dest, base, imm);
+        *uop++ = new MicroStrQBFpXImmUop(machInst, dest, base, imm);
+        *uop++ = new MicroStrQTFpXImmUop(machInst, dest, base, imm);
     }
-    microOps[2] = new MicroAddXiUop(machInst, base, base, imm);
+    *uop = new MicroAddXiUop(machInst, base, base, imm);
+    (*uop)->setLastMicroop();
 
-    microOps[0]->setDelayedCommit();
-    microOps[1]->setDelayedCommit();
-    microOps[2]->setLastMicroop();
+    for (StaticInstPtr *curUop = microOps;
+            !(*curUop)->isLastMicroop(); curUop++) {
+        (*curUop)->setDelayedCommit();
+    }
 }
 
 BigFpMemRegOp::BigFpMemRegOp(const char *mnem, ExtMachInst machInst,
@@ -361,23 +418,23 @@ BigFpMemRegOp::BigFpMemRegOp(const char *mnem, ExtMachInst machInst,
                              ArmExtendType type, int64_t imm) :
     PredMacroOp(mnem, machInst, __opClass)
 {
-    numMicroops = 2;
+    numMicroops = load ? 1 : 2;
     microOps = new StaticInstPtr[numMicroops];
 
+    StaticInstPtr *uop = microOps;
+
     if (load) {
-        microOps[0] = new MicroLdrQBFpXRegUop(machInst, dest, base,
-                                              offset, type, imm);
-        microOps[1] = new MicroLdrQTFpXRegUop(machInst, dest, base,
-                                              offset, type, imm);
+        *uop = new MicroLdFp16RegUop(machInst, dest, base,
+                                  offset, type, imm);
     } else {
-        microOps[0] = new MicroStrQBFpXRegUop(machInst, dest, base,
-                                              offset, type, imm);
-        microOps[1] = new MicroStrQTFpXRegUop(machInst, dest, base,
-                                              offset, type, imm);
+        *uop = new MicroStrQBFpXRegUop(machInst, dest, base,
+                                       offset, type, imm);
+        (*uop)->setDelayedCommit();
+        *++uop = new MicroStrQTFpXRegUop(machInst, dest, base,
+                                         offset, type, imm);
     }
 
-    microOps[0]->setDelayedCommit();
-    microOps[1]->setLastMicroop();
+    (*uop)->setLastMicroop();
 }
 
 BigFpMemLitOp::BigFpMemLitOp(const char *mnem, ExtMachInst machInst,
@@ -385,14 +442,11 @@ BigFpMemLitOp::BigFpMemLitOp(const char *mnem, ExtMachInst machInst,
                              int64_t imm) :
     PredMacroOp(mnem, machInst, __opClass)
 {
-    numMicroops = 2;
+    numMicroops = 1;
     microOps = new StaticInstPtr[numMicroops];
 
-    microOps[0] = new MicroLdrQBFpXLitUop(machInst, dest, imm);
-    microOps[1] = new MicroLdrQTFpXLitUop(machInst, dest, imm);
-
-    microOps[0]->setDelayedCommit();
-    microOps[1]->setLastMicroop();
+    microOps[0] = new MicroLdFp16LitUop(machInst, dest, imm);
+    microOps[0]->setLastMicroop();
 }
 
 VldMultOp::VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
@@ -1538,4 +1592,20 @@ MicroMemOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
     return ss.str();
 }
 
+std::string  // Disassembly text: mnemonic, then "dest,dest2, [urb, #imm]"
+MicroMemPairOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+    std::stringstream ss;  // pc and symtab are not used by this formatter
+    printMnemonic(ss);
+    printReg(ss, dest);  // first destination register
+    ss << ",";
+    printReg(ss, dest2);  // second destination register
+    ss << ", [";
+    printReg(ss, urb);  // base register of the address
+    ss << ", ";
+    ccprintf(ss, "#%d", imm);  // offset value only; the 'up' flag is not shown
+    ss << "]";
+    return ss.str();
+}
+
 }
diff --git a/src/arch/arm/insts/macromem.hh b/src/arch/arm/insts/macromem.hh
index fc8e3e1b7..412337d06 100644
--- a/src/arch/arm/insts/macromem.hh
+++ b/src/arch/arm/insts/macromem.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013 ARM Limited
+ * Copyright (c) 2010-2014 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -395,6 +395,26 @@ class MicroMemOp : public MicroIntImmOp
     std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
 };
 
+class MicroMemPairOp : public MicroOp  // base for two-destination memory uops
+{
+  protected:
+    RegIndex dest, dest2, urb;  // two destination registers and the base reg
+    bool up;  // ldr2_uop ea_code: EA = URb + imm if true, URb - imm if false
+    int32_t imm;  // offset combined with the base register (see 'up')
+    unsigned memAccessFlags;  // TLB request flags, fixed in the constructor
+
+    MicroMemPairOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+            RegIndex _dreg1, RegIndex _dreg2, RegIndex _base,
+            bool _up, uint8_t _imm)  // NOTE(review): 8-bit arg, 32-bit member
+        : MicroOp(mnem, machInst, __opClass),
+        dest(_dreg1), dest2(_dreg2), urb(_base), up(_up), imm(_imm),
+        memAccessFlags(TLB::MustBeOne | TLB::AlignWord)
+    {
+    }  // protected constructor: only derived micro-op classes can build this
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
 /**
  * Base class for microcoded integer memory instructions.
  */
diff --git a/src/arch/arm/isa/insts/ldr64.isa b/src/arch/arm/isa/insts/ldr64.isa
index 78460f661..eea925e66 100644
--- a/src/arch/arm/isa/insts/ldr64.isa
+++ b/src/arch/arm/isa/insts/ldr64.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2011-2013 ARM Limited
+// Copyright (c) 2011-2014 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -187,35 +187,32 @@ let {{
                         AA64FpDestP2_uw = 0;
                         AA64FpDestP3_uw = 0;
                     '''
-                elif self.size == 8 or (self.size == 16 and not self.top):
+                elif self.size == 8:
                     accCode = '''
                         uint64_t data = cSwap(Mem%s,
                                               isBigEndian64(xc->tcBase()));
                         AA64FpDestP0_uw = (uint32_t)data;
                         AA64FpDestP1_uw = (data >> 32);
+                        AA64FpDestP2_uw = 0;
+                        AA64FpDestP3_uw = 0;
                     '''
-                    # Only zero out the other half if this isn't part of a
-                    # pair of 8 byte loads implementing a 16 byte load.
-                    if self.size == 8:
-                        accCode += '''
-                            AA64FpDestP2_uw = 0;
-                            AA64FpDestP3_uw = 0;
-                        '''
-                elif self.size == 16 and self.top:
+                elif self.size == 16:
                     accCode = '''
-                        uint64_t data = cSwap(Mem%s,
-                                              isBigEndian64(xc->tcBase()));
-                        AA64FpDestP2_uw = (uint32_t)data;
-                        AA64FpDestP3_uw = (data >> 32);
+                    Twin64_t data = cSwap(Mem%s,
+                                          isBigEndian64(xc->tcBase()));
+
+
+                    AA64FpDestP0_uw = (uint32_t)data.a;
+                    AA64FpDestP1_uw = (data.a >> 32);
+                    AA64FpDestP2_uw = (uint32_t)data.b;
+                    AA64FpDestP3_uw = (data.b >> 32);
                     '''
             elif self.flavor == "widen" or self.size == 8:
                 accCode = "XDest = cSwap(Mem%s, isBigEndian64(xc->tcBase()));"
             else:
                 accCode = "WDest = cSwap(Mem%s, isBigEndian64(xc->tcBase()));"
-            if self.size == 16:
-                accCode = accCode % buildMemSuffix(self.sign, 8)
-            else:
-                accCode = accCode % buildMemSuffix(self.sign, self.size)
+
+            accCode = accCode % buildMemSuffix(self.sign, self.size)
 
             self.codeBlobs["memacc_code"] = accCode
 
@@ -231,17 +228,29 @@ let {{
 
             # Code that actually handles the access
             if self.flavor == "fp":
-                accCode = '''
-                    uint64_t data = cSwap(Mem_ud, isBigEndian64(xc->tcBase()));
-                    AA64FpDestP0_uw = (uint32_t)data;
-                    AA64FpDestP1_uw = 0;
-                    AA64FpDestP2_uw = 0;
-                    AA64FpDestP3_uw = 0;
-                    AA64FpDest2P0_uw = (data >> 32);
-                    AA64FpDest2P1_uw = 0;
-                    AA64FpDest2P2_uw = 0;
-                    AA64FpDest2P3_uw = 0;
-                '''
+                if self.size == 4:
+                    accCode = '''
+                        uint64_t data = cSwap(Mem_ud, isBigEndian64(xc->tcBase()));
+                        AA64FpDestP0_uw = (uint32_t)data;
+                        AA64FpDestP1_uw = 0;
+                        AA64FpDestP2_uw = 0;
+                        AA64FpDestP3_uw = 0;
+                        AA64FpDest2P0_uw = (data >> 32);
+                        AA64FpDest2P1_uw = 0;
+                        AA64FpDest2P2_uw = 0;
+                        AA64FpDest2P3_uw = 0;
+                    '''
+                elif self.size == 8:
+                    accCode = '''
+                        AA64FpDestP0_uw = (uint32_t)Mem_tud.a;
+                        AA64FpDestP1_uw = (uint32_t)(Mem_tud.a >> 32);
+                        AA64FpDestP2_uw = 0;
+                        AA64FpDestP3_uw = 0;
+                        AA64FpDest2P0_uw = (uint32_t)Mem_tud.b;
+                        AA64FpDest2P1_uw = (uint32_t)(Mem_tud.b >> 32);
+                        AA64FpDest2P2_uw = 0;
+                        AA64FpDest2P3_uw = 0;
+                    '''
             else:
                 if self.sign:
                     if self.size == 4:
@@ -253,8 +262,8 @@ let {{
                         '''
                     elif self.size == 8:
                         accCode = '''
-                            XDest = sext<64>(Mem_tud.a);
-                            XDest2 = sext<64>(Mem_tud.b);
+                            XDest = Mem_tud.a;
+                            XDest2 = Mem_tud.b;
                         '''
                 else:
                     if self.size == 4:
@@ -416,6 +425,11 @@ let {{
         decConstBase = 'LoadStoreLitU64'
         micro = True
 
+    LoadImmDU64("ldp_uop", "MicroLdPairUop", 8).emit()
+    LoadImmDU64("ldp_fp8_uop", "MicroLdPairFp8Uop", 8, flavor="fp").emit()
+    LoadImmU64("ldfp16_uop", "MicroLdFp16Uop", 16, flavor="fp").emit()
+    LoadReg64("ldfp16reg_uop", "MicroLdFp16RegUop", 16, flavor="fp").emit()
+
     LoadImmDouble64("ldaxp", "LDAXPW64", 4, flavor="acexp").emit()
     LoadImmDouble64("ldaxp", "LDAXPX64", 8, flavor="acexp").emit()
     LoadImmDouble64("ldxp", "LDXPW64", 4, flavor="exp").emit()
@@ -428,18 +442,8 @@ let {{
     LoadRegU64("ldrfpxr_uop", "MicroLdrFpXRegUop", 8, flavor="fp").emit()
     LoadLitU64("ldrfpxl_uop", "MicroLdrFpXLitUop", 8, literal=True,
                flavor="fp").emit()
-    LoadImmU64("ldrqbfpxi_uop", "MicroLdrQBFpXImmUop",
-               16, flavor="fp", top = False).emit()
-    LoadRegU64("ldrqbfpxr_uop", "MicroLdrQBFpXRegUop",
-               16, flavor="fp", top = False).emit()
-    LoadLitU64("ldrqbfpxl_uop", "MicroLdrQBFpXLitUop",
-               16, literal=True, flavor="fp", top = False).emit()
-    LoadImmU64("ldrqtfpxi_uop", "MicroLdrQTFpXImmUop",
-               16, flavor="fp", top = True).emit()
-    LoadRegU64("ldrqtfpxr_uop", "MicroLdrQTFpXRegUop",
-               16, flavor="fp", top = True).emit()
-    LoadLitU64("ldrqtfpxl_uop", "MicroLdrQTFpXLitUop",
-               16, literal=True, flavor="fp", top = True).emit()
+    LoadLitU64("ldfp16_lit__uop", "MicroLdFp16LitUop",
+               16, literal=True, flavor="fp").emit()
     LoadImmDU64("ldrduxi_uop", "MicroLdrDUXImmUop", 4, sign=False).emit()
     LoadImmDU64("ldrdsxi_uop", "MicroLdrDSXImmUop", 4, sign=True).emit()
     LoadImmDU64("ldrdfpxi_uop", "MicroLdrDFpXImmUop", 4, flavor="fp").emit()
diff --git a/src/arch/arm/isa/insts/macromem.isa b/src/arch/arm/isa/insts/macromem.isa
index f164595dd..41060ff01 100644
--- a/src/arch/arm/isa/insts/macromem.isa
+++ b/src/arch/arm/isa/insts/macromem.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2013 ARM Limited
+// Copyright (c) 2010-2014 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -55,6 +55,18 @@ let {{
                                     'predicate_test': predicateTest},
                                    ['IsMicroop'])
 
+    microLdr2UopCode = '''
+                        uint64_t data = Mem_ud;
+                        Dest = cSwap((uint32_t) data, ((CPSR)Cpsr).e);
+                        Dest2 = cSwap((uint32_t) (data >> 32), ((CPSR)Cpsr).e);
+                        '''  # 64-bit read: low word -> Dest, high -> Dest2
+    microLdr2UopIop = InstObjParams('ldr2_uop', 'MicroLdr2Uop',
+                                   'MicroMemPairOp',
+                                   {'memacc_code': microLdr2UopCode,
+                                    'ea_code': 'EA = URb + (up ? imm : -imm);',
+                                    'predicate_test': predicateTest},
+                                   ['IsMicroop'])
+
     microLdrFpUopCode = "Fa_uw = cSwap(Mem_uw, ((CPSR)Cpsr).e);"
     microLdrFpUopIop = InstObjParams('ldrfp_uop', 'MicroLdrFpUop',
                                       'MicroMemOp',
@@ -159,8 +171,8 @@ let {{
 
     header_output = decoder_output = exec_output = ''
 
-    loadIops = (microLdrUopIop, microLdrRetUopIop, microLdrFpUopIop,
-                microLdrDBFpUopIop, microLdrDTFpUopIop)
+    loadIops = (microLdrUopIop, microLdrRetUopIop,
+                microLdrFpUopIop, microLdrDBFpUopIop, microLdrDTFpUopIop)
     storeIops = (microStrUopIop, microStrFpUopIop,
                  microStrDBFpUopIop, microStrDTFpUopIop)
     for iop in loadIops + storeIops:
@@ -174,6 +186,12 @@ let {{
         exec_output += StoreExecute.subst(iop) + \
                        StoreInitiateAcc.subst(iop) + \
                        StoreCompleteAcc.subst(iop)
+    # microLdr2UopIop is not in loadIops; it uses the MicroMemPair templates.
+    header_output += MicroMemPairDeclare.subst(microLdr2UopIop)
+    decoder_output += MicroMemPairConstructor.subst(microLdr2UopIop)
+    exec_output += LoadExecute.subst(microLdr2UopIop) + \
+                   LoadInitiateAcc.subst(microLdr2UopIop) + \
+                   LoadCompleteAcc.subst(microLdr2UopIop)
 }};
 
 let {{
diff --git a/src/arch/arm/isa/insts/mem.isa b/src/arch/arm/isa/insts/mem.isa
index aed6bab0d..7323b02c9 100644
--- a/src/arch/arm/isa/insts/mem.isa
+++ b/src/arch/arm/isa/insts/mem.isa
@@ -193,7 +193,9 @@ let {{
         return Name
 
     def buildMemSuffix(sign, size):
-        if size == 8:
+        if size == 16:
+            memSuffix = '_tud'
+        elif size == 8:
             memSuffix = '_ud'
         elif size == 4:
             if sign:
diff --git a/src/arch/arm/isa/templates/macromem.isa b/src/arch/arm/isa/templates/macromem.isa
index 9a6de16cc..b252c91e7 100644
--- a/src/arch/arm/isa/templates/macromem.isa
+++ b/src/arch/arm/isa/templates/macromem.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2013 ARM Limited
+// Copyright (c) 2010-2014 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -77,6 +77,39 @@ def template MicroMemConstructor {{
     }
 }};
 
+// Declares a pair microop class: ctor takes two regs, a base, up and imm.
+def template MicroMemPairDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst,
+                       RegIndex _dreg1, RegIndex _dreg2, RegIndex _base,
+                       bool _up, uint8_t _imm);  // imm limited to 8 bits
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+    };
+}};
+
+def template MicroMemPairConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                   RegIndex _dreg1,
+                                   RegIndex _dreg2,
+                                   RegIndex _base,
+                                   bool _up,
+                                   uint8_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dreg1, _dreg2, _base, _up, _imm)
+    {
+        %(constructor)s;
+        if (!(condCode == COND_AL || condCode == COND_UC)) {  // not AL/UC
+            for (int x = 0; x < _numDestRegs; x++) {
+                _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];  // also a src
+            }
+        }  // presumably so a failed predicate can keep old dest values
+    }
+}};
+
 ////////////////////////////////////////////////////////////////////
 //
 // Neon load/store microops