From 3e04fe8f31972b048a90270a973b192fa74a3a1b Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Sat, 20 Aug 2022 12:46:11 +0100
Subject: [PATCH]

---
 openpower/sv/ldst.mdwn | 55 ++++++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/openpower/sv/ldst.mdwn b/openpower/sv/ldst.mdwn
index 02e6358a9..ad03600fd 100644
--- a/openpower/sv/ldst.mdwn
+++ b/openpower/sv/ldst.mdwn
@@ -438,12 +438,13 @@ Below is the pseudocode for Unit-Strided LD (which includes Vector capability).
 both Immediate and Indexed LD/ST,
 does not have element-width overriding applied to it.
 
-Note that twin predication, predication-zeroing, saturation
-and other modes have all been removed, for clarity and simplicity:
+Note that predication, predication-zeroing,
+and other modes except saturation have all been removed,
+for clarity and simplicity:
 
-    # LD not VLD! (ldbrx if brev=True)
+    # LD not VLD!
     # this covers unit stride mode and a type of vector offset
-    function op_ld(RT, RA, brev, op_width, imm_offs, svctx)
+    function op_ld(RT, RA, op_width, imm_offs, svctx)
       for (int i = 0, int j = 0; i < svctx.VL && j < svctx.VL):
         if not svctx.unit/el-strided:
             # strange vector mode, compute 64 bit address which is
@@ -455,16 +456,9 @@ and other modes have all been removed, for clarity and simplicity:
             # adjust for unit/el-stride
             srcbase += ....
 
-        # takes care of (merges) processor LE/BE and ld/ldbrx
-        bytereverse = brev XNOR MSR.LE
-
         # read the underlying memory
         memread <= MEM(srcbase + imm_offs, op_width)
 
-        # optionally performs byteswap at op width
-        if (bytereverse):
-            memread = byteswap(memread, op_width)
-
         # check saturation.
         if svpctx.saturation_mode:
             # ... saturation adjustment...
@@ -483,11 +477,14 @@ and other modes have all been removed, for clarity and simplicity:
         i++;
         j++;
 
-For LD/Indexed, the key here is that in the calculation of the Effective Address,
-RA has no elwidth override but RB does.
+Note above that the source elwidth is *not used at all* in LD-immediate.
 
-    # LD not VLD!
-    function op_ld(RT, RA, RB, op_width, svctx)
+For LD/Indexed, the key is that in the calculation of the Effective Address,
+RA has no elwidth override but RB does.  Pseudocode below is simplified
+for clarity: predication and all modes except saturation are removed:
+
+    # LD not VLD! (l*brx, e.g. ldbrx, if brev; otherwise l*x)
+    function op_ld(RT, RA, RB, op_width, svctx, brev)
       for (int i = 0, int j = 0; i < svctx.VL && j < svctx.VL):
         if not svctx.el-strided:
             # RA not polymorphic! elwidth hardcoded to 64 here
@@ -499,11 +496,33 @@ RA has no elwidth override but RB does.
         offs = get_polymorphed_reg(RB, svctx.src_elwidth, i)
         # sign-extend
         if svctx.SEA: offs = sext(offs, svctx.src_elwidth, 64)
+
+        # takes care of (merges) processor LE/BE and ld/ldbrx
+        bytereverse = brev XNOR MSR.LE
+
         # read the underlying memory
         memread <= MEM(srcbase + offs, op_width)
-        # proceed to check saturation 
-        ...
-        ...
+
+        # optionally performs byteswap at op width
+        if (bytereverse):
+            memread = byteswap(memread, op_width)
+
+        if svctx.saturation_mode:
+            # ... saturation adjustment...
+            memread = clamp(memread, op_width, svctx.dest_elwidth)
+        else:
+            # truncate/extend to over-ridden dest width.
+            memread = adjust_wid(memread, op_width, svctx.dest_elwidth)
+
+        # takes care of inserting memory-read (now correctly byteswapped)
+        # into regfile underlying LE-defined order, into the right place
+        # within the NEON-like register, respecting destination element
+        # bitwidth, and the element index (j)
+        set_polymorphed_reg(RT, svctx.dest_elwidth, j, memread)
+
+        # increments both src and dest element indices (no predication here)
+        i++;
+        j++;
 
 # Remapped LD/ST
 
-- 
2.30.2