From 92699fd1e435223cb3a6154ab0a9cc831f000572 Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Sat, 20 May 2023 11:24:53 +0100
Subject: [PATCH]

---
 openpower/sv/ldst.mdwn | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/openpower/sv/ldst.mdwn b/openpower/sv/ldst.mdwn
index 836c98573..5b497a31b 100644
--- a/openpower/sv/ldst.mdwn
+++ b/openpower/sv/ldst.mdwn
@@ -636,26 +636,28 @@ have all been removed, for clarity and simplicity:
             # unit / element stride mode, compute 64 bit address
             srcbase = get_polymorphed_reg(RA, 64, 0)
             # adjust for unit/el-stride
-            srcbase += ....
+            srcbase += ....  # uses op_width here
 
         # read the underlying memory
         memread <= MEM(srcbase + imm_offs, op_width)
 
         # truncate/extend to over-ridden dest width.
-        memread = adjust_wid(memread, op_width, svctx.dest_elwidth)
+        memread = adjust_wid(memread, op_width, svctx.elwidth)
 
         # takes care of inserting memory-read (now correctly byteswapped)
         # into regfile underlying LE-defined order, into the right place
-        # within the NEON-like register, respecting destination element
-        # bitwidth, and the element index (j)
-        set_polymorphed_reg(RT, svctx.dest_elwidth, j, memread)
+        # using Element-Packing starting at register RT, respecting destination
+        # element bitwidth, and the element index (j)
+        set_polymorphed_reg(RT, svctx.elwidth, j, memread)
 
         # increments both src and dest element indices (no predication here)
         i++;
         j++;
 ```
 
-Note above that the source elwidth is *not used at all* in LD-immediate.
+Note above that the source elwidth is *not used at all* in LD-immediate: RA
+never has elwidth overrides, leaving the elwidth free for truncation/extension
+of the result.
 
 For LD/Indexed, the key is that in the calculation of the Effective Address,
 RA has no elwidth override but RB does.  Pseudocode below is simplified
@@ -687,19 +689,31 @@ for clarity: predication and all modes are removed:
             memread = byteswap(memread, op_width)
 
         # truncate/extend to over-ridden dest width.
-        memread = adjust_wid(memread, op_width, svctx.dest_elwidth)
+        dest_width = op_width if RT.isvec else 64
+        memread = adjust_wid(memread, op_width, dest_width)
 
         # takes care of inserting memory-read (now correctly byteswapped)
         # into regfile underlying LE-defined order, into the right place
         # within the NEON-like register, respecting destination element
         # bitwidth, and the element index (j)
-        set_polymorphed_reg(RT, svctx.dest_elwidth, j, memread)
+        set_polymorphed_reg(RT, dest_width, j, memread)
 
         # increments both src and dest element indices (no predication here)
         i++;
         j++;
 ```
 
+*Programmer's note: with no destination elwidth override, the destination
+width must be implicitly ascertained.  The assumption is that if the destination
+is a Scalar then the entire 64-bit register must be written, thus the width is
+extended to 64-bit.  If however the destination is a Vector then it is deemed
+appropriate to use the LD/ST width and to perform contiguous register element
+packing at that width.  The justification for doing so is that if further
+sign-extension or saturation is required after a LD, it may be performed by a
+follow-up instruction that uses a source elwidth override matching the exact width
+of the LD operation.  Correspondingly, for a ST, a destination elwidth override
+on a prior instruction may match the exact width of the ST instruction.*
+
 ## Remapped LD/ST
 
 In the [[sv/remap]] page the concept of "Remapping" is described.  Whilst
-- 
2.30.2