From af358c5f7774d57396a493e558704d30d7093efe Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Fri, 12 Aug 2022 01:44:50 +0100
Subject: [PATCH] cleanup of page to remove shift and bitrev

---
 openpower/sv/ldst.mdwn | 42 +++++++++---------------------------------
 1 file changed, 9 insertions(+), 33 deletions(-)

diff --git a/openpower/sv/ldst.mdwn b/openpower/sv/ldst.mdwn
index 8fe06719d..a3d937e7e 100644
--- a/openpower/sv/ldst.mdwn
+++ b/openpower/sv/ldst.mdwn
@@ -72,7 +72,7 @@ with the pseudocode below, the immediate can be used to give unit stride or elem
 
     # LD not VLD!  format - ldop RT, immed(RA)
     # op_width: lb=1, lh=2, lw=4, ld=8
-    op_load(RT, RA, RC, op_width, immed, svctx, RAupdate):
+    op_load(RT, RA, op_width, immed, svctx, RAupdate):
       ps = get_pred_val(FALSE, RA); # predication on src
       pd = get_pred_val(FALSE, RT); # ... AND on dest
       for (i=0, j=0, u=0; i < VL && j < VL;):
@@ -80,14 +80,7 @@ with the pseudocode below, the immediate can be used to give unit stride or elem
         if (RA.isvec) while (!(ps & 1<<i)) i++;
         if (RAupdate.isvec) while (!(ps & 1<<u)) u++;
         if (RT.isvec) while (!(pd & 1<<j)) j++;
-        if svctx.ldstmode == shifted: # for FFT/DCT
-          # FFT/DCT shifted mode
-          if (RA.isvec)
-            srcbase = ireg[RA+i]
-          else
-            srcbase = ireg[RA]
-          offs = (i * immed) << RC
-        elif svctx.ldstmode == elementstride:
+        if svctx.ldstmode == elementstride:
           # element stride mode
           srcbase = ireg[RA]
           offs = i * immed              # j*immed for a ST
@@ -117,15 +110,6 @@ with the pseudocode below, the immediate can be used to give unit stride or elem
         if (RAupdate.isvec) u++;
         if (RT.isvec) j++;
 
-    # reverses the bitorder up to "width" bits
-    def bitrev(val, VL):
-      width = log2(VL)
-      result = 0
-      for _ in range(width):
-        result = (result << 1) | (val & 1)
-        val >>= 1
-      return result
-
 Indexed LD is:
  
     # format: ldop RT, RA, RB
@@ -174,18 +158,12 @@ an alternative table meaning for [[sv/svp64]] mode.  The following modes make se
 * fail-first (where Vector Indexed is banned)
 * Signed Effective Address computation (Vector Indexed only)
 
-Also, given that FFT, DCT and other related algorithms
-are of such high importance in so many areas of Computer
-Science, a special "shift" mode has been added which
-allows part of the immediate to be used instead as RC, a register
-which shifts the immediate `DS << GPR(RC)`.
-
 The table for [[sv/svp64]] for `immed(RA)` is:
 
 | 0-1 |  2  |  3   4  |  description               |
 | --- | --- |---------|--------------------------- |
 | 00  | 0   |  zz els | normal mode                |
-| 00  | 1   |  zz shf | shift mode                 |
+| 00  | 1   |  rsvd   | reserved                   |
 | 01  | inv | CR-bit  | Rc=1: ffirst CR sel        |
 | 01  | inv | els RC1 |  Rc=0: ffirst z/nonz       |
 | 10  |   N | zz  els |  sat mode: N=0/1 u/s       |
@@ -195,9 +173,7 @@ The table for [[sv/svp64]] for `immed(RA)` is:
 The `els` bit is only relevant when `RA.isvec` is clear: this indicates
 whether stride is unit or element:
 
-    if bitreversed:
-        svctx.ldstmode = bitreversed
-    elif RA.isvec:
+    if RA.isvec:
         svctx.ldstmode = indexed
     elif els == 0:
         svctx.ldstmode = unitstride
@@ -233,7 +209,7 @@ The modes for `RA+RB` indexed version are slightly different:
 | 0-1 |  2  |  3   4  |  description              |
 | --- | --- |---------|-------------------------- |
 | 00  | SEA |  dz  sz | normal mode        |
-| 01  | SEA | dz sz  | Strided (scalar only source)   |
+| 01  | SEA | dz sz   | Strided (scalar only source)   |
 | 10  |   N | dz   sz |  sat mode: N=0/1 u/s |
 | 11  | inv | CR-bit  |  Rc=1: pred-result CR sel |
 | 11  | inv | zz  RC1 |  Rc=0: pred-result z/nonz |
@@ -287,13 +263,13 @@ to *always* set VL=1 which will have the effect of terminating any
 speculative probing (and also adversely affect performance), but will
 at least not require applications to be rewritten.
 
-Low-performance simpler hardware implementations may
+Low-performance, simpler hardware implementations may
 choose (always) to also set VL=1 as the bare minimum compliant implementation of
 LD/ST Fail-First. It is however critically important to remember that
 the first element LD/ST **MUST** be treated as an ordinary LD/ST, i.e.
 **MUST** raise exceptions exactly like an ordinary LD/ST.
 
-For ffirst LD/STs, VL may be truncated arbitrarily to a nonzero value for any implementation-specific reason. For example: it is perfectly reasonable for implementations to alter VL when ffirst LD or ST operations are initiated on a nonaligned boundary, such that within a loop the subsequent iteration of that loop begins subsequent ffirst LD/ST operations on an aligned boundary
+For ffirst LD/STs, VL may be truncated arbitrarily to a nonzero value for any implementation-specific reason. For example: it is perfectly reasonable for implementations to alter VL when ffirst LD or ST operations are initiated on a nonaligned boundary, such that within a loop the subsequent iteration of that loop begins the following ffirst LD/ST operations on an aligned boundary
 such as the beginning of a cache line, or beginning of a Virtual Memory
 page. Likewise, to reduce workloads or balance resources.
 
@@ -304,7 +280,7 @@ order as part of explicit loops, it is neither possible nor
 safe to make speculative assumptions about future LD/STs.
 Therefore, Fail-First LD/ST in Vertical-First is `UNDEFINED`.
 This is very different from Arithmetic (Data-dependent) FFirst
-where Vertical-First Mode is deterministic, not speculative.
+where Vertical-First Mode is fully deterministic, not speculative.
 
 # LOAD/STORE Elwidths <a name="elwidth"></a>
 
@@ -407,7 +383,7 @@ and other modes have all been removed, for clarity and simplicity:
 
 # Remapped LD/ST
 
-In the [[sv/propagation]] page the concept of "Remapping" is described.
+In the [[sv/remap]] page the concept of "Remapping" is described.
 Whilst it is expensive to set up (2 64-bit opcodes minimum) it provides
 a way to arbitrarily perform 1D, 2D and 3D "remapping" of up to 64
 elements worth of LDs or STs.  The usual interest in such re-mapping
-- 
2.30.2