From cbedd169c2d0848fb5bedb076ae401946067ec91 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Sun, 1 Aug 2021 18:37:45 +0100
Subject: [PATCH] bit of a big update, remove all bit-reversed LD operations,
 replace with LD-with-shift, and fix LDST, DCT and FFT unit tests to use new
 bitrev-with-half-swap REMAP modes

---
 openpower/isa/simplev.mdwn                    |  7 +-
 openpower/isa/svfixedload.mdwn                | 22 ++---
 openpower/isa/svfpload.mdwn                   | 16 ++--
 src/openpower/consts.py                       |  2 +-
 src/openpower/decoder/isa/caller.py           | 30 ++----
 src/openpower/decoder/isa/remap_dct_yield.py  | 10 +-
 src/openpower/decoder/isa/svshape.py          |  2 +-
 .../decoder/isa/test_caller_svp64_dct.py      |  4 +-
 .../decoder/isa/test_caller_svp64_fft.py      | 17 +++-
 .../decoder/isa/test_caller_svp64_ldst.py     | 93 +++++++------------
 src/openpower/decoder/power_decoder2.py       |  8 +-
 src/openpower/decoder/power_enums.py          |  2 +-
 src/openpower/decoder/power_svp64_rm.py       |  6 +-
 src/openpower/sv/trans/svp64.py               | 40 ++++----
 14 files changed, 114 insertions(+), 145 deletions(-)

diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn
index 0ad1da1b..91a53841 100644
--- a/openpower/isa/simplev.mdwn
+++ b/openpower/isa/simplev.mdwn
@@ -245,13 +245,16 @@ Pseudo-code:
         SVSHAPE1[28:29] <- 0b10           # ci schedule
         SVSHAPE2[28:29] <- 0b11           # size schedule
     # set schedule up for iDCT / DCT inverse of half-swapped ordering
-    if (SVRM = 0b0110) | (SVRM = 0b1110) then
+    if (SVRM = 0b0110) | (SVRM = 0b1110) | (SVRM = 0b1111) then
         vlen[0:6] <- (0b00 || SVxd) + 0b0000001
         # set up template in SVSHAPE0
         SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
         if (SVRM = 0b1110) then
             SVSHAPE0[18:20] <- 0b001     # DCT opposite half-swap
-        SVSHAPE0[30:31] <- 0b01          # DCT/FFT mode
+        if (SVRM = 0b1111) then
+            SVSHAPE0[30:31] <- 0b01          # FFT mode
+        else
+            SVSHAPE0[30:31] <- 0b11          # DCT mode
         SVSHAPE0[6:11] <- 0b000101       # DCT "half-swap" mode
     # set VL, MVL and Vertical-First
     SVSTATE[0:6] <- vlen
diff --git a/openpower/isa/svfixedload.mdwn b/openpower/isa/svfixedload.mdwn
index f8d9de9c..d97eca95 100644
--- a/openpower/isa/svfixedload.mdwn
+++ b/openpower/isa/svfixedload.mdwn
@@ -12,7 +12,7 @@ Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(EXTS(SVD), n)
+    EA <- b + SHL64(srcstep * EXTS(SVD), n)
     RT <- [0]*56 || MEM(EA, 1)
 
 Special Registers Altered:
@@ -28,7 +28,7 @@ SVD-Form
 Pseudo-code:
 
     n <- (RC)[58:63]
-    EA <- (RA) + SHL64(EXTS(SVD), n)
+    EA <- (RA) + SHL64(srcstep * EXTS(SVD), n)
     RT <- [0] * 56 || MEM(EA, 1)
     RA <- EA
 
@@ -46,7 +46,7 @@ Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(EXTS(SVD), n)
+    EA <- b + SHL64(srcstep * EXTS(SVD), n)
     RT <- [0] * 48 || MEM(EA, 2)
 
 Special Registers Altered:
@@ -62,7 +62,7 @@ SVD-Form
 Pseudo-code:
 
     n <- (RC)[58:63]
-    EA <- (RA) + SHL64(EXTS(SVD), n)
+    EA <- (RA) + SHL64(srcstep * EXTS(SVD), n)
     RT <- [0] * 48 || MEM(EA, 2)
     RA <- EA
 
@@ -80,7 +80,7 @@ Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(EXTS(SVD), n)
+    EA <- b + SHL64(srcstep * EXTS(SVD), n)
     RT <- EXTS(MEM(EA, 2))
 
 Special Registers Altered:
@@ -96,7 +96,7 @@ SVD-Form
 Pseudo-code:
 
     n <- (RC)[58:63]
-    EA <- (RA) + SHL64(EXTS(SVD), n)
+    EA <- (RA) + SHL64(srcstep * EXTS(SVD), n)
     RT <- EXTS(MEM(EA, 2))
     RA <- EA
 
@@ -114,7 +114,7 @@ Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(EXTS(SVD), n)
+    EA <- b + SHL64(srcstep * EXTS(SVD), n)
     RT <- [0] * 32 || MEM(EA, 4)
 
 Special Registers Altered:
@@ -130,7 +130,7 @@ SVD-Form
 Pseudo-code:
 
     n <- (RC)[58:63]
-    EA <- (RA) + SHL64(EXTS(SVD), n)
+    EA <- (RA) + SHL64(srcstep * EXTS(SVD), n)
     RT <- [0]*32 || MEM(EA, 4)
     RA <- EA
 
@@ -148,7 +148,7 @@ Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(bitrev(EXTS(SVDS || 0b00), n)
+    EA <- b + SHL64(srcstep * EXTS(SVDS || 0b00), n)
     RT <- EXTS(MEM(EA, 4))
 
 Special Registers Altered:
@@ -165,7 +165,7 @@ Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(EXTS(SVDS || 0b00), n)
+    EA <- b + SHL64(srcstep * EXTS(SVDS || 0b00), n)
     RT <- MEM(EA, 8)
 
 Special Registers Altered:
@@ -181,7 +181,7 @@ SVDS-Form
 Pseudo-code:
 
     n <- (RC)[58:63]
-    EA <- (RA) + SHL64(EXTS(SVDS || 0b00), n)
+    EA <- (RA) + SHL64(srcstep * EXTS(SVDS || 0b00), n)
     RT <- MEM(EA, 8)
     RA <- EA
 
diff --git a/openpower/isa/svfpload.mdwn b/openpower/isa/svfpload.mdwn
index 533ec829..d8856c50 100644
--- a/openpower/isa/svfpload.mdwn
+++ b/openpower/isa/svfpload.mdwn
@@ -6,13 +6,13 @@
 
 SVD-Form
 
-* lfsbr FRT,SVD(RA),RC
+* lfssh FRT,SVD(RA),RC
 
 Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n)
+    EA <- b + SHL64(srcstep * EXTS(SVD), n)
     FRT <- DOUBLE(MEM(EA, 4))
 
 Special Registers Altered:
@@ -23,12 +23,12 @@ Special Registers Altered:
 
 SVD-Form
 
-* lfsubr FRT,SVD(RA),RC
+* lfsush FRT,SVD(RA),RC
 
 Pseudo-code:
 
     n <- (RC)[58:63]
-    EA <- (RA) + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n)
+    EA <- (RA) + SHL64(srcstep * EXTS(SVD), n)
     FRT <- DOUBLE(MEM(EA, 4))
     RA <- EA
 
@@ -40,13 +40,13 @@ Special Registers Altered:
 
 SVD-Form
 
-* lfdbr FRT,SVD(RA),RC
+* lfdsh FRT,SVD(RA),RC
 
 Pseudo-code:
 
     b <- (RA|0)
     n <- (RC)[58:63]
-    EA <- b + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n)
+    EA <- b + SHL64(srcstep * EXTS(SVD), n)
     FRT <- MEM(EA, 8)
 
 Special Registers Altered:
@@ -57,12 +57,12 @@ Special Registers Altered:
 
 SVD-Form
 
-* lfdubr FRT,SVD(RA),RC
+* lfdush FRT,SVD(RA),RC
 
 Pseudo-code:
 
     n <- (RC)[58:63]
-    EA <- (RA) + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n)
+    EA <- (RA) + SHL64(srcstep * EXTS(SVD), n)
     FRT <- MEM(EA, 8)
     RA <- EA
 
diff --git a/src/openpower/consts.py b/src/openpower/consts.py
index 5c60bd97..747c9d12 100644
--- a/src/openpower/consts.py
+++ b/src/openpower/consts.py
@@ -222,7 +222,7 @@ class SVP64MODEb:
     # mode bits
     MOD2_MSB = 0
     MOD2_LSB = 1
-    LDST_BITREV = 2 # set =1 for bitreverse mode
+    LDST_SHIFT = 2 # set =1 for shift mode
     # when predicate not set: 0=ignore/skip 1=zero
     DZ = 3  # for destination
     SZ = 4  # for source
diff --git a/src/openpower/decoder/isa/caller.py b/src/openpower/decoder/isa/caller.py
index 8150785b..413494ca 100644
--- a/src/openpower/decoder/isa/caller.py
+++ b/src/openpower/decoder/isa/caller.py
@@ -28,7 +28,7 @@ from openpower.decoder.power_enums import (spr_dict, spr_byname, XER_bits,
 
 from openpower.decoder.power_enums import SVPtype
 
-from openpower.decoder.helpers import (exts, gtu, ltu, undefined, bitrev)
+from openpower.decoder.helpers import (exts, gtu, ltu, undefined)
 from openpower.consts import PIb, MSRb  # big-endian (PowerISA versions)
 from openpower.consts import SVP64CROffs
 from openpower.decoder.power_svp64 import SVP64RM, decode_extra
@@ -931,8 +931,6 @@ class ISACaller:
         yield self.dec2.dec.raw_opcode_in.eq(ins & 0xffffffff) # v3.0B suffix
         yield self.dec2.sv_rm.eq(sv_rm)                        # svp64 prefix
         yield Settle()
-        # store this for use in get_src_dststeps()
-        self.ldstmode = yield self.dec2.rm_dec.ldstmode
 
     def execute_one(self):
         """execute one instruction
@@ -1318,13 +1316,13 @@ class ISACaller:
         replace_d = False # update / replace constant in pseudocode
         if self.is_svp64_mode:
             ldstmode = yield self.dec2.rm_dec.ldstmode
-            # bitreverse mode reads SVD (or SVDS - TODO)
+            # shift mode reads SVD (or SVDS - TODO)
             # *BUT*... because this is "overloading" of LD operations,
             # it gets *STORED* into D (or DS, TODO)
-            if ldstmode == SVP64LDSTmode.BITREVERSE.value:
+            if ldstmode == SVP64LDSTmode.SHIFT.value:
                 imm = yield self.dec2.dec.fields.FormSVD.SVD[0:11]
                 imm = exts(imm, 11) # sign-extend to integer
-                log ("bitrev SVD", imm)
+                log ("shift SVD", imm)
                 replace_d = True
             else:
                 if info.form == 'DS':
@@ -1348,11 +1346,11 @@ class ISACaller:
                 offsmul = dststep
                 log("D-field dst", imm, offsmul)
             # bit-reverse mode, rev already done through get_src_dst_steps()
-            if ldstmode == SVP64LDSTmode.BITREVERSE.value:
+            if ldstmode == SVP64LDSTmode.SHIFT.value:
                 # manually look up RC, sigh
                 RC = yield self.dec2.dec.RC[0:5]
                 RC = self.gpr(RC)
-                log ("LD-BITREVERSE:", "VL", vl,
+                log ("LD-SHIFT:", "VL", vl,
                       "RC", RC.value, "imm", imm,
                      "offs", bin(offsmul),
                      )
@@ -1665,21 +1663,9 @@ class ISACaller:
         log ("    new dststep", dststep)
 
     def get_src_dststeps(self):
-        """gets srcstep and dststep but performs bit-reversal on srcstep if
-        required.  use this ONLY to perform calculations, do NOT update
-        SVSTATE with the bit-reversed value of srcstep
-
-        ARGH, had to store self.ldstmode and VL due to yield issues
+        """gets srcstep and dststep (as-is: no bit-reversal is applied)
         """
-        srcstep, dststep = self.new_srcstep, self.new_dststep
-        if self.is_svp64_mode:
-            if self.ldstmode == SVP64LDSTmode.BITREVERSE.value:
-                vl = self.svstate.vl
-                log ("SRCSTEP-BITREVERSE:", "VL", vl, "srcstep", srcstep,
-                     "rev", bin(bitrev(srcstep, vl)))
-                srcstep = bitrev(srcstep, vl)
-
-        return (srcstep, dststep)
+        return self.new_srcstep, self.new_dststep
 
     def update_new_svstate_steps(self):
         # note, do not get the bit-reversed srcstep here!
diff --git a/src/openpower/decoder/isa/remap_dct_yield.py b/src/openpower/decoder/isa/remap_dct_yield.py
index e97f2ec6..c2758444 100644
--- a/src/openpower/decoder/isa/remap_dct_yield.py
+++ b/src/openpower/decoder/isa/remap_dct_yield.py
@@ -57,13 +57,15 @@ def iterate_dct_inner_halfswap_loadstore(SVSHAPE):
     ji = list(range(n))
 
     levels = n.bit_length() - 1
-    if SVSHAPE.submode2 == 0b001:
+    ri = [reverse_bits(i, levels) for i in range(n)]
+
+    if SVSHAPE.mode == 0b01: # FFT, bitrev only
+        ji = [ji[ri[i]] for i in range(n)]
+    elif SVSHAPE.submode2 == 0b001:
+        ji = [ji[ri[i]] for i in range(n)]
         ji = halfrev2(ji, True)
     else:
         ji = halfrev2(ji, False)
-
-    if False: # swap: TODO, add extra bit-reverse mode
-        ri = [reverse_bits(i, levels) for i in range(n)]
         ji = [ji[ri[i]] for i in range(n)]
 
     # invert order if requested
diff --git a/src/openpower/decoder/isa/svshape.py b/src/openpower/decoder/isa/svshape.py
index e445f583..4d4cb1d2 100644
--- a/src/openpower/decoder/isa/svshape.py
+++ b/src/openpower/decoder/isa/svshape.py
@@ -128,7 +128,7 @@ class SVSHAPE(SelectableInt):
                 iterate_fn = iterate_dct_outer_butterfly_indices
             elif self.ydimsz in [5, 13]:
                 iterate_fn = iterate_dct_inner_costable_indices
-            elif self.ydimsz == 6:
+            elif self.ydimsz in [6, 14, 15]:
                 iterate_fn = iterate_dct_inner_halfswap_loadstore
         # create a **NEW** iterator each time this is called
         return iterate_fn(deepcopy(self))
diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py
index ccdae236..f19b1d3e 100644
--- a/src/openpower/decoder/isa/test_caller_svp64_dct.py
+++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py
@@ -921,7 +921,7 @@ class DCTTestCase(FHDLTestCase):
         """>>> lst = [# LOAD bit-reversed with half-swap
                       "svshape 8, 1, 1, 6, 0",
                       "svremap 1, 0, 0, 0, 0, 0, 0, 1",
-                      "sv.lfsbr 0.v, 4(1), 2",
+                      "sv.lfssh 0.v, 4(1), 2",
                       # Inner butterfly, twin +/- MUL-ADD-SUB
                       "svremap 31, 1, 0, 2, 0, 1, 1",
                       "svshape 8, 1, 1, 4, 0",
@@ -939,7 +939,7 @@ class DCTTestCase(FHDLTestCase):
         lst = SVP64Asm( ["addi 1, 0, 0x000",
                          "svshape 8, 1, 1, 6, 0",
                          "svremap 1, 0, 0, 0, 0, 0, 0, 1",
-                         "sv.lfsbr 0.v, 4(1), 2",
+                         "sv.lfssh 0.v, 4(1), 2",
                          "svremap 31, 1, 0, 2, 0, 1, 1",
                          "svshape 8, 1, 1, 4, 0",
                          "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
diff --git a/src/openpower/decoder/isa/test_caller_svp64_fft.py b/src/openpower/decoder/isa/test_caller_svp64_fft.py
index 28aca452..25970efd 100644
--- a/src/openpower/decoder/isa/test_caller_svp64_fft.py
+++ b/src/openpower/decoder/isa/test_caller_svp64_fft.py
@@ -530,10 +530,16 @@ class FFTTestCase(FHDLTestCase):
 
             however it turns out that they can be *merged*, and for
             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
-            *ignore* their REMAPs (by definition), and for the second
-            one (sv.ffads) exactly the right REMAPs are also ignored!
+            *ignore* their REMAPs (by definition, because you can't REMAP
+            scalar operands), and for the second one (sv.ffads) exactly the
+            right REMAPs are also ignored!
 
+            therefore we can merge:
+                "svremap 5, 1, 0, 2, 0, 0, 1",
+                "svremap 26, 0, 0, 0, 0, 1, 1",
+            into:
                 "svremap 31, 1, 0, 2, 0, 1, 1",
+            and save one instruction.
         """
         lst = SVP64Asm( [
                         # set triple butterfly mode with persistent "REMAP"
@@ -682,7 +688,7 @@ class FFTTestCase(FHDLTestCase):
 
     def test_sv_remap_fpmadds_fft_ldst(self):
         """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
-                         "sv.lfsbr 0.v, 4(0), 20", # bit-reversed
+                         "sv.lfssh 0.v, 4(0), 20", # shifted
                          "svshape 8, 1, 1, 1, 0",
                          "svremap 31, 1, 0, 2, 0, 1, 0",
                          "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
@@ -690,8 +696,9 @@ class FFTTestCase(FHDLTestCase):
             runs a full in-place O(N log2 N) butterfly schedule for
             Discrete Fourier Transform, using bit-reversed LD/ST
         """
-        lst = SVP64Asm( ["setvl 0, 0, 8, 0, 1, 1",
-                         "sv.lfsbr 0.v, 4(0), 20", # bit-reversed
+        lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0",
+                         "svremap 1, 0, 0, 0, 0, 0, 0, 0",
+                         "sv.lfssh 0.v, 4(0), 20", # shifted
                          "svshape 8, 1, 1, 1, 0",
                          "svremap 31, 1, 0, 2, 0, 1, 0",
                          "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
diff --git a/src/openpower/decoder/isa/test_caller_svp64_ldst.py b/src/openpower/decoder/isa/test_caller_svp64_ldst.py
index eee4d51f..c76fc874 100644
--- a/src/openpower/decoder/isa/test_caller_svp64_ldst.py
+++ b/src/openpower/decoder/isa/test_caller_svp64_ldst.py
@@ -118,7 +118,7 @@ class DecoderTestCase(FHDLTestCase):
             self.assertEqual(sim.gpr(12), SelectableInt(0x1234, 64))
             self.assertEqual(sim.gpr(13), SelectableInt(0x1235, 64))
 
-    def test_sv_load_store_bitreverse(self):
+    def test_sv_load_store_shifted(self):
         """>>> lst = ["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0004",
                         "addi 3, 0, 0x0002",
@@ -127,21 +127,11 @@ class DecoderTestCase(FHDLTestCase):
                         "addi 7, 0, 0x303",
                         "addi 8, 0, 0x404",
                         "sv.stw 5.v, 0(1)",
-                        "sv.lwzbr 12.v, 4(1), 2"]
+                        "sv.lwzsh 12.v, 4(1), 2"]
 
-        note: bitreverse mode is... odd.  it's the butterfly generator
-        from Cooley-Tukey FFT:
-        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
-
-        bitreverse LD is computed as:
+        shifted LD is computed as:
         for i in range(VL):
-            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC
-
-        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
-        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11
-
-        and thus creates the butterfly needed for one iteration of FFT.
-        the RC (shift) is to be able to offset the LDs by Radix-2 spans
+            EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC
         """
         lst = SVP64Asm(["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0000",
@@ -150,7 +140,7 @@ class DecoderTestCase(FHDLTestCase):
                         "addi 7, 0, 0x303",
                         "addi 8, 0, 0x404",
                         "sv.stw 5.v, 0(1)",  # scalar r1 + 0 + wordlen*offs
-                        "sv.lwzbr 12.v, 4(1), 2"]) # bit-reversed
+                        "sv.lwzsh 12.v, 4(1), 2"]) # shifted
         lst = list(lst)
 
         # SVSTATE (in this case, VL=4)
@@ -173,21 +163,21 @@ class DecoderTestCase(FHDLTestCase):
             self.assertEqual(sim.gpr(7), SelectableInt(0x303, 64))
             self.assertEqual(sim.gpr(8), SelectableInt(0x404, 64))
             # r1=0x10, RC=0, offs=4: contents of memory expected at:
-            #    element 0:   EA = r1 + bitrev(0b00)*4 => 0x10 + 0b00*4 => 0x10
-            #    element 1:   EA = r1 + bitrev(0b01)*4 => 0x10 + 0b10*4 => 0x18
-            #    element 2:   EA = r1 + bitrev(0b10)*4 => 0x10 + 0b01*4 => 0x14
-            #    element 3:   EA = r1 + bitrev(0b11)*4 => 0x10 + 0b10*4 => 0x1c
+            #    element 0:   EA = r1 + 0b00*4 => 0x10 + 0b00*4 => 0x10
+            #    element 1:   EA = r1 + 0b01*4 => 0x10 + 0b01*4 => 0x14
+            #    element 2:   EA = r1 + 0b10*4 => 0x10 + 0b10*4 => 0x18
+            #    element 3:   EA = r1 + 0b11*4 => 0x10 + 0b11*4 => 0x1c
             # therefore loaded from (bit-reversed indexing):
             #    r9  => mem[0x10] which was stored from r5
             #    r10 => mem[0x18] which was stored from r6
             #    r11 => mem[0x18] which was stored from r7
             #    r12 => mem[0x1c] which was stored from r8
             self.assertEqual(sim.gpr(12), SelectableInt(0x101, 64))
-            self.assertEqual(sim.gpr(13), SelectableInt(0x303, 64))
-            self.assertEqual(sim.gpr(14), SelectableInt(0x202, 64))
+            self.assertEqual(sim.gpr(13), SelectableInt(0x202, 64))
+            self.assertEqual(sim.gpr(14), SelectableInt(0x303, 64))
             self.assertEqual(sim.gpr(15), SelectableInt(0x404, 64))
 
-    def test_sv_load_store_bitreverse_fp(self):
+    def test_sv_load_store_shifted_fp(self):
         """>>> lst = ["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0004",
                         "addi 3, 0, 0x0002",
@@ -198,19 +188,9 @@ class DecoderTestCase(FHDLTestCase):
                         "sv.std 5.v, 0(1)",
                         "sv.lfdbr 12.v, 4(1), 2"]
 
-        note: bitreverse mode is... odd.  it's the butterfly generator
-        from Cooley-Tukey FFT:
-        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
-
-        bitreverse LD is computed as:
+        shifted LD is computed as:
         for i in range(VL):
-            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC
-
-        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
-        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11
-
-        and thus creates the butterfly needed for one iteration of FFT.
-        the RC (shift) is to be able to offset the LDs by Radix-2 spans
+            EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC
         """
         lst = SVP64Asm(["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0000",
@@ -219,7 +199,7 @@ class DecoderTestCase(FHDLTestCase):
                         "addi 7, 0, 0x303",
                         "addi 8, 0, 0x404",
                         "sv.std 5.v, 0(1)", # scalar r1 + 0 + wordlen*offs
-                        "sv.lfdbr 12.v, 8(1), 2"]) # bit-reversed
+                        "sv.lfdsh 12.v, 8(1), 2"]) # shifted
         lst = list(lst)
 
         # SVSTATE (in this case, VL=4)
@@ -258,35 +238,26 @@ class DecoderTestCase(FHDLTestCase):
             #    r11 => mem[0x18] which was stored from r7
             #    r12 => mem[0x1c] which was stored from r8
             self.assertEqual(sim.fpr(12), SelectableInt(0x101, 64))
-            self.assertEqual(sim.fpr(13), SelectableInt(0x303, 64))
-            self.assertEqual(sim.fpr(14), SelectableInt(0x202, 64))
+            self.assertEqual(sim.fpr(13), SelectableInt(0x202, 64))
+            self.assertEqual(sim.fpr(14), SelectableInt(0x303, 64))
             self.assertEqual(sim.fpr(15), SelectableInt(0x404, 64))
 
-    def test_sv_load_store_bitreverse2(self):
+    def test_sv_load_store_shifted2(self):
         """>>> lst = ["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0004",
                         "addi 3, 0, 0x0002",
                         "sv.stfs 4.v, 0(1)",
-                        "sv.lfsbr 12.v, 4(1), 2"]
-
-        note: bitreverse mode is... odd.  it's the butterfly generator
-        from Cooley-Tukey FFT:
-        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
+                        "sv.lfssh 12.v, 4(1), 2"]
 
-        bitreverse LD is computed as:
+        shifted LD is computed as:
         for i in range(VL):
-            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC
-
-        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
-        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11
+            EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC
 
-        and thus creates the butterfly needed for one iteration of FFT.
-        the RC (shift) is to be able to offset the LDs by Radix-2 spans
         """
         lst = SVP64Asm(["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0000",
                         "sv.stfs 4.v, 0(1)",  # scalar r1 + 0 + wordlen*offs
-                        "sv.lfsbr 12.v, 4(1), 2"]) # bit-reversed
+                        "sv.lfssh 12.v, 4(1), 2"]) # shifted (by zero, but hey)
         lst = list(lst)
 
         # SVSTATE (in this case, VL=4)
@@ -306,8 +277,8 @@ class DecoderTestCase(FHDLTestCase):
         # expected results, remember that bit-reversed load has been done
         expected_fprs = deepcopy(fprs)
         expected_fprs[12] = fprs[4] # 0b00 -> 0b00
-        expected_fprs[13] = fprs[6] # 0b01 -> 0b10
-        expected_fprs[14] = fprs[5] # 0b10 -> 0b01
+        expected_fprs[13] = fprs[5] # 0b01 -> 0b01
+        expected_fprs[14] = fprs[6] # 0b10 -> 0b10
         expected_fprs[15] = fprs[7] # 0b11 -> 0b11
 
         with Program(lst, bigendian=False) as program:
@@ -362,7 +333,7 @@ class DecoderTestCase(FHDLTestCase):
                         "svshape 3, 3, 4, 0, 0",
                         "svremap 1, 1, 2, 0, 0, 0, 0, 1",
                         "sv.lwz 20.v, 0(1)",
-                        #"sv.lwzbr 12.v, 4(1), 2", # bit-reversed
+                        #"sv.lwzsh 12.v, 4(1), 2", # shifted
                         ])
         lst = list(lst)
 
@@ -419,11 +390,11 @@ class DecoderTestCase(FHDLTestCase):
                         "sv.stw 5.v, 0(1)",
                         "svshape 8, 1, 1, 6, 0",
                         "svremap 31, 1, 2, 3, 0, 0, 0, 0",
-                        "sv.lwzbr 12.v, 4(1), 2"]
+                        "sv.lwzsh 12.v, 4(1), 2"]
 
-        bitreverse LD is computed as:
+        shifted LD is computed as:
         for i in range(VL):
-            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC
+            EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC
 
         bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
         produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11
@@ -448,7 +419,7 @@ class DecoderTestCase(FHDLTestCase):
                         "svshape 8, 1, 1, 6, 0",
                         "svremap 1, 0, 0, 0, 0, 0, 0, 1",
                         #"setvl 0, 0, 8, 0, 1, 1",
-                        "sv.lwzbr 12.v, 4(1), 2",  # bit-reversed
+                        "sv.lwzsh 12.v, 4(1), 2",  # bit-reversed
                         #"sv.lwz 12.v, 0(1)"
                         ])
         lst = list(lst)
@@ -506,11 +477,11 @@ class DecoderTestCase(FHDLTestCase):
                         "sv.stw 5.v, 0(1)",
                         "svshape 8, 1, 1, 6, 0",
                         "svremap 31, 1, 2, 3, 0, 0, 0, 0",
-                        "sv.lwzbr 12.v, 4(1), 2"]
+                        "sv.lwzsh 12.v, 4(1), 2"]
 
         bitreverse LD is computed as:
         for i in range(VL):
-            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC
+            EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC
 
         bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
         produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11
@@ -535,7 +506,7 @@ class DecoderTestCase(FHDLTestCase):
                         "svshape 8, 1, 1, 14, 0",
                         "svremap 16, 0, 0, 0, 0, 0, 0, 1",
                         #"setvl 0, 0, 8, 0, 1, 1",
-                        "sv.lwzbr 12.v, 4(1), 2",  # bit-reversed
+                        "sv.lwzsh 12.v, 4(1), 2",  # bit-reversed
                         #"sv.lwz 12.v, 0(1)"
                         ])
         lst = list(lst)
diff --git a/src/openpower/decoder/power_decoder2.py b/src/openpower/decoder/power_decoder2.py
index 8bc3ec6b..b20a014a 100644
--- a/src/openpower/decoder/power_decoder2.py
+++ b/src/openpower/decoder/power_decoder2.py
@@ -1008,7 +1008,7 @@ class PowerDecodeSubset(Elaboratable):
             with m.If(self.is_svp64_mode & is_major_ld):
                 # straight-up: "it's a LD".  this gives enough info
                 # for SVP64 RM Mode decoding to detect LD/ST, and
-                # consequently detect the BITREVERSE mode. sigh
+                # consequently detect the SHIFT mode. sigh
                 comb += rm_dec.fn_in.eq(Function.LDST)
             with m.Else():
                 comb += rm_dec.fn_in.eq(fn) # decode needs to know Fn type
@@ -1021,9 +1021,9 @@ class PowerDecodeSubset(Elaboratable):
 
             # main PowerDecoder2 determines if different SVP64 modes enabled
             if not self.final:
-                # if bit-reverse mode requested
-                bitrev = rm_dec.ldstmode == SVP64LDSTmode.BITREVERSE
-                comb += self.use_svp64_ldst_dec.eq(bitrev)
+                # if shift mode requested
+                shiftmode = rm_dec.ldstmode == SVP64LDSTmode.SHIFT
+                comb += self.use_svp64_ldst_dec.eq(shiftmode)
             # detect if SVP64 FFT mode enabled (really bad hack),
             # exclude fcfids and others
             # XXX this is a REALLY bad hack, REALLY has to be done better.
diff --git a/src/openpower/decoder/power_enums.py b/src/openpower/decoder/power_enums.py
index 4f999f73..aeca8efe 100644
--- a/src/openpower/decoder/power_enums.py
+++ b/src/openpower/decoder/power_enums.py
@@ -216,7 +216,7 @@ class SVP64LDSTmode(Enum):
     INDEXED = 1
     ELSTRIDE = 2
     UNITSTRIDE = 3
-    BITREVERSE = 4
+    SHIFT = 4
 
 
 # supported instructions: make sure to keep up-to-date with CSV files
diff --git a/src/openpower/decoder/power_svp64_rm.py b/src/openpower/decoder/power_svp64_rm.py
index c052924d..fe78f65a 100644
--- a/src/openpower/decoder/power_svp64_rm.py
+++ b/src/openpower/decoder/power_svp64_rm.py
@@ -182,9 +182,9 @@ class SVP64RMModeDecode(Elaboratable):
                     with m.If(self.rc_in):
                         comb += els.eq(mode[SVP64MODE.ELS_FFIRST_PRED])
 
-            # Bit-reversed Mode
-            with m.If(mode[SVP64MODE.LDST_BITREV]):
-                comb += self.ldstmode.eq(SVP64LDSTmode.BITREVERSE)
+            # Shifted Mode
+            with m.If(mode[SVP64MODE.LDST_SHIFT]):
+                comb += self.ldstmode.eq(SVP64LDSTmode.SHIFT)
             # RA is vectorised
             with m.Elif(self.ldst_ra_vec):
                 comb += self.ldstmode.eq(SVP64LDSTmode.INDEXED)
diff --git a/src/openpower/sv/trans/svp64.py b/src/openpower/sv/trans/svp64.py
index deed174a..2898a3d9 100644
--- a/src/openpower/sv/trans/svp64.py
+++ b/src/openpower/sv/trans/svp64.py
@@ -290,17 +290,17 @@ class SVP64Asm:
             v30b_op = v30b_op[:-1]
 
         # sigh again, have to recognised LD/ST bit-reverse instructions
-        # this has to be "processed" to fit into a v3.0B without the "br"
-        # e.g. ldbr is actually ld
-        ldst_bitreverse = v30b_op.startswith("l") and v30b_op.endswith("br")
+        # this has to be "processed" to fit into a v3.0B without the "sh"
+        # e.g. ldsh is actually ld
+        ldst_shift = v30b_op.startswith("l") and v30b_op.endswith("sh")
 
         if v30b_op not in isa.instr:
             raise Exception("opcode %s of '%s' not supported" % \
                             (v30b_op, insn))
 
-        if ldst_bitreverse:
+        if ldst_shift:
             # okaay we need to process the fields and make this:
-            #     ldbr RT, SVD(RA), RC  - 11 bits for SVD, 5 for RC
+            #     ldsh RT, SVD(RA), RC  - 11 bits for SVD, 5 for RC
             # into this:
             #     ld RT, D(RA)          - 16 bits
             # likewise same for SVDS (9 bits for SVDS, 5 for RC, 14 bits for DS)
@@ -329,9 +329,9 @@ class SVP64Asm:
             newfields[1] = "%d(%s)" % (immed, RA)
             fields = newfields
 
-            # and strip off "br" from end, and add "br" to opmodes, instead
+            # and strip off "sh" from end, and add "sh" to opmodes, instead
             v30b_op = v30b_op[:-2]
-            opmodes.append("br")
+            opmodes.append("sh")
             log ("rewritten", v30b_op, opmodes, fields)
 
         if v30b_op not in svp64.instrs:
@@ -630,9 +630,9 @@ class SVP64Asm:
                 smmode, smask = decode_predicate(encmode[3:])
                 mmode = smmode
                 has_smask = True
-            # bitreverse LD/ST
-            elif encmode.startswith("br"):
-                ldst_bitreverse = True
+            # shifted LD/ST
+            elif encmode.startswith("sh"):
+                ldst_shift = True
             # vec2/3/4
             elif encmode.startswith("vec"):
                 subvl = decode_subvl(encmode[3:])
@@ -730,10 +730,10 @@ class SVP64Asm:
             assert has_pmask or mask_m_specified, \
                 "dest zeroing requires a dest predicate"
 
-        # check LDST bitreverse, only available in "normal" mode
-        if is_ldst and ldst_bitreverse:
+        # check LDST shifted, only available in "normal" mode
+        if is_ldst and ldst_shift:
             assert sv_mode is None, \
-                "LD bit-reverse cannot have modes (%s) applied" % sv_mode
+                "LD shift cannot have modes (%s) applied" % sv_mode
 
         ######################################
         # "normal" mode
@@ -743,9 +743,9 @@ class SVP64Asm:
             if is_ldst:
                 # TODO: for now, LD/ST-indexed is ignored.
                 mode |= ldst_elstride << SVP64MODE.ELS_NORMAL # element-strided
-                # bitreverse mode
-                if ldst_bitreverse:
-                    mode |= 1 << SVP64MODE.LDST_BITREV
+                # shifted mode
+                if ldst_shift:
+                    mode |= 1 << SVP64MODE.LDST_SHIFT
             else:
                 # TODO, reduce and subvector mode
                 # 00  1   dz CRM  reduce mode (mapreduce), SUBVL=1
@@ -1089,8 +1089,8 @@ if __name__ == '__main__':
     lst = [
              'sv.addi win2.v, win.v, -1',
              'sv.add./mrr 5.v, 2.v, 1.v',
-             #'sv.lhzbr 5.v, 11(9.v), 15',
-             #'sv.lwzbr 5.v, 11(9.v), 15',
+             #'sv.lhzsh 5.v, 11(9.v), 15',
+             #'sv.lwzsh 5.v, 11(9.v), 15',
              'sv.ffmadds 6.v, 2.v, 4.v, 6.v',
     ]
     lst = [
@@ -1101,8 +1101,8 @@ if __name__ == '__main__':
              'svshape 8, 1, 1, 1, 1',
             ]
     lst = [
-             #'sv.lfsbr 4.v, 11(8.v), 15',
-             #'sv.lwzbr 4.v, 11(8.v), 15',
+             #'sv.lfssh 4.v, 11(8.v), 15',
+             #'sv.lwzsh 4.v, 11(8.v), 15',
              #'sv.svstep. 2.v, 4, 0',
              #'sv.fcfids. 48.v, 64.v',
              'sv.fcoss. 80.v, 0.v',
-- 
2.30.2