From 17f19a80183d260238e2cf4ba9b78ccac9fe5807 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Wed, 29 Mar 2023 10:08:56 +0100
Subject: [PATCH] remove redundant DCT/iDCT modes which require less-efficient
 cos tables. it turns out that values are often repeated, so why waste
 space, especially when the svshape instruction is under pressure? this
 goes into https://libre-soc.org/openpower/sv/rfc/ls009/

---
 openpower/isa/simplev.mdwn                    |  12 +-
 .../decoder/isa/test_caller_svp64_dct.py      | 372 +-----------------
 2 files changed, 5 insertions(+), 379 deletions(-)

diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn
index b22007dc..c350b3cf 100644
--- a/openpower/isa/simplev.mdwn
+++ b/openpower/isa/simplev.mdwn
@@ -164,10 +164,9 @@ Pseudo-code:
         # FRC (coefficients)
         SVSHAPE2[28:29] <- 0b10           # k schedule
     # set schedule up for (i)DCT Inner butterfly
-    # SVrm Mode 2 (Mode 6 for iDCT) is for pre-calculated coefficients,
     # SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode)
-    if ((SVrm = 0b0010) | (SVrm = 0b0100) |
-        (SVrm = 0b1010) | (SVrm = 0b1100)) then
+    if ((SVrm = 0b0100) |
+        (SVrm = 0b1100)) then
         # calculate O(N log2 N)
         n <- [0] * 3
         do while n < 5
@@ -181,17 +180,14 @@ Pseudo-code:
         SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
         SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
         mscale <- (0b0 || SVzd) + 1
-        if (SVrm = 0b1010) | (SVrm = 0b1100) then
+        if (SVrm = 0b1100) then
             SVSHAPE0[30:31] <- 0b11          # iDCT mode
             SVSHAPE0[18:20] <- 0b011         # iDCT Inner Butterfly sub-mode
         else
             SVSHAPE0[30:31] <- 0b01          # DCT mode
             SVSHAPE0[18:20] <- 0b001         # DCT Inner Butterfly sub-mode
             SVSHAPE0[21:23] <- 0b001         # "inverse" on outer loop
-        if (SVrm = 0b1100) | (SVrm = 0b0100) then
-            SVSHAPE0[6:11] <- 0b000011       # (i)DCT Inner Butterfly mode 4
-        else
-            SVSHAPE0[6:11] <- 0b000001       # (i)DCT Inner Butterfly mode 2
+        SVSHAPE0[6:11] <- 0b000011       # (i)DCT Inner Butterfly mode 4
         # copy
         SVSHAPE1[0:31] <- SVSHAPE0[0:31]
         SVSHAPE2[0:31] <- SVSHAPE0[0:31]
diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py
index 35e74226..78e47529 100644
--- a/src/openpower/decoder/isa/test_caller_svp64_dct.py
+++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py
@@ -308,149 +308,6 @@ class DCTTestCase(FHDLTestCase):
                 self.assertEqual(sim.fpr(i+0), t)
                 self.assertEqual(sim.fpr(i+4), u)
 
-    def test_sv_remap_fpmadds_dct_inner_4_stride_1(self):
-        self.sv_remap_fpmadds_dct_inner_4(stride=2)
-
-    def test_sv_remap_fpmadds_dct_inner_4_stride_1(self):
-        self.sv_remap_fpmadds_dct_inner_4(stride=1)
-
-    def sv_remap_fpmadds_dct_inner_4(self, stride=2):
-        """>>> lst = ["svshape 4, 1, 1, 2, 0",
-                     "svremap 27, 1, 0, 2, 0, 1, 0",
-                        "sv.fdmadds *0, *0, *0, *32"
-                     ]
-            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
-            for DCT
-
-            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
-            (3 inputs, 2 outputs)
-
-            Note that the coefficient (FRC) is not on a "schedule", it
-            is straight Vectorised (0123...) because DCT coefficients
-            cannot be shared between butterfly layers (due to +0.5)
-        """
-        lst = SVP64Asm(["svshape 4, 1, %d, 2, 0" % stride,
-                        "svremap 27, 1, 0, 2, 0, 1, 0",
-                        "sv.fdmadds *0, *0, *0, *16"
-                        ])
-        lst = list(lst)
-
-        # array and coefficients to test
-        n = 4
-        av = [7.0, -9.8, 3.0, -32.3]
-        coe = [-0.25, 0.5, 3.1, 6.2]  # 4 coefficients
-
-        levels = n.bit_length() - 1
-        ri = list(range(n))
-        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
-        avi = [7.0, -0.8, 2.0, -2.3]  # first half of array 0..3
-        av = halfrev2(avi, False)
-        av = [av[ri[i]] for i in range(n)]
-
-        # store in regfile
-        fprs = [0] * 64
-        for i, c in enumerate(coe):
-            fprs[i+16] = fp64toselectable(1.0 / c)  # invert
-        for i, a in enumerate(av):
-            fprs[i*stride+0] = fp64toselectable(a)
-
-        with Program(lst, bigendian=False) as program:
-            sim = self.run_tst_program(program, initial_fprs=fprs)
-            print("spr svshape0", sim.spr['SVSHAPE0'])
-            print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
-            print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
-            print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
-            print("spr svshape1", sim.spr['SVSHAPE1'])
-            print("spr svshape2", sim.spr['SVSHAPE2'])
-            print("spr svshape3", sim.spr['SVSHAPE3'])
-
-            # work out the results with the twin mul/add-sub
-            res = transform_inner_radix2_dct(avi, coe)
-
-            for i, expected in enumerate(res):
-                print("i", i*stride, float(sim.fpr(i*stride)),
-                      "expected", expected)
-            for i, expected in enumerate(res):
-                # convert to Power single
-                expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
-                expected = float(expected)
-                actual = float(sim.fpr(i*stride))
-                # approximate error calculation, good enough test
-                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
-                # and the rounding is different
-                err = abs((actual - expected) / expected)
-                print("err", i, err)
-                self.assertTrue(err < 1e-6)
-
-    def test_sv_remap_fpmadds_idct_inner_4_stride_1(self):
-        self.sv_remap_fpmadds_idct_inner_4(stride=2)
-
-    def test_sv_remap_fpmadds_idct_inner_4_stride_1(self):
-        self.sv_remap_fpmadds_idct_inner_4(stride=1)
-
-    def sv_remap_fpmadds_idct_inner_4(self, stride=2):
-        """>>> lst = ["svshape 4, 1, 1, 10, 0",
-                      "svremap 27, 0, 1, 2, 1, 0, 0",
-                      "sv.ffmadds *0, *0, *0, *8"
-                     ]
-            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
-            for inverse-DCT
-
-            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
-            (3 inputs, 2 outputs)
-
-            Note that the coefficient (FRC) is not on a "schedule", it
-            is straight Vectorised (0123...) because DCT coefficients
-            cannot be shared between butterfly layers (due to +0.5)
-        """
-        lst = SVP64Asm(["svshape 4, 1, %d, 10, 0" % stride,
-                        "svremap 27, 0, 1, 2, 1, 0, 0",
-                        "sv.ffmadds *0, *0, *0, *16"
-                        ])
-        lst = list(lst)
-
-        # array and coefficients to test
-        n = 4
-        levels = n.bit_length() - 1
-        coe = [-0.25, 0.5, 3.1, 6.2]  # 4 coefficients
-        avi = [7.0, -0.8, 2.0, -2.3]  # first half of array 0..3
-        av = halfrev2(avi, False)
-
-        # store in regfile
-        fprs = [0] * 64
-        for i, c in enumerate(coe):
-            fprs[i+16] = fp64toselectable(1.0 / c)  # invert
-        for i, a in enumerate(av):
-            fprs[i*stride+0] = fp64toselectable(a)
-
-        with Program(lst, bigendian=False) as program:
-            sim = self.run_tst_program(program, initial_fprs=fprs)
-            print("spr svshape0", sim.spr['SVSHAPE0'])
-            print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
-            print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
-            print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
-            print("spr svshape1", sim.spr['SVSHAPE1'])
-            print("spr svshape2", sim.spr['SVSHAPE2'])
-            print("spr svshape3", sim.spr['SVSHAPE3'])
-
-            # work out the results with the twin mul/add-sub
-            res = transform_inner_radix2_idct(avi, coe)
-
-            for i, expected in enumerate(res):
-                print("i", i*stride, float(sim.fpr(i*stride)),
-                      "expected", expected)
-            for i, expected in enumerate(res):
-                # convert to Power single
-                expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
-                expected = float(expected)
-                actual = float(sim.fpr(i*stride))
-                # approximate error calculation, good enough test
-                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
-                # and the rounding is different
-                err = abs((actual - expected) / expected)
-                print("err", i, err)
-                self.assertTrue(err < 1e-6)
-
     def test_sv_remap_fpmadds_idct_outer_8(self, stride=2):
         """>>> lst = ["svshape 8, 1, 1, 11, 0",
                      "svremap 27, 0, 1, 2, 1, 0, 0",
@@ -562,233 +419,6 @@ class DCTTestCase(FHDLTestCase):
                 print("err", i, err)
                 self.assertTrue(err < 1e-6)
 
-    def test_sv_remap_fpmadds_idct_8(self, stride=2):
-        """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
-                         "svshape 8, 1, 1, 11, 0",
-                         "sv.fadds *0, *0, *0",
-                         "svshape 8, 1, 1, 10, 0",
-                         "sv.ffmadds *0, *0, *0, *16"
-                     ]
-            runs a full in-place 8-long O(N log2 N) inverse-DCT, both
-            inner and outer butterfly "REMAP" schedules.
-        """
-        lst = SVP64Asm(["svremap 27, 0, 1, 2, 1, 0, 1",
-                        "svshape 8, 1, %d, 11, 0" % stride,
-                        "sv.fadds *0, *0, *0",
-                        "svshape 8, 1, %d, 10, 0" % stride,
-                        "sv.ffmadds *0, *0, *0, *16"
-                        ])
-        lst = list(lst)
-
-        # array and coefficients to test
-        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
-        n = len(avi)
-        levels = n.bit_length() - 1
-        ri = list(range(n))
-        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
-        av = [avi[ri[i]] for i in range(n)]
-        av = halfrev2(av, True)
-
-        # divide first value by 2.0, manually.  rev and halfrev should
-        # not have moved it
-        av[0] /= 2.0
-        #avi[0] /= 2.0
-
-        print("input data pre idct", av)
-
-        ctable = []
-        size = 2
-        while size <= n:
-            halfsize = size // 2
-            for i in range(n//size):
-                for ci in range(halfsize):
-                    ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
-            size *= 2
-
-        # store in regfile
-        fprs = [0] * 32
-        for i, a in enumerate(av):
-            fprs[i*stride+0] = fp64toselectable(a)
-        for i, c in enumerate(ctable):
-            fprs[i+16] = fp64toselectable(1.0 / c)  # invert
-
-        with Program(lst, bigendian=False) as program:
-            sim = self.run_tst_program(program, initial_fprs=fprs)
-            print("spr svshape0", sim.spr['SVSHAPE0'])
-            print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
-            print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
-            print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
-            print("spr svshape1", sim.spr['SVSHAPE1'])
-            print("spr svshape2", sim.spr['SVSHAPE2'])
-            print("spr svshape3", sim.spr['SVSHAPE3'])
-
-            # inverse DCT
-            expected = [-15.793373940443367, 27.46969091937703,
-                        -24.712331606496313, 27.03601462756265]
-
-            #res = inverse_transform_iter(avi)
-            res = inverse_transform2(avi)
-            #res = transform_outer_radix2_idct(avi)
-
-            for i, expected in enumerate(res):
-                print("i", i*stride, float(sim.fpr(i*stride)),
-                      "expected", expected)
-            for i, expected in enumerate(res):
-                # convert to Power single
-                expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
-                expected = float(expected)
-                actual = float(sim.fpr(i*stride))
-                # approximate error calculation, good enough test
-                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
-                # and the rounding is different
-                err = abs((actual - expected) / expected)
-                print("err", i*stride, err)
-                self.assertTrue(err < 1e-5)
-
-    def test_sv_remap_fpmadds_dct_8(self, stride=2):
-        """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
-                      "svshape 8, 1, 1, 2, 0",
-                      "sv.fdmadds *0, *0, *0, *8"
-                      "svshape 8, 1, 1, 3, 0",
-                      "sv.fadds *0, *0, *0"
-                     ]
-            runs a full in-place 8-long O(N log2 N) DCT, both
-            inner and outer butterfly "REMAP" schedules.
-        """
-        lst = SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
-                        "svshape 8, 1, %d, 2, 0" % stride,
-                        "sv.fdmadds *0, *0, *0, *16",
-                        "svshape 8, 1, %d, 3, 0" % stride,
-                        "sv.fadds *0, *0, *0"
-                        ])
-        lst = list(lst)
-
-        # array and coefficients to test
-        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
-        n = len(avi)
-        levels = n.bit_length() - 1
-        ri = list(range(n))
-        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
-        av = halfrev2(avi, False)
-        av = [av[ri[i]] for i in range(n)]
-        ctable = []
-        size = n
-        while size >= 2:
-            halfsize = size // 2
-            for i in range(n//size):
-                for ci in range(halfsize):
-                    ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
-            size //= 2
-
-        # store in regfile
-        fprs = [0] * 32
-        for i, a in enumerate(av):
-            fprs[i*stride+0] = fp64toselectable(a)
-        for i, c in enumerate(ctable):
-            fprs[i+16] = fp64toselectable(1.0 / c)  # invert
-
-        with Program(lst, bigendian=False) as program:
-            sim = self.run_tst_program(program, initial_fprs=fprs)
-            print("spr svshape0", sim.spr['SVSHAPE0'])
-            print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
-            print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
-            print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
-            print("spr svshape1", sim.spr['SVSHAPE1'])
-            print("spr svshape2", sim.spr['SVSHAPE2'])
-            print("spr svshape3", sim.spr['SVSHAPE3'])
-
-            # outer iterative sum
-            res = transform2(avi)
-
-            for i, expected in enumerate(res):
-                print("i", i*stride, float(sim.fpr(i*stride)),
-                      "expected", expected)
-            for i, expected in enumerate(res):
-                # convert to Power single
-                expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
-                expected = float(expected)
-                actual = float(sim.fpr(i*stride))
-                # approximate error calculation, good enough test
-                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
-                # and the rounding is different
-                err = abs((actual - expected) / expected)
-                print("err", i, err)
-                self.assertTrue(err < 1e-5)
-
-    def test_sv_remap_dct_cos_precompute_8(self):
-        """pre-computes a DCT COS table, deliberately using a lot of
-        registers so as to be able to see what is going on (dumping all
-        regs after the run).
-
-        the simpler (scalar) version is in test_caller_transcendentals.py
-        (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
-        need the new version of fcfids which doesn't spam memory with
-        LD/STs.
-        """
-        lst = SVP64Asm(["svshape 8, 1, 1, 2, 0",
-                        "svremap 0, 0, 0, 2, 0, 1, 1",
-                        "sv.svstep *4, 4, 1",  # svstep get vector of ci
-                        "sv.svstep *16, 3, 1",  # svstep get vector of step
-                        "addi 1, 0, 0x0000",
-                        "setvl 0, 0, 12, 0, 1, 1",
-                        "sv.std *4, 0(1)",
-                        "sv.lfd  *64, 0(1)",
-                        "sv.fcfids *48, *64",
-                        "addi 1, 0, 0x0060",
-                        "sv.std *16, 0(1)",
-                        "sv.lfd  *12, 0(1)",
-                        "sv.fcfids *24, *12",
-                        "sv.fadds *0, *24, 43",  # plus 0.5
-                        "sv.fmuls *0, *0, 41",  # times PI
-                        "sv.fdivs *0, *0, *48",  # div size
-                        "sv.fcoss *80, *0",
-                        "sv.fdivs *80, 43, *80",  # div 0.5 / x
-                        ])
-        lst = list(lst)
-
-        gprs = [0] * 32
-        fprs = [0] * 128
-        # constants
-        fprs[43] = fp64toselectable(0.5)         # 0.5
-        fprs[41] = fp64toselectable(math.pi)  # pi
-        fprs[44] = fp64toselectable(2.0)     # 2.0
-
-        n = 8
-
-        ctable = []
-        size = n
-        while size >= 2:
-            halfsize = size // 2
-            for i in range(n//size):
-                for ci in range(halfsize):
-                    ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
-            size //= 2
-
-        with Program(lst, bigendian=False) as program:
-            sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
-            print("MEM")
-            sim.mem.dump()
-            print("ci FP")
-            for i in range(len(ctable)):
-                actual = float(sim.fpr(i+24))
-                print("i", i, actual)
-            print("size FP")
-            for i in range(len(ctable)):
-                actual = float(sim.fpr(i+48))
-                print("i", i, actual)
-            print("temps")
-            for i in range(len(ctable)):
-                actual = float(sim.fpr(i))
-                print("i", i, actual)
-            for i in range(len(ctable)):
-                expected = 1.0/ctable[i]
-                actual = float(sim.fpr(i+80))
-                err = abs((actual - expected) / expected)
-                print("i", i, actual, "1/expect", 1/expected,
-                      "expected", expected,
-                      "err", err)
-                self.assertTrue(err < 1e-6)
-
     def test_sv_remap_dct_cos_precompute_inner_8(self):
         """pre-computes a DCT COS table, using the shorter costable
         indices schedule.  turns out, some COS values are repeated
@@ -1037,7 +667,7 @@ class DCTTestCase(FHDLTestCase):
                       "svshape 8, 1, 1, 11, 0",
                       "sv.fadds *0, *0, *0",
                       # Inner butterfly, twin +/- MUL-ADD-SUB
-                      "svshape 8, 1, 1, 10, 0",
+                      "svshape 8, 1, 1, 12, 0",
                       "sv.ffmadds *0, *0, *0, *8"
                      ]
             runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
-- 
2.30.2