From a77af7ef383640c695cc435124b855540e0f0ba1 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Wed, 7 Jul 2021 14:07:27 +0100
Subject: [PATCH] add in DFT test from Project Nayuki to verify results of
 butterfly schedule

---
 .../decoder/isa/test_caller_svp64_fft.py      | 187 +++++++++++-------
 1 file changed, 112 insertions(+), 75 deletions(-)

diff --git a/src/openpower/decoder/isa/test_caller_svp64_fft.py b/src/openpower/decoder/isa/test_caller_svp64_fft.py
index fe7c7f6d..23bd38e5 100644
--- a/src/openpower/decoder/isa/test_caller_svp64_fft.py
+++ b/src/openpower/decoder/isa/test_caller_svp64_fft.py
@@ -17,71 +17,65 @@ from copy import deepcopy
 from openpower.decoder.helpers import fp64toselectable
 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
 
+
+def transform_radix2(vec, exptable):
+    """
+    # FFT and convolution test (Python), based on Project Nayuki
+    #
+    # Copyright (c) 2020 Project Nayuki. (MIT License)
+    # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+
+    """
+    # bits of the integer 'val'.
+    def reverse_bits(val, width):
+        result = 0
+        for _ in range(width):
+            result = (result << 1) | (val & 1)
+            val >>= 1
+        return result
+
+    # Initialization
+    n = len(vec)
+    levels = n.bit_length() - 1
+
+    # Copy with bit-reversed permutation
+    #vec = [vec[reverse_bits(i, levels)] for i in range(n)]
+
+    size = 2
+    while size <= n:
+        halfsize = size // 2
+        tablestep = n // size
+        for i in range(0, n, size):
+            k = 0
+            for j in range(i, i + halfsize):
+                # exact same actual computation, just embedded in
+                # triple-nested for-loops
+                jl, jh = j, j+halfsize
+                temp1 = vec[jh] * exptable[k]
+                temp2 = vec[jl]
+                vec[jh] = temp2 - temp1
+                vec[jl] = temp2 + temp1
+                print ("transform_radix2 jl jh k", jl, jh, k,
+                       "temp1, temp2", temp1, temp2,
+                       "v[jh] v[jl]", vec[jh], vec[jl])
+                k += tablestep
+        size *= 2
+
+    return vec
+
+
 class DecoderTestCase(FHDLTestCase):
 
     def _check_regs(self, sim, expected):
         for i in range(32):
             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 
-    def tst_sv_fpmadds_fft(self):
-        """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
-                        ]
-            four in-place vector mul-adds, four in-place vector mul-subs
-
-            this is the twin "butterfly" mul-add-sub from Cooley-Tukey
-            https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
-
-            there is the *option* to target a different location (non-in-place)
-            just in case.
-
-            SVP64 "FFT" mode will *automatically* offset FRB and an implicit
-            FRS to perform the two multiplies.  one add, one subtract.
-
-            sv.ffmadds FRT, FRA, FRC, FRB  actually does:
-                fmadds  FRT   , FRA, FRC, FRA
-                fnmsubs FRT+vl, FRA, FRC, FRB+vl
-        """
-        lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
-                        ])
-        lst = list(lst)
-
-        fprs = [0] * 32
-        av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
-        bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
-        coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
-        res = []
-        # work out the results with the twin mul/add-sub
-        for i, (a, b, c) in enumerate(zip(av, bv, coe)):
-            fprs[i+2] = fp64toselectable(a)
-            fprs[i+6] = fp64toselectable(b)
-            fprs[i+10] = fp64toselectable(c)
-            mul = a * c
-            t = a + mul
-            u = b - mul
-            t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
-            u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
-            res.append((t, u))
-            print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
-
-        # SVSTATE (in this case, VL=2)
-        svstate = SVP64State()
-        svstate.vl[0:7] = 4 # VL
-        svstate.maxvl[0:7] = 4 # MAXVL
-        print ("SVSTATE", bin(svstate.spr.asint()))
-
-        with Program(lst, bigendian=False) as program:
-            sim = self.run_tst_program(program, svstate=svstate,
-                                       initial_fprs=fprs)
-            # confirm that the results are as expected
-            for i, (t, u) in enumerate(res):
-                self.assertEqual(sim.fpr(i+2), t)
-                self.assertEqual(sim.fpr(i+6), u)
-
     def test_sv_remap_fpmadds_fft(self):
         """>>> lst = ["svremap 8, 1, 1, 1",
                       "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
                      ]
-            four in-place vector mul-adds, four in-place vector mul-subs
+            runs a full in-place O(N log2 N) butterfly schedule for
+            Discrete Fourier Transform.
 
             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
@@ -89,38 +83,28 @@ class DecoderTestCase(FHDLTestCase):
             there is the *option* to target a different location (non-in-place)
             just in case.
 
-            SVP64 "FFT" mode will *automatically* offset FRB and an implicit
-            FRS to perform the two multiplies.  one add, one subtract.
-
-            sv.ffmadds FRT, FRA, FRC, FRB  actually does:
-                fmadds  FRT   , FRA, FRC, FRA
-                fnmsubs FRT+vl, FRA, FRC, FRB+vl
+            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
+            (3 inputs, 2 outputs)
         """
         lst = SVP64Asm( ["svremap 8, 1, 1, 1",
                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
                         ])
         lst = list(lst)
 
-        fprs = [0] * 32
+        # array and coefficients to test
         av = [7.0, -9.8, 3.0, -32.3,
               -2.0, 5.0, -9.8, 31.3] # array 0..7
         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
+
         # store in regfile
+        fprs = [0] * 32
         for i, c in enumerate(coe):
             fprs[i+8] = fp64toselectable(c)
         for i, a in enumerate(av):
             fprs[i+0] = fp64toselectable(a)
+
         # work out the results with the twin mul/add-sub
-        res = []
-        for i, (a, c) in enumerate(zip(av, coe)):
-            continue
-            mul = a * c
-            t = a + mul
-            u = b - mul
-            t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
-            u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
-            res.append((t, u))
-            print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
+        res = transform_radix2(av, coe)
 
         # set total. err don't know how to calculate how many there are...
         # do it manually for now
@@ -151,10 +135,63 @@ class DecoderTestCase(FHDLTestCase):
             print ("spr svshape1", sim.spr['SVSHAPE1'])
             print ("spr svshape2", sim.spr['SVSHAPE2'])
             print ("spr svshape3", sim.spr['SVSHAPE3'])
-            for i in range(8):
-                print ("i", i, float(sim.fpr(i)))
+            for i, expected in enumerate(res):
+                print ("i", i, float(sim.fpr(i)), "expected", expected)
+            for i, expected in enumerate(res):
+                # convert to Power single
+                expected = DOUBLE2SINGLE(fp64toselectable(expected))
+                self.assertEqual(sim.fpr(i), expected)
+
+
+    def test_sv_fpmadds_fft(self):
+        """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
+                        ]
+            four in-place vector mul-adds, four in-place vector mul-subs
+
+            this is the twin "butterfly" mul-add-sub from Cooley-Tukey
+            https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
+
+            there is the *option* to target a different location (non-in-place)
+            just in case.
+
+            SVP64 "FFT" mode will *automatically* offset FRB and an implicit
+            FRS to perform the two multiplies.  one add, one subtract.
+
+            sv.ffmadds FRT, FRA, FRC, FRB  actually does:
+                fmadds  FRT   , FRA, FRC, FRA
+                fnmsubs FRT+vl, FRA, FRC, FRB+vl
+        """
+        lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
+                        ])
+        lst = list(lst)
 
-            return
+        fprs = [0] * 32
+        av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
+        bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
+        coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
+        res = []
+        # work out the results with the twin mul/add-sub
+        for i, (a, b, c) in enumerate(zip(av, bv, coe)):
+            fprs[i+2] = fp64toselectable(a)
+            fprs[i+6] = fp64toselectable(b)
+            fprs[i+10] = fp64toselectable(c)
+            mul = a * c
+            t = a + mul
+            u = b - mul
+            t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
+            u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
+            res.append((t, u))
+            print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
+
+        # SVSTATE (in this case, VL=2)
+        svstate = SVP64State()
+        svstate.vl[0:7] = 4 # VL
+        svstate.maxvl[0:7] = 4 # MAXVL
+        print ("SVSTATE", bin(svstate.spr.asint()))
+
+        with Program(lst, bigendian=False) as program:
+            sim = self.run_tst_program(program, svstate=svstate,
+                                       initial_fprs=fprs)
             # confirm that the results are as expected
             for i, (t, u) in enumerate(res):
                 self.assertEqual(sim.fpr(i+2), t)
-- 
2.30.2