adding reduced COS table DCT test
author Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 27 Jul 2021 15:40:38 +0000 (16:40 +0100)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 27 Jul 2021 15:40:38 +0000 (16:40 +0100)
openpower/isa/simplev.mdwn
src/openpower/decoder/isa/remap_dct_yield.py
src/openpower/decoder/isa/test_caller_svp64_dct.py

index fd0c242a57b07a2eed36b5f7ffc423a536231917..7d70691bd4ff595f7e072d021a4f80c47b79c009 100644 (file)
@@ -164,8 +164,10 @@ Pseudo-code:
         # set up FRB and FRS
         SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
         SVSHAPE0[30:31] <- 0b01          # DCT/FFT mode
-        #if (SVRM = 0b0100) then
-        SVSHAPE0[6:11] <- 0b000001       # DCT Inner Butterfly mode
+        if (SVRM = 0b0100) then
+            SVSHAPE0[6:11] <- 0b000011       # DCT Inner Butterfly mode 4
+        else
+            SVSHAPE0[6:11] <- 0b000001       # DCT Inner Butterfly mode 2
         SVSHAPE0[18:20] <- 0b001         # DCT Inner Butterfly sub-mode
         SVSHAPE0[21:23] <- 0b001         # "inverse" on outer loop
         # copy
index 344d3404419e2d55b20eebfcca03d815d23775cd..3b2bf64c276beaf278c8afeaae78f4b7f9ca020e 100644 (file)
@@ -108,7 +108,7 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE):
     # get indices to iterate over, in the required order
     n = SVSHAPE.lims[0]
     mode = SVSHAPE.lims[1]
-    #print ("inner butterfly", mode)
+    print ("inner butterfly", mode)
     # creating lists of indices to iterate over in each dimension
     # has to be done dynamically, because it depends on the size
     # first, the size-based loop (which can be done statically)
@@ -145,6 +145,8 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE):
 
     # start an infinite (wrapping) loop
     skip = 0
+    k = 0
+    k_start = 0
     while True:
         for size in x_r:           # loop over 3rd order dimension (size)
             x_end = size == x_r[-1]
@@ -171,22 +173,30 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE):
                     jr = j_r[:hz2]
                 #print ("xform jr", jr)
                 # loop over 1st order dimension
+                k = k_start
                 for ci, (jl, jh) in enumerate(zip(j, jr)):
                     z_end = jl == j[-1]
                     # now depending on MODE return the index.  inner butterfly
                     if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
                         result = ri[ji[jl]]        # lower half
                     elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]:
-                        result = ri[ji[jh]] # upper half, reverse order
-                    elif SVSHAPE.skip == 0b10: #
-                        result = ci # coefficient helper
-                    elif SVSHAPE.skip == 0b11: #
-                        result = size # coefficient helper
+                        result = ri[ji[jh]] # upper half
+                    elif mode == 4:
+                        # COS table pre-generated mode
+                        if SVSHAPE.skip == 0b10: #
+                            result = k # cos table offset
+                    else: # mode 2
+                        # COS table generated on-demand ("Vertical-First") mode
+                        if SVSHAPE.skip == 0b10: #
+                            result = ci # coefficient helper
+                        elif SVSHAPE.skip == 0b11: #
+                            result = size # coefficient helper
                     loopends = (z_end |
                                ((y_end and z_end)<<1) |
                                 ((y_end and x_end and z_end)<<2))
 
                     yield result + SVSHAPE.offset, loopends
+                    k += 1
 
                 # now in-place swap
                 if inplace_mode:
@@ -196,6 +206,9 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE):
                         tmp1, tmp2 = ji[jlh], ji[jh]
                         ji[jlh], ji[jh] = tmp2, tmp1
 
+            # new k_start point for cos tables (runs inside x_r loop, NOT i loop)
+            k_start += halfsize
+
 
 # python "yield" can be iterated. use this to make it clear how
 # the indices are generated by using natural-looking nested loops
index 4b7c62b55bf884c934f7d31f06fc99164eb05cfe..49fc8df7413395e06ca1973b92a4bcfb53f34d53 100644 (file)
@@ -457,7 +457,8 @@ class DCTTestCase(FHDLTestCase):
 
     def test_sv_remap_dct_cos_precompute_inner_8(self):
         """pre-computes a DCT COS table, using the shorter costable
-        indices schedule
+        indices schedule.  turns out, some COS values are repeated
+        in each layer of the DCT butterfly.
 
         the simpler (scalar) version is in test_caller_transcendentals.py
         (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
@@ -530,6 +531,75 @@ class DCTTestCase(FHDLTestCase):
                                         "err", err)
                 self.assertTrue(err < 1e-6)
 
+    def test_sv_remap_fpmadds_dct_8_mode_4(self):
+        """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
+                      "svshape 8, 1, 1, 4, 0",
+                      "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
+                      "svshape 8, 1, 1, 3, 0",
+                      "sv.fadds 0.v, 0.v, 0.v"
+                     ]
+            runs a full in-place 8-long O(N log2 N) DCT, both
+            inner and outer butterfly "REMAP" schedules.
+            uses shorter tables: FRC also needs to be on a Schedule
+        """
+        lst = SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
+                         "svshape 8, 1, 1, 4, 0",
+                         "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
+                         "svshape 8, 1, 1, 3, 0",
+                         "sv.fadds 0.v, 0.v, 0.v"
+                        ])
+        lst = list(lst)
+
+        # array and coefficients to test
+        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
+        n = len(avi)
+        levels = n.bit_length() - 1
+        ri = list(range(n))
+        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
+        av = halfrev2(avi, False)
+        av = [av[ri[i]] for i in range(n)]
+        ctable = []
+        size = n
+        while size >= 2:
+            halfsize = size // 2
+            for ci in range(halfsize):
+                ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
+            size //= 2
+
+        # store in regfile
+        fprs = [0] * 32
+        for i, a in enumerate(av):
+            fprs[i+0] = fp64toselectable(a)
+        for i, c in enumerate(ctable):
+            fprs[i+8] = fp64toselectable(1.0 / c) # invert
+
+        with Program(lst, bigendian=False) as program:
+            sim = self.run_tst_program(program, initial_fprs=fprs)
+            print ("spr svshape0", sim.spr['SVSHAPE0'])
+            print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
+            print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
+            print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
+            print ("spr svshape1", sim.spr['SVSHAPE1'])
+            print ("spr svshape2", sim.spr['SVSHAPE2'])
+            print ("spr svshape3", sim.spr['SVSHAPE3'])
+
+            # outer iterative sum
+            res = transform2(avi)
+
+            for i, expected in enumerate(res):
+                print ("i", i, float(sim.fpr(i)), "expected", expected)
+            for i, expected in enumerate(res):
+                # convert to Power single
+                expected = DOUBLE2SINGLE(fp64toselectable(expected))
+                expected = float(expected)
+                actual = float(sim.fpr(i))
+                # approximate error calculation, good enough test
+                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
+                # and the rounding is different
+                err = abs((actual - expected) / expected)
+                print ("err", i, err)
+                self.assertTrue(err < 1e-5)
+
     def run_tst_program(self, prog, initial_regs=None,
                               svstate=None,
                               initial_mem=None,