small inner DCT butterfly test, fix up order of fdmadds
author: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)
committer: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)
openpower/isa/simplev.mdwn
openpower/isa/svfparith.mdwn
src/openpower/decoder/isa/remap_dct_yield.py
src/openpower/decoder/isa/test_caller_svp64_dct.py

index c4eb7711e1fa1f4679ebaac32a62833898217862..0a735cb775f21e9ac11068b260cb04f0c8aeeffe 100644 (file)
@@ -129,14 +129,15 @@ Pseudo-code:
         n <- ((0b0 || SVxd) + 1) * n
         vlen[0:6] <- n[1:7]
         # set up template in SVSHAPE0, then copy to 1-3
-        # for FRA and FRT
+        # set up FRB and FRS
         SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
         SVSHAPE0[30:31] <- 0b01          # Butterfly mode
         SVSHAPE0[18:20] <- 0b001         # DCT Inner Butterfly sub-mode
+        SVSHAPE0[21:23] <- 0b001         # "inverse" on outer loop
         # copy
         SVSHAPE1[0:31] <- SVSHAPE0[0:31]
-        # set up FRB and FRS
-        SVSHAPE1[28:29] <- 0b01           # j+halfstep schedule
+        # for FRA and FRT
+        SVSHAPE0[28:29] <- 0b01           # j+halfstep schedule
     # set VL, MVL and Vertical-First
     SVSTATE[0:6] <- vlen
     SVSTATE[7:13] <- vlen
index 7cc02df83b80f033e94454107a717e183f5a477c..2bbf14295bcb8849e9749e626833c557f4cc31d8 100644 (file)
@@ -168,9 +168,9 @@ A-Form
 
 Pseudo-code:
 
-    FRT <- FPADD32(FRA, FRB)
-    sub <- FPSUB32(FRB, FRA)
-    FRS <- FPMUL32(FRC, sub)
+    FRS <- FPADD32(FRA, FRB)
+    sub <- FPSUB32(FRA, FRB)
+    FRT <- FPMUL32(FRC, sub)
 
 Special Registers Altered:
 
index d4a973be3d6455fbbb2ff6f45ed4ccf18833c520..091904ab9e2d583627db191501f9996a9ad21955 100644 (file)
@@ -326,7 +326,7 @@ def transform2(vec):
         vec[jh] = (t1 - t2) * (1/coeff)
         print ("coeff", size, i, "ci", ci,
                 "jl", jl, "jh", jh,
-               "i/n", (ci+0.5)/size, coeff, vec[jl],
+               "i/n", (ci+0.5)/size, 1.0/coeff, vec[jl],
                                             vec[jh],
                 "end", bin(jle), bin(jhe))
         if jle == 0b111: # all loops end
@@ -339,7 +339,6 @@ def transform2(vec):
     # j schedule
     SVSHAPE0 = SVSHAPE()
     SVSHAPE0.lims = [xdim, ydim, zdim]
-    SVSHAPE0.order = [0,1,2]  # experiment with different permutations, here
     SVSHAPE0.submode2 = 0b10
     SVSHAPE0.mode = 0b01
     SVSHAPE0.skip = 0b00
@@ -348,7 +347,6 @@ def transform2(vec):
     # j+halfstep schedule
     SVSHAPE1 = SVSHAPE()
     SVSHAPE1.lims = [xdim, ydim, zdim]
-    SVSHAPE1.order = [0,1,2]  # experiment with different permutations, here
     SVSHAPE1.mode = 0b01
     SVSHAPE1.submode2 = 0b10
     SVSHAPE1.skip = 0b01
@@ -389,7 +387,7 @@ def demo():
     # j schedule
     SVSHAPE0 = SVSHAPE()
     SVSHAPE0.lims = [xdim, ydim, zdim]
-    SVSHAPE0.order = [0,1,2]  # experiment with different permutations, here
+    SVSHAPE0.submode2 = 0b010
     SVSHAPE0.mode = 0b01
     SVSHAPE0.skip = 0b00
     SVSHAPE0.offset = 0       # experiment with different offset, here
@@ -397,7 +395,7 @@ def demo():
     # j+halfstep schedule
     SVSHAPE1 = SVSHAPE()
     SVSHAPE1.lims = [xdim, ydim, zdim]
-    SVSHAPE1.order = [0,1,2]  # experiment with different permutations, here
+    SVSHAPE1.submode2 = 0b010
     SVSHAPE1.mode = 0b01
     SVSHAPE1.skip = 0b01
     SVSHAPE1.offset = 0       # experiment with different offset, here
@@ -424,7 +422,6 @@ def demo():
     # j schedule
     SVSHAPE0 = SVSHAPE()
     SVSHAPE0.lims = [xdim, ydim, zdim]
-    SVSHAPE0.order = [0,1,2]  # experiment with different permutations, here
     SVSHAPE0.mode = 0b10
     SVSHAPE0.submode2 = 0b100
     SVSHAPE0.skip = 0b10
@@ -433,7 +430,6 @@ def demo():
     # j+halfstep schedule
     SVSHAPE1 = SVSHAPE()
     SVSHAPE1.lims = [xdim, ydim, zdim]
-    SVSHAPE1.order = [0,1,2]  # experiment with different permutations, here
     SVSHAPE1.mode = 0b10
     SVSHAPE1.submode2 = 0b100
     SVSHAPE1.skip = 0b11
index d6c29edf0caa3debecc95e6c1b4a6b6505768665..53fd51fa23c806464c467e7b5e7f70bfec8422ad 100644 (file)
@@ -68,10 +68,11 @@ def transform_inner_radix2(vec, ctable):
         t1, t2 = vec[jl], vec[jh]
         coeff = ctable[k]
         vec[jl] = t1 + t2
-        vec[jh] = (t1 - t2) * (1/coeff)
+        vec[jh] = (t1 - t2) * (1.0/coeff)
         print ("coeff", "ci", k,
                 "jl", jl, "jh", jh,
-               "i/n", (k+0.5), coeff, vec[jl], vec[jh],
+               "i/n", (k+0.5), 1.0/coeff,
+                "t1, t2", t1, t2, "res", vec[jl], vec[jh],
                 "end", bin(jle), bin(jhe))
         if jle == 0b111: # all loops end
             break
@@ -85,7 +86,7 @@ class DCTTestCase(FHDLTestCase):
         for i in range(32):
             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 
-    def tst_sv_ffadds_dct(self):
+    def test_sv_ffadds_dct(self):
         """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                         ]
             four in-place vector adds, four in-place vector mul-subs
@@ -116,14 +117,14 @@ class DCTTestCase(FHDLTestCase):
             # this isn't quite a perfect replication of the
             # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
             # and FPSUB32 directly to be honest.
-            t = b + a
-            diff = (b - a)
+            t = a + b
+            diff = (a - b)
             diff = DOUBLE2SINGLE(fp64toselectable(diff)) # FP32 round
             diff = float(diff)
             u = diff * c
             tc = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
             uc = DOUBLE2SINGLE(fp64toselectable(u)) # from double
-            res.append((tc, uc))
+            res.append((uc, tc))
             print ("DCT", i, "in", a, b, "c", c, "res", t, u)
 
         # SVSTATE (in this case, VL=2)
@@ -146,20 +147,24 @@ class DCTTestCase(FHDLTestCase):
                 self.assertEqual(sim.fpr(i+0), t)
                 self.assertEqual(sim.fpr(i+4), u)
 
-    def test_sv_remap_fpmadds_dct(self):
+    def test_sv_remap_fpmadds_dct_4(self):
         """>>> lst = ["svshape 4, 1, 1, 2, 0",
-                     "svremap 31, 1, 0, 2, 0, 1, 0",
+                     "svremap 27, 1, 0, 2, 0, 1, 0",
                         "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                      ]
-            runs a full in-place O(N log2 N) butterfly schedule for
-            DCT
+            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
+            for DCT
 
             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
             (3 inputs, 2 outputs)
+
+            Note that the coefficient (FRC) is not on a "schedule", it
+            is straight Vectorised (0123...) because DCT coefficients
+            cannot be shared between butterfly layers (due to +0.5)
         """
         lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0",
-                         "svremap 31, 1, 0, 2, 0, 1, 0",
-                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
+                         "svremap 27, 1, 0, 2, 0, 1, 0",
+                         "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                         ])
         lst = list(lst)
 
@@ -178,7 +183,7 @@ class DCTTestCase(FHDLTestCase):
         # store in regfile
         fprs = [0] * 32
         for i, c in enumerate(coe):
-            fprs[i+8] = fp64toselectable(c)
+            fprs[i+8] = fp64toselectable(1.0 / c) # invert
         for i, a in enumerate(av):
             fprs[i+0] = fp64toselectable(a)
 
@@ -207,7 +212,7 @@ class DCTTestCase(FHDLTestCase):
                 # and the rounding is different
                 err = abs((actual - expected) / expected)
                 print ("err", i, err)
-                self.assertTrue(err < 1e-7)
+                self.assertTrue(err < 1e-6)
 
     def run_tst_program(self, prog, initial_regs=None,
                               svstate=None,