small inner DCT butterfly test, fix up order of fdmadds

author Luke Kenneth Casson Leighton <lkcl@lkcl.net>

Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)

committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>

Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)
author Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)
diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn

index c4eb7711e1fa1f4679ebaac32a62833898217862..0a735cb775f21e9ac11068b260cb04f0c8aeeffe 100644 (file)
--- a/openpower/isa/simplev.mdwn
+++ b/openpower/isa/simplev.mdwn
@@ -129,14 +129,15 @@ Pseudo-code:
          n <- ((0b0 || SVxd) + 1) * n
          vlen[0:6] <- n[1:7]
          # set up template in SVSHAPE0, then copy to 1-3
-        # for FRA and FRT
+        # set up FRB and FRS
          SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
          SVSHAPE0[30:31] <- 0b01          # Butterfly mode
          SVSHAPE0[18:20] <- 0b001         # DCT Inner Butterfly sub-mode
+        SVSHAPE0[21:23] <- 0b001         # "inverse" on outer loop
          # copy
          SVSHAPE1[0:31] <- SVSHAPE0[0:31]
-        # set up FRB and FRS
-        SVSHAPE1[28:29] <- 0b01           # j+halfstep schedule
+        # for FRA and FRT
+        SVSHAPE0[28:29] <- 0b01           # j+halfstep schedule
      # set VL, MVL and Vertical-First
      SVSTATE[0:6] <- vlen
      SVSTATE[7:13] <- vlen
diff --git a/openpower/isa/svfparith.mdwn b/openpower/isa/svfparith.mdwn

index 7cc02df83b80f033e94454107a717e183f5a477c..2bbf14295bcb8849e9749e626833c557f4cc31d8 100644 (file)
--- a/openpower/isa/svfparith.mdwn
+++ b/openpower/isa/svfparith.mdwn
@@ -168,9 +168,9 @@ A-Form
  
  Pseudo-code:
  
-    FRT <- FPADD32(FRA, FRB)
-    sub <- FPSUB32(FRB, FRA)
-    FRS <- FPMUL32(FRC, sub)
+    FRS <- FPADD32(FRA, FRB)
+    sub <- FPSUB32(FRA, FRB)
+    FRT <- FPMUL32(FRC, sub)
  
  Special Registers Altered:
  
diff --git a/src/openpower/decoder/isa/remap_dct_yield.py b/src/openpower/decoder/isa/remap_dct_yield.py

index d4a973be3d6455fbbb2ff6f45ed4ccf18833c520..091904ab9e2d583627db191501f9996a9ad21955 100644 (file)
--- a/src/openpower/decoder/isa/remap_dct_yield.py
+++ b/src/openpower/decoder/isa/remap_dct_yield.py
@@ -326,7 +326,7 @@ def transform2(vec):
          vec[jh] = (t1 - t2) * (1/coeff)
          print ("coeff", size, i, "ci", ci,
                  "jl", jl, "jh", jh,
-               "i/n", (ci+0.5)/size, coeff, vec[jl],
+               "i/n", (ci+0.5)/size, 1.0/coeff, vec[jl],
                                              vec[jh],
                  "end", bin(jle), bin(jhe))
          if jle == 0b111: # all loops end
@@ -339,7 +339,6 @@ def transform2(vec):
      # j schedule
      SVSHAPE0 = SVSHAPE()
      SVSHAPE0.lims = [xdim, ydim, zdim]
-    SVSHAPE0.order = [0,1,2]  # experiment with different permutations, here
      SVSHAPE0.submode2 = 0b10
      SVSHAPE0.mode = 0b01
      SVSHAPE0.skip = 0b00
@@ -348,7 +347,6 @@ def transform2(vec):
      # j+halfstep schedule
      SVSHAPE1 = SVSHAPE()
      SVSHAPE1.lims = [xdim, ydim, zdim]
-    SVSHAPE1.order = [0,1,2]  # experiment with different permutations, here
      SVSHAPE1.mode = 0b01
      SVSHAPE1.submode2 = 0b10
      SVSHAPE1.skip = 0b01
@@ -389,7 +387,7 @@ def demo():
      # j schedule
      SVSHAPE0 = SVSHAPE()
      SVSHAPE0.lims = [xdim, ydim, zdim]
-    SVSHAPE0.order = [0,1,2]  # experiment with different permutations, here
+    SVSHAPE0.submode2 = 0b010
      SVSHAPE0.mode = 0b01
      SVSHAPE0.skip = 0b00
      SVSHAPE0.offset = 0       # experiment with different offset, here
@@ -397,7 +395,7 @@ def demo():
      # j+halfstep schedule
      SVSHAPE1 = SVSHAPE()
      SVSHAPE1.lims = [xdim, ydim, zdim]
-    SVSHAPE1.order = [0,1,2]  # experiment with different permutations, here
+    SVSHAPE1.submode2 = 0b010
      SVSHAPE1.mode = 0b01
      SVSHAPE1.skip = 0b01
      SVSHAPE1.offset = 0       # experiment with different offset, here
@@ -424,7 +422,6 @@ def demo():
      # j schedule
      SVSHAPE0 = SVSHAPE()
      SVSHAPE0.lims = [xdim, ydim, zdim]
-    SVSHAPE0.order = [0,1,2]  # experiment with different permutations, here
      SVSHAPE0.mode = 0b10
      SVSHAPE0.submode2 = 0b100
      SVSHAPE0.skip = 0b10
@@ -433,7 +430,6 @@ def demo():
      # j+halfstep schedule
      SVSHAPE1 = SVSHAPE()
      SVSHAPE1.lims = [xdim, ydim, zdim]
-    SVSHAPE1.order = [0,1,2]  # experiment with different permutations, here
      SVSHAPE1.mode = 0b10
      SVSHAPE1.submode2 = 0b100
      SVSHAPE1.skip = 0b11
diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py

index d6c29edf0caa3debecc95e6c1b4a6b6505768665..53fd51fa23c806464c467e7b5e7f70bfec8422ad 100644 (file)
--- a/src/openpower/decoder/isa/test_caller_svp64_dct.py
+++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py
@@ -68,10 +68,11 @@ def transform_inner_radix2(vec, ctable):
          t1, t2 = vec[jl], vec[jh]
          coeff = ctable[k]
          vec[jl] = t1 + t2
-        vec[jh] = (t1 - t2) * (1/coeff)
+        vec[jh] = (t1 - t2) * (1.0/coeff)
          print ("coeff", "ci", k,
                  "jl", jl, "jh", jh,
-               "i/n", (k+0.5), coeff, vec[jl], vec[jh],
+               "i/n", (k+0.5), 1.0/coeff,
+                "t1, t2", t1, t2, "res", vec[jl], vec[jh],
                  "end", bin(jle), bin(jhe))
          if jle == 0b111: # all loops end
              break
@@ -85,7 +86,7 @@ class DCTTestCase(FHDLTestCase):
          for i in range(32):
              self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
  
-    def tst_sv_ffadds_dct(self):
+    def test_sv_ffadds_dct(self):
          """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                          ]
              four in-place vector adds, four in-place vector mul-subs
@@ -116,14 +117,14 @@ class DCTTestCase(FHDLTestCase):
              # this isn't quite a perfect replication of the
              # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
              # and FPSUB32 directly to be honest.
-            t = b + a
-            diff = (b - a)
+            t = a + b
+            diff = (a - b)
              diff = DOUBLE2SINGLE(fp64toselectable(diff)) # FP32 round
              diff = float(diff)
              u = diff * c
              tc = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
              uc = DOUBLE2SINGLE(fp64toselectable(u)) # from double
-            res.append((tc, uc))
+            res.append((uc, tc))
              print ("DCT", i, "in", a, b, "c", c, "res", t, u)
  
          # SVSTATE (in this case, VL=2)
@@ -146,20 +147,24 @@ class DCTTestCase(FHDLTestCase):
                  self.assertEqual(sim.fpr(i+0), t)
                  self.assertEqual(sim.fpr(i+4), u)
  
-    def test_sv_remap_fpmadds_dct(self):
+    def test_sv_remap_fpmadds_dct_4(self):
          """>>> lst = ["svshape 4, 1, 1, 2, 0",
-                     "svremap 31, 1, 0, 2, 0, 1, 0",
+                     "svremap 27, 1, 0, 2, 0, 1, 0",
                          "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                       ]
-            runs a full in-place O(N log2 N) butterfly schedule for
-            DCT
+            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
+            for DCT
  
              SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
              (3 inputs, 2 outputs)
+
+            Note that the coefficient (FRC) is not on a "schedule", it
+            is straight Vectorised (0123...) because DCT coefficients
+            cannot be shared between butterfly layers (due to +0.5)
          """
          lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0",
-                         "svremap 31, 1, 0, 2, 0, 1, 0",
-                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
+                         "svremap 27, 1, 0, 2, 0, 1, 0",
+                         "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                          ])
          lst = list(lst)
  
@@ -178,7 +183,7 @@ class DCTTestCase(FHDLTestCase):
          # store in regfile
          fprs = [0] * 32
          for i, c in enumerate(coe):
-            fprs[i+8] = fp64toselectable(c)
+            fprs[i+8] = fp64toselectable(1.0 / c) # invert
          for i, a in enumerate(av):
              fprs[i+0] = fp64toselectable(a)
  
@@ -207,7 +212,7 @@ class DCTTestCase(FHDLTestCase):
                  # and the rounding is different
                  err = abs((actual - expected) / expected)
                  print ("err", i, err)
-                self.assertTrue(err < 1e-7)
+                self.assertTrue(err < 1e-6)
  
      def run_tst_program(self, prog, initial_regs=None,
                                svstate=None,
author	Luke Kenneth Casson Leighton <lkcl@lkcl.net>
	Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)
committer	Luke Kenneth Casson Leighton <lkcl@lkcl.net>
	Fri, 23 Jul 2021 15:57:31 +0000 (16:57 +0100)
openpower/isa/simplev.mdwn		patch | blob | history
openpower/isa/svfparith.mdwn		patch | blob | history
src/openpower/decoder/isa/remap_dct_yield.py		patch | blob | history
src/openpower/decoder/isa/test_caller_svp64_dct.py		patch | blob | history