do not set striding on costables, keep them contiguous.
author: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Wed, 21 Sep 2022 19:17:41 +0000 (20:17 +0100)
committer: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Wed, 21 Sep 2022 19:17:44 +0000 (20:17 +0100)
not totally sure this is a good idea, but hey

openpower/isa/simplev.mdwn
src/openpower/decoder/isa/remap_dct_yield.py
src/openpower/decoder/isa/test_caller_svp64_dct.py

index ab7af687c7e5ee41b4567329dccdc973f1f5b6d2..3a8387c0ae06938f3cc047dfc8896e54cf822128 100644 (file)
@@ -204,6 +204,7 @@ Pseudo-code:
         SVSHAPE0[28:29] <- 0b01           # j+halfstep schedule
         # for cos coefficient
         SVSHAPE2[28:29] <- 0b10           # ci (k for mode 4) schedule
+        SVSHAPE2[12:17] <- 0b000000       # reset costable "striding" to 1
         if (SVrm != 0b0100) & (SVrm != 0b1100) then
             SVSHAPE3[28:29] <- 0b11           # size schedule
     # set schedule up for (i)DCT Outer butterfly
@@ -236,10 +237,12 @@ Pseudo-code:
             SVSHAPE0[18:20] <- 0b100     # DCT Outer Butterfly sub-mode
         SVSHAPE0[6:11] <- 0b000010       # DCT Butterfly mode
         # copy
-        SVSHAPE1[0:31] <- SVSHAPE0[0:31]
-        SVSHAPE2[0:31] <- SVSHAPE0[0:31]
+        SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule
+        SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients
         # for FRA and FRT
         SVSHAPE1[28:29] <- 0b01           # j+halfstep schedule
+        # reset costable "striding" to 1
+        SVSHAPE2[12:17] <- 0b000000
     # set schedule up for DCT COS table generation
     if (SVrm = 0b0101) | (SVrm = 0b1101) then
         # calculate O(N log2 N)
@@ -310,8 +313,8 @@ Pseudo-code:
     # set VL, MVL and Vertical-First
     m[0:12] <- vlen * mscale
     maxvl[0:6] <- m[6:12]
-    SVSTATE[0:6] <- vlen    # VL
-    SVSTATE[7:13] <- maxvl  # MAVXL
+    SVSTATE[0:6] <- maxvl  # MAXVL
+    SVSTATE[7:13] <- vlen  # VL
     SVSTATE[63] <- vf
 
 Special Registers Altered:
index 66cddffbf078e00823300176edfb378c546c9cb5..6dcc2217305c4fa18350c071ab3e895d3ab670cc 100644 (file)
@@ -275,7 +275,9 @@ def iterate_dct_outer_butterfly_indices(SVSHAPE):
     if len(x_r) == 0:
         return
 
-    print ("outer butterfly", mode, SVSHAPE.skip, "submode", SVSHAPE.submode2)
+    print ("outer butterfly", mode, SVSHAPE.skip,
+           "submode", SVSHAPE.submode2,
+           "stride", stride)
 
     # I-DCT, reference (read/write) the in-place data in *reverse-bit-order*
     ri = list(range(n))
index 436755e4a27a0ef81f37cee42874f8d311055fb4..3e47a6528d9867f7177f3d7cffa7ccb8f2acf9c6 100644 (file)
@@ -313,7 +313,13 @@ class DCTTestCase(FHDLTestCase):
                 self.assertEqual(sim.fpr(i+0), t)
                 self.assertEqual(sim.fpr(i+4), u)
 
-    def test_sv_remap_fpmadds_dct_inner_4(self, stride=1):
+    def test_sv_remap_fpmadds_dct_inner_4_stride_2(self):
+        self.sv_remap_fpmadds_dct_inner_4(stride=2)  # inputs in every 2nd FPR

+    def test_sv_remap_fpmadds_dct_inner_4_stride_1(self):
+        self.sv_remap_fpmadds_dct_inner_4(stride=1)  # inputs in adjacent FPRs
+
+    def sv_remap_fpmadds_dct_inner_4(self, stride=2):
         """>>> lst = ["svshape 4, 1, 1, 2, 0",
                      "svremap 27, 1, 0, 2, 0, 1, 0",
                         "sv.fdmadds *0, *0, *0, *32"
@@ -330,7 +336,7 @@ class DCTTestCase(FHDLTestCase):
         """
         lst = SVP64Asm( ["svshape 4, 1, %d, 2, 0" % stride,
                          "svremap 27, 1, 0, 2, 0, 1, 0",
-                         "sv.fdmadds *0, *0, *0, *32"
+                         "sv.fdmadds *0, *0, *0, *16"
                         ])
         lst = list(lst)
 
@@ -349,7 +355,7 @@ class DCTTestCase(FHDLTestCase):
         # store in regfile
         fprs = [0] * 64
         for i, c in enumerate(coe):
-            fprs[i*stride+32] = fp64toselectable(1.0 / c) # invert
+            fprs[i+16] = fp64toselectable(1.0 / c) # invert
         for i, a in enumerate(av):
             fprs[i*stride+0] = fp64toselectable(a)
 
@@ -381,7 +387,7 @@ class DCTTestCase(FHDLTestCase):
                 print ("err", i, err)
                 self.assertTrue(err < 1e-6)
 
-    def test_sv_remap_fpmadds_idct_inner_4(self):
+    def test_sv_remap_fpmadds_idct_inner_4(self, stride=2):
         """>>> lst = ["svshape 4, 1, 1, 10, 0",
                       "svremap 27, 0, 1, 2, 1, 0, 0",
                       "sv.ffmadds *0, *0, *0, *8"
@@ -396,9 +402,9 @@ class DCTTestCase(FHDLTestCase):
             is straight Vectorised (0123...) because DCT coefficients
             cannot be shared between butterfly layers (due to +0.5)
         """
-        lst = SVP64Asm( ["svshape 4, 1, 1, 10, 0",
+        lst = SVP64Asm( ["svshape 4, 1, %d, 10, 0" % stride,
                          "svremap 27, 0, 1, 2, 1, 0, 0",
-                         "sv.ffmadds *0, *0, *0, *8"
+                         "sv.ffmadds *0, *0, *0, *16"
                         ])
         lst = list(lst)
 
@@ -410,11 +416,11 @@ class DCTTestCase(FHDLTestCase):
         av = halfrev2(avi, False)
 
         # store in regfile
-        fprs = [0] * 32
+        fprs = [0] * 64
         for i, c in enumerate(coe):
-            fprs[i+8] = fp64toselectable(1.0 / c) # invert
+            fprs[i+16] = fp64toselectable(1.0 / c) # invert
         for i, a in enumerate(av):
-            fprs[i+0] = fp64toselectable(a)
+            fprs[i*stride+0] = fp64toselectable(a)
 
         with Program(lst, bigendian=False) as program:
             sim = self.run_tst_program(program, initial_fprs=fprs)
@@ -430,12 +436,13 @@ class DCTTestCase(FHDLTestCase):
             res = transform_inner_radix2_idct(avi, coe)
 
             for i, expected in enumerate(res):
-                print ("i", i, float(sim.fpr(i)), "expected", expected)
+                print ("i", i*stride, float(sim.fpr(i*stride)),
+                       "expected", expected)
             for i, expected in enumerate(res):
                 # convert to Power single
                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
                 expected = float(expected)
-                actual = float(sim.fpr(i))
+                actual = float(sim.fpr(i*stride))
                 # approximate error calculation, good enough test
                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
                 # and the rounding is different