From 650653b92c343ca21e57129c2daac153f94df0cb Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Wed, 21 Sep 2022 20:17:41 +0100 Subject: [PATCH] do not set striding on costables, keep them contiguous. not totally sure this is a good idea, but hey --- openpower/isa/simplev.mdwn | 11 ++++--- src/openpower/decoder/isa/remap_dct_yield.py | 4 ++- .../decoder/isa/test_caller_svp64_dct.py | 29 ++++++++++++------- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn index ab7af687..3a8387c0 100644 --- a/openpower/isa/simplev.mdwn +++ b/openpower/isa/simplev.mdwn @@ -204,6 +204,7 @@ Pseudo-code: SVSHAPE0[28:29] <- 0b01 # j+halfstep schedule # for cos coefficient SVSHAPE2[28:29] <- 0b10 # ci (k for mode 4) schedule + SVSHAPE2[12:17] <- 0b000000 # reset costable "striding" to 1 if (SVrm != 0b0100) & (SVrm != 0b1100) then SVSHAPE3[28:29] <- 0b11 # size schedule # set schedule up for (i)DCT Outer butterfly @@ -236,10 +237,12 @@ Pseudo-code: SVSHAPE0[18:20] <- 0b100 # DCT Outer Butterfly sub-mode SVSHAPE0[6:11] <- 0b000010 # DCT Butterfly mode # copy - SVSHAPE1[0:31] <- SVSHAPE0[0:31] - SVSHAPE2[0:31] <- SVSHAPE0[0:31] + SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule + SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients # for FRA and FRT SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule + # reset costable "striding" to 1 + SVSHAPE2[12:17] <- 0b000000 # set schedule up for DCT COS table generation if (SVrm = 0b0101) | (SVrm = 0b1101) then # calculate O(N log2 N) @@ -310,8 +313,8 @@ Pseudo-code: # set VL, MVL and Vertical-First m[0:12] <- vlen * mscale maxvl[0:6] <- m[6:12] - SVSTATE[0:6] <- vlen # VL - SVSTATE[7:13] <- maxvl # MAVXL + SVSTATE[0:6] <- maxvl # MAXVL + SVSTATE[7:13] <- vlen # VL SVSTATE[63] <- vf Special Registers Altered: diff --git a/src/openpower/decoder/isa/remap_dct_yield.py b/src/openpower/decoder/isa/remap_dct_yield.py index 66cddffb..6dcc2217 100644 --- 
a/src/openpower/decoder/isa/remap_dct_yield.py +++ b/src/openpower/decoder/isa/remap_dct_yield.py @@ -275,7 +275,9 @@ def iterate_dct_outer_butterfly_indices(SVSHAPE): if len(x_r) == 0: return - print ("outer butterfly", mode, SVSHAPE.skip, "submode", SVSHAPE.submode2) + print ("outer butterfly", mode, SVSHAPE.skip, + "submode", SVSHAPE.submode2, + "stride", stride) # I-DCT, reference (read/write) the in-place data in *reverse-bit-order* ri = list(range(n)) diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py index 436755e4..3e47a652 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_dct.py +++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py @@ -313,7 +313,13 @@ class DCTTestCase(FHDLTestCase): self.assertEqual(sim.fpr(i+0), t) self.assertEqual(sim.fpr(i+4), u) - def test_sv_remap_fpmadds_dct_inner_4(self, stride=1): + def test_sv_remap_fpmadds_dct_inner_4_stride_2(self): + self.sv_remap_fpmadds_dct_inner_4(stride=2) + + def test_sv_remap_fpmadds_dct_inner_4_stride_1(self): + self.sv_remap_fpmadds_dct_inner_4(stride=1) + + def sv_remap_fpmadds_dct_inner_4(self, stride=2): """>>> lst = ["svshape 4, 1, 1, 2, 0", "svremap 27, 1, 0, 2, 0, 1, 0", "sv.fdmadds *0, *0, *0, *32" @@ -330,7 +336,7 @@ class DCTTestCase(FHDLTestCase): """ lst = SVP64Asm( ["svshape 4, 1, %d, 2, 0" % stride, "svremap 27, 1, 0, 2, 0, 1, 0", - "sv.fdmadds *0, *0, *0, *32" + "sv.fdmadds *0, *0, *0, *16" ]) lst = list(lst) @@ -349,7 +355,7 @@ class DCTTestCase(FHDLTestCase): # store in regfile fprs = [0] * 64 for i, c in enumerate(coe): - fprs[i*stride+32] = fp64toselectable(1.0 / c) # invert + fprs[i+16] = fp64toselectable(1.0 / c) # invert for i, a in enumerate(av): fprs[i*stride+0] = fp64toselectable(a) @@ -381,7 +387,7 @@ class DCTTestCase(FHDLTestCase): print ("err", i, err) self.assertTrue(err < 1e-6) - def test_sv_remap_fpmadds_idct_inner_4(self): + def test_sv_remap_fpmadds_idct_inner_4(self, stride=2): """>>> lst = 
["svshape 4, 1, 1, 10, 0", "svremap 27, 0, 1, 2, 1, 0, 0", "sv.ffmadds *0, *0, *0, *8" @@ -396,9 +402,9 @@ class DCTTestCase(FHDLTestCase): is straight Vectorised (0123...) because DCT coefficients cannot be shared between butterfly layers (due to +0.5) """ - lst = SVP64Asm( ["svshape 4, 1, 1, 10, 0", + lst = SVP64Asm( ["svshape 4, 1, %d, 10, 0" % stride, "svremap 27, 0, 1, 2, 1, 0, 0", - "sv.ffmadds *0, *0, *0, *8" + "sv.ffmadds *0, *0, *0, *16" ]) lst = list(lst) @@ -410,11 +416,11 @@ class DCTTestCase(FHDLTestCase): av = halfrev2(avi, False) # store in regfile - fprs = [0] * 32 + fprs = [0] * 64 for i, c in enumerate(coe): - fprs[i+8] = fp64toselectable(1.0 / c) # invert + fprs[i+16] = fp64toselectable(1.0 / c) # invert for i, a in enumerate(av): - fprs[i+0] = fp64toselectable(a) + fprs[i*stride+0] = fp64toselectable(a) with Program(lst, bigendian=False) as program: sim = self.run_tst_program(program, initial_fprs=fprs) @@ -430,12 +436,13 @@ class DCTTestCase(FHDLTestCase): res = transform_inner_radix2_idct(avi, coe) for i, expected in enumerate(res): - print ("i", i, float(sim.fpr(i)), "expected", expected) + print ("i", i*stride, float(sim.fpr(i*stride)), + "expected", expected) for i, expected in enumerate(res): # convert to Power single expected = fph.DOUBLE2SINGLE(fp64toselectable(expected)) expected = float(expected) - actual = float(sim.fpr(i)) + actual = float(sim.fpr(i*stride)) # approximate error calculation, good enough test # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB # and the rounding is different -- 2.30.2