From 48ac3d31bec17633e16486a0c2d4e66865b00890 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Wed, 21 Sep 2022 18:18:14 +0100 Subject: [PATCH] scale-up svshape pseudo-code for striding in DCT/FFT --- openpower/isa/simplev.mdwn | 13 +++++++++++-- .../decoder/isa/test_caller_svp64_dct.py | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn index e96acbf1..ab7af687 100644 --- a/openpower/isa/simplev.mdwn +++ b/openpower/isa/simplev.mdwn @@ -105,6 +105,7 @@ Pseudo-code: # for convenience, VL to be calculated and stored in SVSTATE vlen <- [0] * 7 + mscale[0:6] <- 0b0000001 # for scaling MAXVL itercount[0:6] <- [0] * 7 SVSTATE[0:31] <- [0] * 32 # only overwrite REMAP if "persistence" is zero @@ -156,6 +157,7 @@ Pseudo-code: # for FRA and FRT SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D FFT) + mscale <- (0b0 || SVzd) + 1 SVSHAPE0[30:31] <- 0b01 # Butterfly mode # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] @@ -181,6 +183,7 @@ Pseudo-code: # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) + mscale <- (0b0 || SVzd) + 1 if (SVrm = 0b1010) | (SVrm = 0b1100) then SVSHAPE0[30:31] <- 0b11 # iDCT mode SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode @@ -223,6 +226,7 @@ Pseudo-code: # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) + mscale <- (0b0 || SVzd) + 1 if (SVrm = 0b1011) then SVSHAPE0[30:31] <- 0b11 # iDCT mode SVSHAPE0[18:20] <- 0b011 # iDCT Outer Butterfly sub-mode @@ -253,6 +257,7 @@ Pseudo-code: # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) + mscale <- (0b0 || SVzd) + 1 SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode SVSHAPE0[6:11] <- 0b000100 # DCT Inner Butterfly COS-gen mode if (SVrm = 0b0101) then @@ 
-269,6 +274,7 @@ Pseudo-code: # set up template in SVSHAPE0 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) + mscale <- (0b0 || SVzd) + 1 if (SVrm = 0b1110) then SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap if (SVrm = 0b1111) then @@ -295,14 +301,17 @@ Pseudo-code: # set up template in SVSHAPE0, then copy to 1. only 2 needed SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) + mscale <- (0b0 || SVzd) + 1 SVSHAPE0[30:31] <- 0b10 # parallel reduce submode # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] # set up right operand (left operand 28:29 is zero) SVSHAPE1[28:29] <- 0b01 # right operand # set VL, MVL and Vertical-First - SVSTATE[0:6] <- vlen - SVSTATE[7:13] <- vlen + m[0:12] <- vlen * mscale + maxvl[0:6] <- m[6:12] + SVSTATE[0:6] <- vlen # VL + SVSTATE[7:13] <- maxvl # MAXVL SVSTATE[63] <- vf Special Registers Altered: diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py index f2abe0c6..436755e4 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_dct.py +++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py @@ -313,10 +313,10 @@ class DCTTestCase(FHDLTestCase): self.assertEqual(sim.fpr(i+0), t) self.assertEqual(sim.fpr(i+4), u) - def test_sv_remap_fpmadds_dct_inner_4(self): + def test_sv_remap_fpmadds_dct_inner_4(self, stride=1): """>>> lst = ["svshape 4, 1, 1, 2, 0", "svremap 27, 1, 0, 2, 0, 1, 0", - "sv.fdmadds *0, *0, *0, *8" + "sv.fdmadds *0, *0, *0, *32" ] runs a full in-place 4-long O(N log2 N) inner butterfly schedule for DCT @@ -328,9 +328,9 @@ class DCTTestCase(FHDLTestCase): is straight Vectorised (0123...) 
because DCT coefficients cannot be shared between butterfly layers (due to +0.5) """ - lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0", + lst = SVP64Asm( ["svshape 4, 1, %d, 2, 0" % stride, "svremap 27, 1, 0, 2, 0, 1, 0", - "sv.fdmadds *0, *0, *0, *8" + "sv.fdmadds *0, *0, *0, *32" ]) lst = list(lst) @@ -347,11 +347,11 @@ class DCTTestCase(FHDLTestCase): av = [av[ri[i]] for i in range(n)] # store in regfile - fprs = [0] * 32 + fprs = [0] * 64 for i, c in enumerate(coe): - fprs[i+8] = fp64toselectable(1.0 / c) # invert + fprs[i*stride+32] = fp64toselectable(1.0 / c) # invert for i, a in enumerate(av): - fprs[i+0] = fp64toselectable(a) + fprs[i*stride+0] = fp64toselectable(a) with Program(lst, bigendian=False) as program: sim = self.run_tst_program(program, initial_fprs=fprs) @@ -367,12 +367,13 @@ class DCTTestCase(FHDLTestCase): res = transform_inner_radix2_dct(avi, coe) for i, expected in enumerate(res): - print ("i", i, float(sim.fpr(i)), "expected", expected) + print ("i", i*stride, float(sim.fpr(i*stride)), + "expected", expected) for i, expected in enumerate(res): # convert to Power single expected = fph.DOUBLE2SINGLE(fp64toselectable(expected)) expected = float(expected) - actual = float(sim.fpr(i)) + actual = float(sim.fpr(i*stride)) # approximate error calculation, good enough test # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB # and the rounding is different -- 2.30.2