From: Luke Kenneth Casson Leighton Date: Fri, 23 Jul 2021 15:57:31 +0000 (+0100) Subject: small inner DCT butterfly test, fix up order of fdmadds X-Git-Tag: xlen-bcd~222 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6392bcdf5da183c6794b74954f35b7377b86fd89;p=openpower-isa.git small inner DCT butterfly test, fix up order of fdmadds --- diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn index c4eb7711..0a735cb7 100644 --- a/openpower/isa/simplev.mdwn +++ b/openpower/isa/simplev.mdwn @@ -129,14 +129,15 @@ Pseudo-code: n <- ((0b0 || SVxd) + 1) * n vlen[0:6] <- n[1:7] # set up template in SVSHAPE0, then copy to 1-3 - # for FRA and FRT + # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[30:31] <- 0b01 # Butterfly mode SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode + SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] - # set up FRB and FRS - SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule + # for FRA and FRT + SVSHAPE0[28:29] <- 0b01 # j+halfstep schedule # set VL, MVL and Vertical-First SVSTATE[0:6] <- vlen SVSTATE[7:13] <- vlen diff --git a/openpower/isa/svfparith.mdwn b/openpower/isa/svfparith.mdwn index 7cc02df8..2bbf1429 100644 --- a/openpower/isa/svfparith.mdwn +++ b/openpower/isa/svfparith.mdwn @@ -168,9 +168,9 @@ A-Form Pseudo-code: - FRT <- FPADD32(FRA, FRB) - sub <- FPSUB32(FRB, FRA) - FRS <- FPMUL32(FRC, sub) + FRS <- FPADD32(FRA, FRB) + sub <- FPSUB32(FRA, FRB) + FRT <- FPMUL32(FRC, sub) Special Registers Altered: diff --git a/src/openpower/decoder/isa/remap_dct_yield.py b/src/openpower/decoder/isa/remap_dct_yield.py index d4a973be..091904ab 100644 --- a/src/openpower/decoder/isa/remap_dct_yield.py +++ b/src/openpower/decoder/isa/remap_dct_yield.py @@ -326,7 +326,7 @@ def transform2(vec): vec[jh] = (t1 - t2) * (1/coeff) print ("coeff", size, i, "ci", ci, "jl", jl, "jh", jh, - "i/n", (ci+0.5)/size, coeff, vec[jl], + "i/n", (ci+0.5)/size, 1.0/coeff, vec[jl], vec[jh], "end", bin(jle), bin(jhe)) if jle == 0b111: # all loops end @@ -339,7 +339,6 @@ def transform2(vec): # j schedule SVSHAPE0 = SVSHAPE() SVSHAPE0.lims = [xdim, ydim, zdim] - SVSHAPE0.order = [0,1,2] # experiment with different permutations, here SVSHAPE0.submode2 = 0b10 SVSHAPE0.mode = 0b01 SVSHAPE0.skip = 0b00 @@ -348,7 +347,6 @@ def transform2(vec): # j+halfstep schedule SVSHAPE1 = SVSHAPE() SVSHAPE1.lims = [xdim, ydim, zdim] - SVSHAPE1.order = [0,1,2] # experiment with different permutations, here SVSHAPE1.mode = 0b01 SVSHAPE1.submode2 = 0b10 SVSHAPE1.skip = 0b01 @@ -389,7 +387,7 @@ def demo(): # j schedule SVSHAPE0 = SVSHAPE() SVSHAPE0.lims = [xdim, ydim, zdim] - SVSHAPE0.order = [0,1,2] # experiment with different permutations, here + SVSHAPE0.submode2 = 0b010 SVSHAPE0.mode = 0b01 SVSHAPE0.skip = 0b00 SVSHAPE0.offset = 0 # experiment with different offset, here @@ -397,7 +395,7 @@ def demo(): # j+halfstep schedule SVSHAPE1 = SVSHAPE() SVSHAPE1.lims = [xdim, ydim, zdim] - SVSHAPE1.order = [0,1,2] # experiment with different permutations, here + SVSHAPE1.submode2 = 0b010 SVSHAPE1.mode = 0b01 SVSHAPE1.skip = 0b01 SVSHAPE1.offset = 0 # experiment with different offset, here @@ -424,7 +422,6 @@ def demo(): # j schedule SVSHAPE0 = SVSHAPE() SVSHAPE0.lims = [xdim, ydim, zdim] - SVSHAPE0.order = [0,1,2] # experiment with different permutations, here SVSHAPE0.mode = 0b10 SVSHAPE0.submode2 = 0b100 SVSHAPE0.skip = 0b10 @@ -433,7 +430,6 @@ def demo(): # j+halfstep schedule SVSHAPE1 = SVSHAPE() SVSHAPE1.lims = [xdim, ydim, zdim] - SVSHAPE1.order = [0,1,2] # experiment with different permutations, here SVSHAPE1.mode = 0b10 SVSHAPE1.submode2 = 0b100 SVSHAPE1.skip = 0b11 diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py index d6c29edf..53fd51fa 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_dct.py +++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py @@ -68,10 +68,11 @@ def transform_inner_radix2(vec, ctable): t1, t2 = vec[jl], vec[jh] coeff = ctable[k] vec[jl] = t1 + t2 - vec[jh] = (t1 - t2) * (1/coeff) + vec[jh] = (t1 - t2) * (1.0/coeff) print ("coeff", "ci", k, "jl", jl, "jh", jh, - "i/n", (k+0.5), coeff, vec[jl], vec[jh], + "i/n", (k+0.5), 1.0/coeff, + "t1, t2", t1, t2, "res", vec[jl], vec[jh], "end", bin(jle), bin(jhe)) if jle == 0b111: # all loops end break @@ -85,7 +86,7 @@ class DCTTestCase(FHDLTestCase): for i in range(32): self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64)) - def tst_sv_ffadds_dct(self): + def test_sv_ffadds_dct(self): """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v" ] four in-place vector adds, four in-place vector mul-subs @@ -116,14 +117,14 @@ class DCTTestCase(FHDLTestCase): # this isn't quite a perfect replication of the # FP32 mul-add-sub. better really to use FPMUL32, FPADD32 # and FPSUB32 directly to be honest. - t = b + a - diff = (b - a) + t = a + b + diff = (a - b) diff = DOUBLE2SINGLE(fp64toselectable(diff)) # FP32 round diff = float(diff) u = diff * c tc = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single uc = DOUBLE2SINGLE(fp64toselectable(u)) # from double - res.append((tc, uc)) + res.append((uc, tc)) print ("DCT", i, "in", a, b, "c", c, "res", t, u) # SVSTATE (in this case, VL=2) @@ -146,20 +147,24 @@ class DCTTestCase(FHDLTestCase): self.assertEqual(sim.fpr(i+0), t) self.assertEqual(sim.fpr(i+4), u) - def test_sv_remap_fpmadds_dct(self): + def test_sv_remap_fpmadds_dct_4(self): """>>> lst = ["svshape 4, 1, 1, 2, 0", - "svremap 31, 1, 0, 2, 0, 1, 0", + "svremap 27, 1, 0, 2, 0, 1, 0", "sv.fdmadds 0.v, 0.v, 0.v, 8.v" ] - runs a full in-place O(N log2 N) butterfly schedule for - DCT + runs a full in-place 4-long O(N log2 N) inner butterfly schedule + for DCT SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC (3 inputs, 2 outputs) + + Note that the coefficient (FRC) is not on a "schedule", it + is straight Vectorised (0123...) because DCT coefficients + cannot be shared between butterfly layers (due to +0.5) """ lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0", - "svremap 31, 1, 0, 2, 0, 1, 0", - "sv.fdmadds 0.v, 0.v, 0.v, 8.v" + "svremap 27, 1, 0, 2, 0, 1, 0", + "sv.fdmadds 0.v, 0.v, 0.v, 8.v" ]) lst = list(lst) @@ -178,7 +183,7 @@ class DCTTestCase(FHDLTestCase): # store in regfile fprs = [0] * 32 for i, c in enumerate(coe): - fprs[i+8] = fp64toselectable(c) + fprs[i+8] = fp64toselectable(1.0 / c) # invert for i, a in enumerate(av): fprs[i+0] = fp64toselectable(a) @@ -207,7 +212,7 @@ class DCTTestCase(FHDLTestCase): # and the rounding is different err = abs((actual - expected) / expected) print ("err", i, err) - self.assertTrue(err < 1e-7) + self.assertTrue(err < 1e-6) def run_tst_program(self, prog, initial_regs=None, svstate=None,