From 17f19a80183d260238e2cf4ba9b78ccac9fe5807 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Wed, 29 Mar 2023 10:08:56 +0100 Subject: [PATCH] remove DCT/iDCT redundant modes which require less-efficient cos tables. Turns out that values are often repeated, so why waste space, especially when the svshape instruction is under pressure. This goes into https://libre-soc.org/openpower/sv/rfc/ls009/ --- openpower/isa/simplev.mdwn | 12 +- .../decoder/isa/test_caller_svp64_dct.py | 372 +----------------- 2 files changed, 5 insertions(+), 379 deletions(-) diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn index b22007dc..c350b3cf 100644 --- a/openpower/isa/simplev.mdwn +++ b/openpower/isa/simplev.mdwn @@ -164,10 +164,9 @@ Pseudo-code: # FRC (coefficients) SVSHAPE2[28:29] <- 0b10 # k schedule # set schedule up for (i)DCT Inner butterfly - # SVrm Mode 2 (Mode 6 for iDCT) is for pre-calculated coefficients, # SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode) - if ((SVrm = 0b0010) | (SVrm = 0b0100) | - (SVrm = 0b1010) | (SVrm = 0b1100)) then + if ((SVrm = 0b0100) | + (SVrm = 0b1100)) then # calculate O(N log2 N) n <- [0] * 3 do while n < 5 @@ -181,17 +180,14 @@ Pseudo-code: SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) mscale <- (0b0 || SVzd) + 1 - if (SVrm = 0b1010) | (SVrm = 0b1100) then + if (SVrm = 0b1100) then SVSHAPE0[30:31] <- 0b11 # iDCT mode SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode else SVSHAPE0[30:31] <- 0b01 # DCT mode SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop - if (SVrm = 0b1100) | (SVrm = 0b0100) then - SVSHAPE0[6:11] <- 0b000011 # (i)DCT Inner Butterfly mode 4 - else - SVSHAPE0[6:11] <- 0b000001 # (i)DCT Inner Butterfly mode 2 + SVSHAPE0[6:11] <- 0b000011 # (i)DCT Inner Butterfly mode 4 # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] SVSHAPE2[0:31] <- SVSHAPE0[0:31] diff --git 
a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py index 35e74226..78e47529 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_dct.py +++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py @@ -308,149 +308,6 @@ class DCTTestCase(FHDLTestCase): self.assertEqual(sim.fpr(i+0), t) self.assertEqual(sim.fpr(i+4), u) - def test_sv_remap_fpmadds_dct_inner_4_stride_1(self): - self.sv_remap_fpmadds_dct_inner_4(stride=2) - - def test_sv_remap_fpmadds_dct_inner_4_stride_1(self): - self.sv_remap_fpmadds_dct_inner_4(stride=1) - - def sv_remap_fpmadds_dct_inner_4(self, stride=2): - """>>> lst = ["svshape 4, 1, 1, 2, 0", - "svremap 27, 1, 0, 2, 0, 1, 0", - "sv.fdmadds *0, *0, *0, *32" - ] - runs a full in-place 4-long O(N log2 N) inner butterfly schedule - for DCT - - SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC - (3 inputs, 2 outputs) - - Note that the coefficient (FRC) is not on a "schedule", it - is straight Vectorised (0123...) 
because DCT coefficients - cannot be shared between butterfly layers (due to +0.5) - """ - lst = SVP64Asm(["svshape 4, 1, %d, 2, 0" % stride, - "svremap 27, 1, 0, 2, 0, 1, 0", - "sv.fdmadds *0, *0, *0, *16" - ]) - lst = list(lst) - - # array and coefficients to test - n = 4 - av = [7.0, -9.8, 3.0, -32.3] - coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients - - levels = n.bit_length() - 1 - ri = list(range(n)) - ri = [ri[reverse_bits(i, levels)] for i in range(n)] - avi = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3 - av = halfrev2(avi, False) - av = [av[ri[i]] for i in range(n)] - - # store in regfile - fprs = [0] * 64 - for i, c in enumerate(coe): - fprs[i+16] = fp64toselectable(1.0 / c) # invert - for i, a in enumerate(av): - fprs[i*stride+0] = fp64toselectable(a) - - with Program(lst, bigendian=False) as program: - sim = self.run_tst_program(program, initial_fprs=fprs) - print("spr svshape0", sim.spr['SVSHAPE0']) - print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz) - print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz) - print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz) - print("spr svshape1", sim.spr['SVSHAPE1']) - print("spr svshape2", sim.spr['SVSHAPE2']) - print("spr svshape3", sim.spr['SVSHAPE3']) - - # work out the results with the twin mul/add-sub - res = transform_inner_radix2_dct(avi, coe) - - for i, expected in enumerate(res): - print("i", i*stride, float(sim.fpr(i*stride)), - "expected", expected) - for i, expected in enumerate(res): - # convert to Power single - expected = fph.DOUBLE2SINGLE(fp64toselectable(expected)) - expected = float(expected) - actual = float(sim.fpr(i*stride)) - # approximate error calculation, good enough test - # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB - # and the rounding is different - err = abs((actual - expected) / expected) - print("err", i, err) - self.assertTrue(err < 1e-6) - - def test_sv_remap_fpmadds_idct_inner_4_stride_1(self): - self.sv_remap_fpmadds_idct_inner_4(stride=2) - - def 
test_sv_remap_fpmadds_idct_inner_4_stride_1(self): - self.sv_remap_fpmadds_idct_inner_4(stride=1) - - def sv_remap_fpmadds_idct_inner_4(self, stride=2): - """>>> lst = ["svshape 4, 1, 1, 10, 0", - "svremap 27, 0, 1, 2, 1, 0, 0", - "sv.ffmadds *0, *0, *0, *8" - ] - runs a full in-place 4-long O(N log2 N) inner butterfly schedule - for inverse-DCT - - SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC - (3 inputs, 2 outputs) - - Note that the coefficient (FRC) is not on a "schedule", it - is straight Vectorised (0123...) because DCT coefficients - cannot be shared between butterfly layers (due to +0.5) - """ - lst = SVP64Asm(["svshape 4, 1, %d, 10, 0" % stride, - "svremap 27, 0, 1, 2, 1, 0, 0", - "sv.ffmadds *0, *0, *0, *16" - ]) - lst = list(lst) - - # array and coefficients to test - n = 4 - levels = n.bit_length() - 1 - coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients - avi = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3 - av = halfrev2(avi, False) - - # store in regfile - fprs = [0] * 64 - for i, c in enumerate(coe): - fprs[i+16] = fp64toselectable(1.0 / c) # invert - for i, a in enumerate(av): - fprs[i*stride+0] = fp64toselectable(a) - - with Program(lst, bigendian=False) as program: - sim = self.run_tst_program(program, initial_fprs=fprs) - print("spr svshape0", sim.spr['SVSHAPE0']) - print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz) - print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz) - print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz) - print("spr svshape1", sim.spr['SVSHAPE1']) - print("spr svshape2", sim.spr['SVSHAPE2']) - print("spr svshape3", sim.spr['SVSHAPE3']) - - # work out the results with the twin mul/add-sub - res = transform_inner_radix2_idct(avi, coe) - - for i, expected in enumerate(res): - print("i", i*stride, float(sim.fpr(i*stride)), - "expected", expected) - for i, expected in enumerate(res): - # convert to Power single - expected = fph.DOUBLE2SINGLE(fp64toselectable(expected)) - expected = float(expected) - actual = float(sim.fpr(i*stride)) - # 
approximate error calculation, good enough test - # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB - # and the rounding is different - err = abs((actual - expected) / expected) - print("err", i, err) - self.assertTrue(err < 1e-6) - def test_sv_remap_fpmadds_idct_outer_8(self, stride=2): """>>> lst = ["svshape 8, 1, 1, 11, 0", "svremap 27, 0, 1, 2, 1, 0, 0", @@ -562,233 +419,6 @@ class DCTTestCase(FHDLTestCase): print("err", i, err) self.assertTrue(err < 1e-6) - def test_sv_remap_fpmadds_idct_8(self, stride=2): - """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1", - "svshape 8, 1, 1, 11, 0", - "sv.fadds *0, *0, *0", - "svshape 8, 1, 1, 10, 0", - "sv.ffmadds *0, *0, *0, *16" - ] - runs a full in-place 8-long O(N log2 N) inverse-DCT, both - inner and outer butterfly "REMAP" schedules. - """ - lst = SVP64Asm(["svremap 27, 0, 1, 2, 1, 0, 1", - "svshape 8, 1, %d, 11, 0" % stride, - "sv.fadds *0, *0, *0", - "svshape 8, 1, %d, 10, 0" % stride, - "sv.ffmadds *0, *0, *0, *16" - ]) - lst = list(lst) - - # array and coefficients to test - avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2] - n = len(avi) - levels = n.bit_length() - 1 - ri = list(range(n)) - ri = [ri[reverse_bits(i, levels)] for i in range(n)] - av = [avi[ri[i]] for i in range(n)] - av = halfrev2(av, True) - - # divide first value by 2.0, manually. 
rev and halfrev should - # not have moved it - av[0] /= 2.0 - #avi[0] /= 2.0 - - print("input data pre idct", av) - - ctable = [] - size = 2 - while size <= n: - halfsize = size // 2 - for i in range(n//size): - for ci in range(halfsize): - ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0) - size *= 2 - - # store in regfile - fprs = [0] * 32 - for i, a in enumerate(av): - fprs[i*stride+0] = fp64toselectable(a) - for i, c in enumerate(ctable): - fprs[i+16] = fp64toselectable(1.0 / c) # invert - - with Program(lst, bigendian=False) as program: - sim = self.run_tst_program(program, initial_fprs=fprs) - print("spr svshape0", sim.spr['SVSHAPE0']) - print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz) - print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz) - print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz) - print("spr svshape1", sim.spr['SVSHAPE1']) - print("spr svshape2", sim.spr['SVSHAPE2']) - print("spr svshape3", sim.spr['SVSHAPE3']) - - # inverse DCT - expected = [-15.793373940443367, 27.46969091937703, - -24.712331606496313, 27.03601462756265] - - #res = inverse_transform_iter(avi) - res = inverse_transform2(avi) - #res = transform_outer_radix2_idct(avi) - - for i, expected in enumerate(res): - print("i", i*stride, float(sim.fpr(i*stride)), - "expected", expected) - for i, expected in enumerate(res): - # convert to Power single - expected = fph.DOUBLE2SINGLE(fp64toselectable(expected)) - expected = float(expected) - actual = float(sim.fpr(i*stride)) - # approximate error calculation, good enough test - # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB - # and the rounding is different - err = abs((actual - expected) / expected) - print("err", i*stride, err) - self.assertTrue(err < 1e-5) - - def test_sv_remap_fpmadds_dct_8(self, stride=2): - """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1", - "svshape 8, 1, 1, 2, 0", - "sv.fdmadds *0, *0, *0, *8" - "svshape 8, 1, 1, 3, 0", - "sv.fadds *0, *0, *0" - ] - runs a full in-place 8-long O(N log2 N) DCT, both - inner and 
outer butterfly "REMAP" schedules. - """ - lst = SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1", - "svshape 8, 1, %d, 2, 0" % stride, - "sv.fdmadds *0, *0, *0, *16", - "svshape 8, 1, %d, 3, 0" % stride, - "sv.fadds *0, *0, *0" - ]) - lst = list(lst) - - # array and coefficients to test - avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2] - n = len(avi) - levels = n.bit_length() - 1 - ri = list(range(n)) - ri = [ri[reverse_bits(i, levels)] for i in range(n)] - av = halfrev2(avi, False) - av = [av[ri[i]] for i in range(n)] - ctable = [] - size = n - while size >= 2: - halfsize = size // 2 - for i in range(n//size): - for ci in range(halfsize): - ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0) - size //= 2 - - # store in regfile - fprs = [0] * 32 - for i, a in enumerate(av): - fprs[i*stride+0] = fp64toselectable(a) - for i, c in enumerate(ctable): - fprs[i+16] = fp64toselectable(1.0 / c) # invert - - with Program(lst, bigendian=False) as program: - sim = self.run_tst_program(program, initial_fprs=fprs) - print("spr svshape0", sim.spr['SVSHAPE0']) - print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz) - print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz) - print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz) - print("spr svshape1", sim.spr['SVSHAPE1']) - print("spr svshape2", sim.spr['SVSHAPE2']) - print("spr svshape3", sim.spr['SVSHAPE3']) - - # outer iterative sum - res = transform2(avi) - - for i, expected in enumerate(res): - print("i", i*stride, float(sim.fpr(i*stride)), - "expected", expected) - for i, expected in enumerate(res): - # convert to Power single - expected = fph.DOUBLE2SINGLE(fp64toselectable(expected)) - expected = float(expected) - actual = float(sim.fpr(i*stride)) - # approximate error calculation, good enough test - # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB - # and the rounding is different - err = abs((actual - expected) / expected) - print("err", i, err) - self.assertTrue(err < 1e-5) - - def test_sv_remap_dct_cos_precompute_8(self): - 
"""pre-computes a DCT COS table, deliberately using a lot of - registers so as to be able to see what is going on (dumping all - regs after the run). - - the simpler (scalar) version is in test_caller_transcendentals.py - (test_fp_coss_cvt), this is the SVP64 variant. TODO: really - need the new version of fcfids which doesn't spam memory with - LD/STs. - """ - lst = SVP64Asm(["svshape 8, 1, 1, 2, 0", - "svremap 0, 0, 0, 2, 0, 1, 1", - "sv.svstep *4, 4, 1", # svstep get vector of ci - "sv.svstep *16, 3, 1", # svstep get vector of step - "addi 1, 0, 0x0000", - "setvl 0, 0, 12, 0, 1, 1", - "sv.std *4, 0(1)", - "sv.lfd *64, 0(1)", - "sv.fcfids *48, *64", - "addi 1, 0, 0x0060", - "sv.std *16, 0(1)", - "sv.lfd *12, 0(1)", - "sv.fcfids *24, *12", - "sv.fadds *0, *24, 43", # plus 0.5 - "sv.fmuls *0, *0, 41", # times PI - "sv.fdivs *0, *0, *48", # div size - "sv.fcoss *80, *0", - "sv.fdivs *80, 43, *80", # div 0.5 / x - ]) - lst = list(lst) - - gprs = [0] * 32 - fprs = [0] * 128 - # constants - fprs[43] = fp64toselectable(0.5) # 0.5 - fprs[41] = fp64toselectable(math.pi) # pi - fprs[44] = fp64toselectable(2.0) # 2.0 - - n = 8 - - ctable = [] - size = n - while size >= 2: - halfsize = size // 2 - for i in range(n//size): - for ci in range(halfsize): - ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0) - size //= 2 - - with Program(lst, bigendian=False) as program: - sim = self.run_tst_program(program, gprs, initial_fprs=fprs) - print("MEM") - sim.mem.dump() - print("ci FP") - for i in range(len(ctable)): - actual = float(sim.fpr(i+24)) - print("i", i, actual) - print("size FP") - for i in range(len(ctable)): - actual = float(sim.fpr(i+48)) - print("i", i, actual) - print("temps") - for i in range(len(ctable)): - actual = float(sim.fpr(i)) - print("i", i, actual) - for i in range(len(ctable)): - expected = 1.0/ctable[i] - actual = float(sim.fpr(i+80)) - err = abs((actual - expected) / expected) - print("i", i, actual, "1/expect", 1/expected, - "expected", expected, - 
"err", err) - self.assertTrue(err < 1e-6) - def test_sv_remap_dct_cos_precompute_inner_8(self): """pre-computes a DCT COS table, using the shorter costable indices schedule. turns out, some COS values are repeated @@ -1037,7 +667,7 @@ class DCTTestCase(FHDLTestCase): "svshape 8, 1, 1, 11, 0", "sv.fadds *0, *0, *0", # Inner butterfly, twin +/- MUL-ADD-SUB - "svshape 8, 1, 1, 10, 0", + "svshape 8, 1, 1, 12, 0", "sv.ffmadds *0, *0, *0, *8" ] runs a full in-place 8-long O(N log2 N) Inverse-DCT, both -- 2.30.2