self.assertEqual(sim.fpr(i+0), t)
self.assertEqual(sim.fpr(i+4), u)
- def test_sv_remap_fpmadds_dct_inner_4_stride_1(self):
- self.sv_remap_fpmadds_dct_inner_4(stride=2)
-
- def test_sv_remap_fpmadds_dct_inner_4_stride_1(self):
- self.sv_remap_fpmadds_dct_inner_4(stride=1)
-
- def sv_remap_fpmadds_dct_inner_4(self, stride=2):
- """>>> lst = ["svshape 4, 1, 1, 2, 0",
- "svremap 27, 1, 0, 2, 0, 1, 0",
- "sv.fdmadds *0, *0, *0, *32"
- ]
- runs a full in-place 4-long O(N log2 N) inner butterfly schedule
- for DCT
-
- SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
- (3 inputs, 2 outputs)
-
- Note that the coefficient (FRC) is not on a "schedule", it
- is straight Vectorised (0123...) because DCT coefficients
- cannot be shared between butterfly layers (due to +0.5)
- """
- lst = SVP64Asm(["svshape 4, 1, %d, 2, 0" % stride,
- "svremap 27, 1, 0, 2, 0, 1, 0",
- "sv.fdmadds *0, *0, *0, *16"
- ])
- lst = list(lst)
-
- # array and coefficients to test
- n = 4
- av = [7.0, -9.8, 3.0, -32.3]
- coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
-
- levels = n.bit_length() - 1
- ri = list(range(n))
- ri = [ri[reverse_bits(i, levels)] for i in range(n)]
- avi = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
- av = halfrev2(avi, False)
- av = [av[ri[i]] for i in range(n)]
-
- # store in regfile
- fprs = [0] * 64
- for i, c in enumerate(coe):
- fprs[i+16] = fp64toselectable(1.0 / c) # invert
- for i, a in enumerate(av):
- fprs[i*stride+0] = fp64toselectable(a)
-
- with Program(lst, bigendian=False) as program:
- sim = self.run_tst_program(program, initial_fprs=fprs)
- print("spr svshape0", sim.spr['SVSHAPE0'])
- print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
- print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
- print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
- print("spr svshape1", sim.spr['SVSHAPE1'])
- print("spr svshape2", sim.spr['SVSHAPE2'])
- print("spr svshape3", sim.spr['SVSHAPE3'])
-
- # work out the results with the twin mul/add-sub
- res = transform_inner_radix2_dct(avi, coe)
-
- for i, expected in enumerate(res):
- print("i", i*stride, float(sim.fpr(i*stride)),
- "expected", expected)
- for i, expected in enumerate(res):
- # convert to Power single
- expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
- expected = float(expected)
- actual = float(sim.fpr(i*stride))
- # approximate error calculation, good enough test
- # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
- # and the rounding is different
- err = abs((actual - expected) / expected)
- print("err", i, err)
- self.assertTrue(err < 1e-6)
-
- def test_sv_remap_fpmadds_idct_inner_4_stride_1(self):
- self.sv_remap_fpmadds_idct_inner_4(stride=2)
-
- def test_sv_remap_fpmadds_idct_inner_4_stride_1(self):
- self.sv_remap_fpmadds_idct_inner_4(stride=1)
-
- def sv_remap_fpmadds_idct_inner_4(self, stride=2):
- """>>> lst = ["svshape 4, 1, 1, 10, 0",
- "svremap 27, 0, 1, 2, 1, 0, 0",
- "sv.ffmadds *0, *0, *0, *8"
- ]
- runs a full in-place 4-long O(N log2 N) inner butterfly schedule
- for inverse-DCT
-
- SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
- (3 inputs, 2 outputs)
-
- Note that the coefficient (FRC) is not on a "schedule", it
- is straight Vectorised (0123...) because DCT coefficients
- cannot be shared between butterfly layers (due to +0.5)
- """
- lst = SVP64Asm(["svshape 4, 1, %d, 10, 0" % stride,
- "svremap 27, 0, 1, 2, 1, 0, 0",
- "sv.ffmadds *0, *0, *0, *16"
- ])
- lst = list(lst)
-
- # array and coefficients to test
- n = 4
- levels = n.bit_length() - 1
- coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
- avi = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
- av = halfrev2(avi, False)
-
- # store in regfile
- fprs = [0] * 64
- for i, c in enumerate(coe):
- fprs[i+16] = fp64toselectable(1.0 / c) # invert
- for i, a in enumerate(av):
- fprs[i*stride+0] = fp64toselectable(a)
-
- with Program(lst, bigendian=False) as program:
- sim = self.run_tst_program(program, initial_fprs=fprs)
- print("spr svshape0", sim.spr['SVSHAPE0'])
- print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
- print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
- print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
- print("spr svshape1", sim.spr['SVSHAPE1'])
- print("spr svshape2", sim.spr['SVSHAPE2'])
- print("spr svshape3", sim.spr['SVSHAPE3'])
-
- # work out the results with the twin mul/add-sub
- res = transform_inner_radix2_idct(avi, coe)
-
- for i, expected in enumerate(res):
- print("i", i*stride, float(sim.fpr(i*stride)),
- "expected", expected)
- for i, expected in enumerate(res):
- # convert to Power single
- expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
- expected = float(expected)
- actual = float(sim.fpr(i*stride))
- # approximate error calculation, good enough test
- # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
- # and the rounding is different
- err = abs((actual - expected) / expected)
- print("err", i, err)
- self.assertTrue(err < 1e-6)
-
def test_sv_remap_fpmadds_idct_outer_8(self, stride=2):
""">>> lst = ["svshape 8, 1, 1, 11, 0",
"svremap 27, 0, 1, 2, 1, 0, 0",
print("err", i, err)
self.assertTrue(err < 1e-6)
- def test_sv_remap_fpmadds_idct_8(self, stride=2):
- """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
- "svshape 8, 1, 1, 11, 0",
- "sv.fadds *0, *0, *0",
- "svshape 8, 1, 1, 10, 0",
- "sv.ffmadds *0, *0, *0, *16"
- ]
- runs a full in-place 8-long O(N log2 N) inverse-DCT, both
- inner and outer butterfly "REMAP" schedules.
- """
- lst = SVP64Asm(["svremap 27, 0, 1, 2, 1, 0, 1",
- "svshape 8, 1, %d, 11, 0" % stride,
- "sv.fadds *0, *0, *0",
- "svshape 8, 1, %d, 10, 0" % stride,
- "sv.ffmadds *0, *0, *0, *16"
- ])
- lst = list(lst)
-
- # array and coefficients to test
- avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
- n = len(avi)
- levels = n.bit_length() - 1
- ri = list(range(n))
- ri = [ri[reverse_bits(i, levels)] for i in range(n)]
- av = [avi[ri[i]] for i in range(n)]
- av = halfrev2(av, True)
-
- # divide first value by 2.0, manually. rev and halfrev should
- # not have moved it
- av[0] /= 2.0
- #avi[0] /= 2.0
-
- print("input data pre idct", av)
-
- ctable = []
- size = 2
- while size <= n:
- halfsize = size // 2
- for i in range(n//size):
- for ci in range(halfsize):
- ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
- size *= 2
-
- # store in regfile
- fprs = [0] * 32
- for i, a in enumerate(av):
- fprs[i*stride+0] = fp64toselectable(a)
- for i, c in enumerate(ctable):
- fprs[i+16] = fp64toselectable(1.0 / c) # invert
-
- with Program(lst, bigendian=False) as program:
- sim = self.run_tst_program(program, initial_fprs=fprs)
- print("spr svshape0", sim.spr['SVSHAPE0'])
- print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
- print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
- print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
- print("spr svshape1", sim.spr['SVSHAPE1'])
- print("spr svshape2", sim.spr['SVSHAPE2'])
- print("spr svshape3", sim.spr['SVSHAPE3'])
-
- # inverse DCT
- expected = [-15.793373940443367, 27.46969091937703,
- -24.712331606496313, 27.03601462756265]
-
- #res = inverse_transform_iter(avi)
- res = inverse_transform2(avi)
- #res = transform_outer_radix2_idct(avi)
-
- for i, expected in enumerate(res):
- print("i", i*stride, float(sim.fpr(i*stride)),
- "expected", expected)
- for i, expected in enumerate(res):
- # convert to Power single
- expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
- expected = float(expected)
- actual = float(sim.fpr(i*stride))
- # approximate error calculation, good enough test
- # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
- # and the rounding is different
- err = abs((actual - expected) / expected)
- print("err", i*stride, err)
- self.assertTrue(err < 1e-5)
-
- def test_sv_remap_fpmadds_dct_8(self, stride=2):
- """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
- "svshape 8, 1, 1, 2, 0",
- "sv.fdmadds *0, *0, *0, *8"
- "svshape 8, 1, 1, 3, 0",
- "sv.fadds *0, *0, *0"
- ]
- runs a full in-place 8-long O(N log2 N) DCT, both
- inner and outer butterfly "REMAP" schedules.
- """
- lst = SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
- "svshape 8, 1, %d, 2, 0" % stride,
- "sv.fdmadds *0, *0, *0, *16",
- "svshape 8, 1, %d, 3, 0" % stride,
- "sv.fadds *0, *0, *0"
- ])
- lst = list(lst)
-
- # array and coefficients to test
- avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
- n = len(avi)
- levels = n.bit_length() - 1
- ri = list(range(n))
- ri = [ri[reverse_bits(i, levels)] for i in range(n)]
- av = halfrev2(avi, False)
- av = [av[ri[i]] for i in range(n)]
- ctable = []
- size = n
- while size >= 2:
- halfsize = size // 2
- for i in range(n//size):
- for ci in range(halfsize):
- ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
- size //= 2
-
- # store in regfile
- fprs = [0] * 32
- for i, a in enumerate(av):
- fprs[i*stride+0] = fp64toselectable(a)
- for i, c in enumerate(ctable):
- fprs[i+16] = fp64toselectable(1.0 / c) # invert
-
- with Program(lst, bigendian=False) as program:
- sim = self.run_tst_program(program, initial_fprs=fprs)
- print("spr svshape0", sim.spr['SVSHAPE0'])
- print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
- print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
- print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
- print("spr svshape1", sim.spr['SVSHAPE1'])
- print("spr svshape2", sim.spr['SVSHAPE2'])
- print("spr svshape3", sim.spr['SVSHAPE3'])
-
- # outer iterative sum
- res = transform2(avi)
-
- for i, expected in enumerate(res):
- print("i", i*stride, float(sim.fpr(i*stride)),
- "expected", expected)
- for i, expected in enumerate(res):
- # convert to Power single
- expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
- expected = float(expected)
- actual = float(sim.fpr(i*stride))
- # approximate error calculation, good enough test
- # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
- # and the rounding is different
- err = abs((actual - expected) / expected)
- print("err", i, err)
- self.assertTrue(err < 1e-5)
-
- def test_sv_remap_dct_cos_precompute_8(self):
- """pre-computes a DCT COS table, deliberately using a lot of
- registers so as to be able to see what is going on (dumping all
- regs after the run).
-
- the simpler (scalar) version is in test_caller_transcendentals.py
- (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
- need the new version of fcfids which doesn't spam memory with
- LD/STs.
- """
- lst = SVP64Asm(["svshape 8, 1, 1, 2, 0",
- "svremap 0, 0, 0, 2, 0, 1, 1",
- "sv.svstep *4, 4, 1", # svstep get vector of ci
- "sv.svstep *16, 3, 1", # svstep get vector of step
- "addi 1, 0, 0x0000",
- "setvl 0, 0, 12, 0, 1, 1",
- "sv.std *4, 0(1)",
- "sv.lfd *64, 0(1)",
- "sv.fcfids *48, *64",
- "addi 1, 0, 0x0060",
- "sv.std *16, 0(1)",
- "sv.lfd *12, 0(1)",
- "sv.fcfids *24, *12",
- "sv.fadds *0, *24, 43", # plus 0.5
- "sv.fmuls *0, *0, 41", # times PI
- "sv.fdivs *0, *0, *48", # div size
- "sv.fcoss *80, *0",
- "sv.fdivs *80, 43, *80", # div 0.5 / x
- ])
- lst = list(lst)
-
- gprs = [0] * 32
- fprs = [0] * 128
- # constants
- fprs[43] = fp64toselectable(0.5) # 0.5
- fprs[41] = fp64toselectable(math.pi) # pi
- fprs[44] = fp64toselectable(2.0) # 2.0
-
- n = 8
-
- ctable = []
- size = n
- while size >= 2:
- halfsize = size // 2
- for i in range(n//size):
- for ci in range(halfsize):
- ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
- size //= 2
-
- with Program(lst, bigendian=False) as program:
- sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
- print("MEM")
- sim.mem.dump()
- print("ci FP")
- for i in range(len(ctable)):
- actual = float(sim.fpr(i+24))
- print("i", i, actual)
- print("size FP")
- for i in range(len(ctable)):
- actual = float(sim.fpr(i+48))
- print("i", i, actual)
- print("temps")
- for i in range(len(ctable)):
- actual = float(sim.fpr(i))
- print("i", i, actual)
- for i in range(len(ctable)):
- expected = 1.0/ctable[i]
- actual = float(sim.fpr(i+80))
- err = abs((actual - expected) / expected)
- print("i", i, actual, "1/expect", 1/expected,
- "expected", expected,
- "err", err)
- self.assertTrue(err < 1e-6)
-
def test_sv_remap_dct_cos_precompute_inner_8(self):
"""pre-computes a DCT COS table, using the shorter costable
indices schedule. turns out, some COS values are repeated
"svshape 8, 1, 1, 11, 0",
"sv.fadds *0, *0, *0",
# Inner butterfly, twin +/- MUL-ADD-SUB
- "svshape 8, 1, 1, 10, 0",
+ "svshape 8, 1, 1, 12, 0",
"sv.ffmadds *0, *0, *0, *8"
]
runs a full in-place 8-long O(N log2 N) Inverse-DCT, both