From a77af7ef383640c695cc435124b855540e0f0ba1 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Wed, 7 Jul 2021 14:07:27 +0100 Subject: [PATCH] add in DFT test from Project Nayuki to verify results of butterfly schedule --- .../decoder/isa/test_caller_svp64_fft.py | 187 +++++++++++------- 1 file changed, 112 insertions(+), 75 deletions(-) diff --git a/src/openpower/decoder/isa/test_caller_svp64_fft.py b/src/openpower/decoder/isa/test_caller_svp64_fft.py index fe7c7f6d..23bd38e5 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_fft.py +++ b/src/openpower/decoder/isa/test_caller_svp64_fft.py @@ -17,71 +17,65 @@ from copy import deepcopy from openpower.decoder.helpers import fp64toselectable from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE + +def transform_radix2(vec, exptable): + """ + # FFT and convolution test (Python), based on Project Nayuki + # + # Copyright (c) 2020 Project Nayuki. (MIT License) + # https://www.nayuki.io/page/free-small-fft-in-multiple-languages + + """ + # bits of the integer 'val'. + def reverse_bits(val, width): + result = 0 + for _ in range(width): + result = (result << 1) | (val & 1) + val >>= 1 + return result + + # Initialization + n = len(vec) + levels = n.bit_length() - 1 + + # Copy with bit-reversed permutation + #vec = [vec[reverse_bits(i, levels)] for i in range(n)] + + size = 2 + while size <= n: + halfsize = size // 2 + tablestep = n // size + for i in range(0, n, size): + k = 0 + for j in range(i, i + halfsize): + # exact same actual computation, just embedded in + # triple-nested for-loops + jl, jh = j, j+halfsize + temp1 = vec[jh] * exptable[k] + temp2 = vec[jl] + vec[jh] = temp2 - temp1 + vec[jl] = temp2 + temp1 + print ("transform_radix2 jl jh k", jl, jh, k, + "temp1, temp2", temp1, temp2, + "v[jh] v[jl]", vec[jh], vec[jl]) + k += tablestep + size *= 2 + + return vec + + class DecoderTestCase(FHDLTestCase): def _check_regs(self, sim, expected): for i in range(32): self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64)) - def tst_sv_fpmadds_fft(self): - """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v" - ] - four in-place vector mul-adds, four in-place vector mul-subs - - this is the twin "butterfly" mul-add-sub from Cooley-Tukey - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms - - there is the *option* to target a different location (non-in-place) - just in case. - - SVP64 "FFT" mode will *automatically* offset FRB and an implicit - FRS to perform the two multiplies. one add, one subtract. - - sv.ffmadds FRT, FRA, FRC, FRB actually does: - fmadds FRT , FRA, FRC, FRA - fnmsubs FRT+vl, FRA, FRC, FRB+vl - """ - lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v" - ]) - lst = list(lst) - - fprs = [0] * 32 - av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3 - bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7 - coe = [-1.0, 4.0, 3.1, 6.2] # coefficients - res = [] - # work out the results with the twin mul/add-sub - for i, (a, b, c) in enumerate(zip(av, bv, coe)): - fprs[i+2] = fp64toselectable(a) - fprs[i+6] = fp64toselectable(b) - fprs[i+10] = fp64toselectable(c) - mul = a * c - t = a + mul - u = b - mul - t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single - u = DOUBLE2SINGLE(fp64toselectable(u)) # from double - res.append((t, u)) - print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u) - - # SVSTATE (in this case, VL=2) - svstate = SVP64State() - svstate.vl[0:7] = 4 # VL - svstate.maxvl[0:7] = 4 # MAXVL - print ("SVSTATE", bin(svstate.spr.asint())) - - with Program(lst, bigendian=False) as program: - sim = self.run_tst_program(program, svstate=svstate, - initial_fprs=fprs) - # confirm that the results are as expected - for i, (t, u) in enumerate(res): - self.assertEqual(sim.fpr(i+2), t) - self.assertEqual(sim.fpr(i+6), u) - def test_sv_remap_fpmadds_fft(self): """>>> lst = ["svremap 8, 1, 1, 1", "sv.ffmadds 2.v, 2.v, 2.v, 10.v" ] - four in-place vector mul-adds, four in-place vector mul-subs + runs a full in-place O(N log2 N) butterfly schedule for + Discrete Fourier Transform. this is the twin "butterfly" mul-add-sub from Cooley-Tukey https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms @@ -89,38 +83,28 @@ class DecoderTestCase(FHDLTestCase): there is the *option* to target a different location (non-in-place) just in case. - SVP64 "FFT" mode will *automatically* offset FRB and an implicit - FRS to perform the two multiplies. one add, one subtract. - - sv.ffmadds FRT, FRA, FRC, FRB actually does: - fmadds FRT , FRA, FRC, FRA - fnmsubs FRT+vl, FRA, FRC, FRB+vl + SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC + (3 inputs, 2 outputs) """ lst = SVP64Asm( ["svremap 8, 1, 1, 1", "sv.ffmadds 0.v, 0.v, 0.v, 8.v" ]) lst = list(lst) - fprs = [0] * 32 + # array and coefficients to test av = [7.0, -9.8, 3.0, -32.3, -2.0, 5.0, -9.8, 31.3] # array 0..7 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients + # store in regfile + fprs = [0] * 32 for i, c in enumerate(coe): fprs[i+8] = fp64toselectable(c) for i, a in enumerate(av): fprs[i+0] = fp64toselectable(a) + # work out the results with the twin mul/add-sub - res = [] - for i, (a, c) in enumerate(zip(av, coe)): - continue - mul = a * c - t = a + mul - u = b - mul - t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single - u = DOUBLE2SINGLE(fp64toselectable(u)) # from double - res.append((t, u)) - print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u) + res = transform_radix2(av, coe) # set total. err don't know how to calculate how many there are... # do it manually for now @@ -151,10 +135,63 @@ class DecoderTestCase(FHDLTestCase): print ("spr svshape1", sim.spr['SVSHAPE1']) print ("spr svshape2", sim.spr['SVSHAPE2']) print ("spr svshape3", sim.spr['SVSHAPE3']) - for i in range(8): - print ("i", i, float(sim.fpr(i))) + for i, expected in enumerate(res): + print ("i", i, float(sim.fpr(i)), "expected", expected) + for i, expected in enumerate(res): + # convert to Power single + expected = DOUBLE2SINGLE(fp64toselectable(expected)) + self.assertEqual(sim.fpr(i), expected) + + + def test_sv_fpmadds_fft(self): + """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v" + ] + four in-place vector mul-adds, four in-place vector mul-subs + + this is the twin "butterfly" mul-add-sub from Cooley-Tukey + https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms + + there is the *option* to target a different location (non-in-place) + just in case. + + SVP64 "FFT" mode will *automatically* offset FRB and an implicit + FRS to perform the two multiplies. one add, one subtract. + + sv.ffmadds FRT, FRA, FRC, FRB actually does: + fmadds FRT , FRA, FRC, FRA + fnmsubs FRT+vl, FRA, FRC, FRB+vl + """ + lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v" + ]) + lst = list(lst) - return + fprs = [0] * 32 + av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3 + bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7 + coe = [-1.0, 4.0, 3.1, 6.2] # coefficients + res = [] + # work out the results with the twin mul/add-sub + for i, (a, b, c) in enumerate(zip(av, bv, coe)): + fprs[i+2] = fp64toselectable(a) + fprs[i+6] = fp64toselectable(b) + fprs[i+10] = fp64toselectable(c) + mul = a * c + t = a + mul + u = b - mul + t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single + u = DOUBLE2SINGLE(fp64toselectable(u)) # from double + res.append((t, u)) + print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u) + + # SVSTATE (in this case, VL=2) + svstate = SVP64State() + svstate.vl[0:7] = 4 # VL + svstate.maxvl[0:7] = 4 # MAXVL + print ("SVSTATE", bin(svstate.spr.asint())) + + with Program(lst, bigendian=False) as program: + sim = self.run_tst_program(program, svstate=svstate, + initial_fprs=fprs) # confirm that the results are as expected for i, (t, u) in enumerate(res): self.assertEqual(sim.fpr(i+2), t) -- 2.30.2