From 38c833d2038c2068be70725634f5763141f27bb6 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Sat, 10 Jul 2021 12:09:10 +0100 Subject: [PATCH] add (disabled) FFT complex unit test under development --- .../decoder/isa/test_caller_svp64_fft.py | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/src/openpower/decoder/isa/test_caller_svp64_fft.py b/src/openpower/decoder/isa/test_caller_svp64_fft.py index 62b8609e..5b93281a 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_fft.py +++ b/src/openpower/decoder/isa/test_caller_svp64_fft.py @@ -61,6 +61,56 @@ def transform_radix2(vec, exptable): return vec +def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i): + """ + # FFT and convolution test (Python), based on Project Nayuki + # + # Copyright (c) 2020 Project Nayuki. (MIT License) + # https://www.nayuki.io/page/free-small-fft-in-multiple-languages + + """ + # bits of the integer 'val'. + def reverse_bits(val, width): + result = 0 + for _ in range(width): + result = (result << 1) | (val & 1) + val >>= 1 + return result + + # Initialization + n = len(vec_r) + levels = n.bit_length() - 1 + + # Copy with bit-reversed permutation + #vec = [vec[reverse_bits(i, levels)] for i in range(n)] + + size = 2 + while size <= n: + halfsize = size // 2 + tablestep = n // size + for i in range(0, n, size): + k = 0 + for j in range(i, i + halfsize): + # exact same actual computation, just embedded in + # triple-nested for-loops + jl, jh = j, j+halfsize + + tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k] + tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k] + vec_r[jh] = vec_r[jl] - tpre + vec_i[jh] = vec_i[jl] - tpim + vec_r[jl] += tpre + vec_i[jl] += tpim + + print ("xform jl jh k", jl, jh, k, + "vr h l", vec_r[jh], vec_r[jl], + "vi h l", vec_i[jh], vec_i[jl]) + k += tablestep + size *= 2 + + return vec_r, vec_i + + class FFTTestCase(FHDLTestCase): def _check_regs(self, sim, expected): @@ -281,6 +331,134 @@ class FFTTestCase(FHDLTestCase): self.assertEqual(sim.fpr(i+2), t) self.assertEqual(sim.fpr(i+6), u) + def tst_sv_remap_fpmadds_fft_svstep_complex(self): + """ + runs a full in-place O(N log2 N) butterfly schedule for + Discrete Fourier Transform. this version however uses + SVP64 "Vertical-First" Mode and so needs an explicit + branch, testing CR0. + + SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC + (3 inputs, 2 outputs) + + complex calculation: + + tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k] + vec_r[jh] = vec_r[jl] - tpre + vec_r[jl] += tpre + + tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k] + vec_i[jh] = vec_i[jl] - tpim + vec_i[jl] += tpim + + temp1 = vec[jh] * exptable[k] + temp2 = vec[jl] + vec[jh] = temp2 - temp1 + vec[jl] = temp2 + temp1 + """ + lst = SVP64Asm( ["setvl 0, 0, 11, 1, 1, 1", + # tpre + "svremap 8, 1, 1, 1", + "sv.fmuls 32, 0.v, 16.v", + "svremap 8, 1, 1, 1", + "sv.fmuls 33, 8.v, 20.v", + "fadds 32, 32, 33", + # tpim + "svremap 8, 1, 1, 1", + "sv.fmuls 34, 0.v, 20.v", + "svremap 8, 1, 1, 1", + "sv.fmuls 35, 8.v, 16.v", + "fsubs 34, 34, 35", + # vec_r jh/jl + "svremap 8, 1, 1, 1", + "sv.ffadds 0.v, 0.v, 32", + # vec_i jh/jl + "svremap 8, 1, 1, 1", + "sv.ffadds 8.v, 8.v, 34", + + # svstep loop + "setvl. 0, 0, 0, 1, 0, 0", + "bc 4, 2, -16" + ]) + lst = list(lst) + + # array and coefficients to test + ar = [7.0, -9.8, 3.0, -32.3, + -2.0, 5.0, -9.8, 31.3] # array 0..7 real + ai = [1.0, -1.8, 3.0, 19.3, + 4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary + coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real + coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary + + # store in regfile + fprs = [0] * 64 + for i, a in enumerate(ar): + fprs[i+0] = fp64toselectable(a) + for i, a in enumerate(ai): + fprs[i+8] = fp64toselectable(a) + for i, c in enumerate(coer): + fprs[i+16] = fp64toselectable(c) + for i, c in enumerate(coei): + fprs[i+20] = fp64toselectable(c) + + # set total. err don't know how to calculate how many there are... + # do it manually for now + VL = 0 + size = 2 + n = len(ar) + while size <= n: + halfsize = size // 2 + tablestep = n // size + for i in range(0, n, size): + for j in range(i, i + halfsize): + VL += 1 + size *= 2 + + # SVSTATE (calculated VL) + svstate = SVP64State() + svstate.vl[0:7] = VL # VL + svstate.maxvl[0:7] = VL # MAXVL + print ("SVSTATE", bin(svstate.spr.asint())) + + with Program(lst, bigendian=False) as program: + sim = self.run_tst_program(program, svstate=svstate, + initial_fprs=fprs) + print ("spr svshape0", sim.spr['SVSHAPE0']) + print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz) + print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz) + print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz) + print ("spr svshape1", sim.spr['SVSHAPE1']) + print ("spr svshape2", sim.spr['SVSHAPE2']) + print ("spr svshape3", sim.spr['SVSHAPE3']) + + # work out the results with the twin mul/add-sub, explicit + # complex numbers + res_r, res_i = transform_radix2_complex(ar, ai, coer, coei) + + for i, expected_r, expected_i in enumerate(zip(res_r, res_i)): + print ("i", i, float(sim.fpr(i)), + "expected_r", expected_r, + "expected_i", expected_i) + for i, expected_r, expected_i in enumerate(zip(res_r, res_i)): + # convert to Power single + expected_r = DOUBLE2SINGLE(fp64toselectable(expected_r )) + expected_r = float(expected_r) + actual_r = float(sim.fpr(i)) + # approximate error calculation, good enough test + # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB + # and the rounding is different + err = abs(actual_r - expected_r ) / expected_r + self.assertTrue(err < 1e-7) + # convert to Power single + expected_i = DOUBLE2SINGLE(fp64toselectable(expected_i )) + expected_i = float(expected_i) + actual_i = float(sim.fpr(i+8)) + # approximate error calculation, good enough test + # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB + # and the rounding is different + err = abs(actual_i - expected_i ) / expected_i + self.assertTrue(err < 1e-7) + def run_tst_program(self, prog, initial_regs=None, svstate=None, initial_mem=None, -- 2.30.2