from openpower.decoder.isa.test_caller import run_tst
from openpower.sv.trans.svp64 import SVP64Asm
from copy import deepcopy
-from openpower.decoder.helpers import fp64toselectable
+from openpower.decoder.helpers import fp64toselectable, SINGLE
from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
-def transform_radix2(vec, exptable):
+def transform_radix2(vec, exptable, reverse=False):
"""
# FFT and convolution test (Python), based on Project Nayuki
#
levels = n.bit_length() - 1
# Copy with bit-reversed permutation
- #vec = [vec[reverse_bits(i, levels)] for i in range(n)]
+ if reverse:
+ vec = [vec[reverse_bits(i, levels)] for i in range(n)]
size = 2
while size <= n:
return vec
-def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i):
+def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
"""
# FFT and convolution test (Python), based on Project Nayuki
#
levels = n.bit_length() - 1
# Copy with bit-reversed permutation
- #vec = [vec[reverse_bits(i, levels)] for i in range(n)]
+ if reverse:
+ vec = [vec[reverse_bits(i, levels)] for i in range(n)]
size = 2
while size <= n:
"svremap 31, 1, 0, 2, 0, 1, 0",
"sv.ffmadds 0.v, 0.v, 0.v, 8.v",
"setvl. 0, 0, 1, 1, 0, 0",
- "bc 4, 2, -16"
+ "bc 6, 3, -16"
])
runs a full in-place O(N log2 N) butterfly schedule for
Discrete Fourier Transform. this version however uses
"svremap 31, 1, 0, 2, 0, 1, 0",
"sv.ffmadds 0.v, 0.v, 0.v, 8.v",
"setvl. 0, 0, 1, 1, 0, 0",
- "bc 4, 2, -16"
+ "bc 6, 3, -16"
])
lst = list(lst)
"svremap 26, 0, 0, 0, 0, 1, 1",
"sv.ffadds 0.v, 24, 0.v",
"setvl. 0, 0, 1, 1, 0, 0",
- "bc 4, 2, -28"
+ "bc 6, 3, -28"
])
runs a full in-place O(N log2 N) butterfly schedule for
twin-add: # sv.ffadds FRT(/FRS), FRA, FRB
vec[jh] = temp2 - temp1
vec[jl] = temp2 + temp1
+
+ also see notes in complex fft test: here svremap is done in
+ "non-persistent" mode (as a demo) whereas in the complex fft
+ svremap is used in "persistent" mode, where by a complete
+ coincidence the REMAP arguments all happen to line up and
+ only one persistent svremap is needed. the exact same trick
+ *could* be applied here but for illustrative purposes it is not.
"""
lst = SVP64Asm( [
"svshape 8, 1, 1, 1, 1",
# RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
- "svremap 5, 1, 0, 2, 0, 0, 1",
+ "svremap 5, 1, 0, 2, 0, 0, 0",
"sv.fmuls 24, 0.v, 8.v",
# RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
- "svremap 26, 0, 0, 0, 0, 1, 1",
+ "svremap 26, 0, 0, 0, 0, 1, 0",
"sv.ffadds 0.v, 24, 0.v",
"setvl. 0, 0, 1, 1, 0, 0",
- "bc 4, 2, -28"
+ "bc 6, 3, -28"
])
lst = list(lst)
sv.ffmadds FRT, FRA, FRC, FRB actually does:
fmadds FRT , FRA, FRC, FRA
fnmsubs FRT+vl, FRA, FRC, FRB+vl
+
"""
lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
])
temp2 = vec[jl]
vec[jh] = temp2 - temp1
vec[jl] = temp2 + temp1
+
+ note: a rather nice convenience / coincidence. the meaning of
+ these two instructions is:
+ # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
+ "svremap 5, 1, 0, 2, 0, 0, 1",
+ # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
+ "svremap 26, 0, 0, 0, 0, 1, 1",
+
+ however it turns out that they can be *merged*, and for
+ the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
+ *ignore* their REMAPs (by definition, because you can't REMAP
+ scalar operands), and for the second one (sv.ffadds) exactly the
+ right REMAPs are also ignored!
+
+ therefore we can merge:
+ "svremap 5, 1, 0, 2, 0, 0, 1",
+ "svremap 26, 0, 0, 0, 0, 1, 1",
+ into:
+ "svremap 31, 1, 0, 2, 0, 1, 1",
+ and save one instruction.
"""
lst = SVP64Asm( [
- # set triple butterfly mode
+ # set triple butterfly mode with persistent "REMAP"
"svshape 8, 1, 1, 1, 1",
+ "svremap 31, 1, 0, 2, 0, 1, 1",
# tpre
- "svremap 5, 1, 0, 2, 0, 0, 1",
"sv.fmuls 24, 0.v, 16.v", # mul1_r = r*cos_r
- "svremap 5, 1, 0, 2, 0, 0, 1",
"sv.fmadds 24, 8.v, 20.v, 24", # mul2_r = i*sin_i
# tpre = mul1_r + mul2_r
# tpim
- "svremap 5, 1, 0, 2, 0, 0, 1",
"sv.fmuls 26, 0.v, 20.v", # mul1_i = r*sin_i
- "svremap 5, 1, 0, 2, 0, 0, 1",
"sv.fmsubs 26, 8.v, 16.v, 26", # mul2_i = i*cos_r
# tpim = mul2_i - mul1_i
# vec_r jh/jl
- "svremap 26, 0, 0, 0, 0, 1, 1",
"sv.ffadds 0.v, 24, 0.v", # vh/vl +/- tpre
# vec_i jh/jl
- "svremap 26, 0, 0, 0, 0, 1, 1",
"sv.ffadds 8.v, 26, 8.v", # vh/vl +- tpim
# svstep loop
"setvl. 0, 0, 1, 1, 0, 0",
- "bc 4, 2, -76"
+ "bc 6, 3, -56"
])
lst = list(lst)
self.assertEqual(sim.fpr(i+2), t)
self.assertEqual(sim.fpr(i+6), u)
+    def test_sv_remap_fpmadds_fft_ldst(self):
+        """>>> lst = ["svshape 8, 1, 1, 15, 0",
+                      "svremap 1, 0, 0, 0, 0, 0, 0, 0",
+                      "sv.lfssh 0.v, 4(0), 20", # shifted
+                      "svshape 8, 1, 1, 1, 0",
+                      "svremap 31, 1, 0, 2, 0, 1, 0",
+                      "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
+                     ]
+
+        runs a full in-place O(N log2 N) butterfly schedule for
+        Discrete Fourier Transform, using bit-reversed LD/ST
+
+        test setup:
+        * the four coefficients are placed in FPRs 8..11
+        * the eight input values are converted to single-precision and
+          packed two per 8-byte memory word (even index in the low
+          32 bits, odd index in the high 32 bits), starting at address 0
+        * results are read back from FPRs 0..7 and compared against
+          transform_radix2(av, coe, reverse=True) using a relative
+          error bound of 1e-6
+
+        NOTE(review): presumably sv.lfssh is what brings the packed
+        values from memory into FPRs 0..7 in bit-reversed order - confirm
+        against the instruction definition.
+        """
+        lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0",
+                         "svremap 1, 0, 0, 0, 0, 0, 0, 0",
+                         "sv.lfssh 0.v, 4(0), 20", # shifted
+                         "svshape 8, 1, 1, 1, 0",
+                         "svremap 31, 1, 0, 2, 0, 1, 0",
+                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
+                        ])
+        lst = list(lst)
+
+        # array and coefficients to test
+        av = [7.0, -9.8, 3.0, -32.3,
+              -2.0, 5.0, -9.8, 31.3] # array 0..7
+        coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
+
+        # store in regfile
+        fprs = [0] * 32
+        for i, c in enumerate(coe):
+            fprs[i+8] = fp64toselectable(c)
+        # store in memory: two 32-bit singles are packed per 64-bit word
+        mem = {}
+        val = 0
+        for i, a in enumerate(av):
+            a = SINGLE(fp64toselectable(a)).value
+            shift = (i % 2) == 1
+            if shift == 0:
+                # even index: remember it, it becomes the low half
+                val = a
+            else:
+                # odd index: goes in the high half, word is now complete
+                mem[(i//2)*8] = val | (a << 32)
+
+        with Program(lst, bigendian=False) as program:
+            sim = self.run_tst_program(program, initial_mem=mem,
+                                       initial_fprs=fprs)
+            print ("spr svshape0", sim.spr['SVSHAPE0'])
+            print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
+            print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
+            print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
+            print ("spr svshape1", sim.spr['SVSHAPE1'])
+            print ("spr svshape2", sim.spr['SVSHAPE2'])
+            print ("spr svshape3", sim.spr['SVSHAPE3'])
+
+            print ("mem dump")
+            print (sim.mem.dump())
+
+            # work out the results with the twin mul/add-sub,
+            # note bit-reverse mode requested
+            res = transform_radix2(av, coe, reverse=True)
+
+            for i, expected in enumerate(res):
+                print ("i", i, float(sim.fpr(i)), "expected", expected)
+            for i, expected in enumerate(res):
+                # convert to Power single
+                expected = DOUBLE2SINGLE(fp64toselectable(expected))
+                expected = float(expected)
+                actual = float(sim.fpr(i))
+                # approximate error calculation, good enough test
+                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
+                # and the rounding is different
+                #
+                # the error must be scaled by the *magnitude* of the
+                # expected value: dividing by a negative expected value
+                # gives a negative "error" which always passes the check.
+                # fall back to absolute error when expected is exactly zero
+                # rather than dividing by zero.
+                scale = abs(expected) or 1.0
+                err = abs(actual - expected) / scale
+                self.assertLess(err, 1e-6,
+                                "FPR %d: actual %r expected %r" %
+                                (i, actual, expected))
+
def run_tst_program(self, prog, initial_regs=None,
svstate=None,
initial_mem=None,