From 8dfffc9c2ff7bb91715500160d1b057f9bef3ba0 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Mon, 28 Jun 2021 15:15:03 +0100 Subject: [PATCH] add extra offset for FRB, for FFT Cooley-Tukey twin mul/add-sub --- openpower/isa/svfparith.mdwn | 8 +-- .../decoder/isa/test_caller_svp64_fft.py | 66 ++++++++++++------- src/openpower/decoder/power_decoder2.py | 32 +++++---- 3 files changed, 66 insertions(+), 40 deletions(-) diff --git a/openpower/isa/svfparith.mdwn b/openpower/isa/svfparith.mdwn index 1bcc1ac7..813ee2c5 100644 --- a/openpower/isa/svfparith.mdwn +++ b/openpower/isa/svfparith.mdwn @@ -166,7 +166,7 @@ A-Form Pseudo-code: - FRT <- FPMULADD32(FRA, FRC, FRB, 1, 1) + FRT <- FPMULADD32(FRA, FRC, FRA, 1, 1) FRS <- FPMULADD32(FRA, FRC, FRB, -1, 1) Special Registers Altered: @@ -185,7 +185,7 @@ A-Form Pseudo-code: - FRT <- FPMULADD32(FRA, FRC, FRB, 1, -1) + FRT <- FPMULADD32(FRA, FRC, FRA, 1, -1) FRS <- FPMULADD32(FRA, FRC, FRB, -1, -1) Special Registers Altered: @@ -204,7 +204,7 @@ A-Form Pseudo-code: - FRT <- FPMULADD32(FRA, FRC, FRB, -1, -1) + FRT <- FPMULADD32(FRA, FRC, FRA, -1, -1) FRS <- FPMULADD32(FRA, FRC, FRB, 1, -1) Special Registers Altered: @@ -223,7 +223,7 @@ A-Form Pseudo-code: - FRT <- FPMULADD32(FRA, FRC, FRB, -1, 1) + FRT <- FPMULADD32(FRA, FRC, FRA, -1, 1) FRS <- FPMULADD32(FRA, FRC, FRB, 1, 1) Special Registers Altered: diff --git a/src/openpower/decoder/isa/test_caller_svp64_fft.py b/src/openpower/decoder/isa/test_caller_svp64_fft.py index d7dbf03c..6cb2b522 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_fft.py +++ b/src/openpower/decoder/isa/test_caller_svp64_fft.py @@ -14,7 +14,8 @@ from openpower.decoder.isa.test_caller import Register, run_tst from openpower.sv.trans.svp64 import SVP64Asm from openpower.consts import SVP64CROffs from copy import deepcopy - +from openpower.decoder.helpers import fp64toselectable +from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE class 
DecoderTestCase(FHDLTestCase): @@ -22,44 +23,59 @@ class DecoderTestCase(FHDLTestCase): for i in range(32): self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64)) - def test_sv_fpmadds(self): - """>>> lst = ["sv.ffmadds 12.v, 2.v, 4.v, 12.v" + def test_sv_fpmadds_fft(self): + """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v" ] - two vector mul-adds, two vector mul-subs - * fp12 = fp2 * fp4 + f12 = 7.0 * -2.0 + 2.0 = -12.0 - * fp13 = fp3 * fp5 + f13 = (-9.8 * 2.0) + -32.3 = -51.9 - * fp14 = -(fp2 * fp4) + f14 = -(7.0 * -2.0) + 2.0 = -16.0 - * fp15 = -(fp3 * fp5) + f15 = -(-9.8 * 2) + -32.3 = -12.7 + four in-place vector mul-adds, four in-place vector mul-subs + + this is the twin "butterfly" mul-add-sub from Cooley-Tukey + https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms + + there is the *option* to target a different location (non-in-place) + just in case. + + SVP64 "FFT" mode will *automatically* offset FRB and an implicit + FRS to perform the two multiplies. one add, one subtract. 
+ + sv.ffmadds FRT, FRA, FRC, FRB actually does: + fmadds FRT , FRA, FRC, FRA + fnmsubs FRT+vl, FRA, FRC, FRB+vl """ - lst = SVP64Asm(["sv.ffmadds 12.v, 2.v, 4.v, 12.v" + lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v" ]) lst = list(lst) fprs = [0] * 32 - fprs[2] = 0x401C000000000000 # 7.0 - fprs[3] = 0xC02399999999999A # -9.8 - fprs[4] = 0x4000000000000000 # 2.0 - fprs[5] = 0xC040266660000000 # -32.3 - fprs[6] = 0x4000000000000000 # 2.0 - fprs[7] = 0x4000000000000000 # 2.0 - fprs[12] = 0xc000000000000000 # -2.0 - fprs[13] = 0x4000000000000000 # 2.0 - fprs[14] = 0xC02399999999999A # -9.8 - fprs[15] = 0xC040266660000000 # -32.3 + av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3 + bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7 + coe = [-1.0, 4.0, 3.1, 6.2] # coefficients + res = [] + # work out the results with the twin mul/add-sub + for i, (a, b, c) in enumerate(zip(av, bv, coe)): + fprs[i+2] = fp64toselectable(a) + fprs[i+6] = fp64toselectable(b) + fprs[i+10] = fp64toselectable(c) + mul = a * c + t = a + mul + u = b - mul + t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single + u = DOUBLE2SINGLE(fp64toselectable(u)) # from double + res.append((t, u)) + print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u) # SVSTATE (in this case, VL=2) svstate = SVP64State() - svstate.vl[0:7] = 2 # VL - svstate.maxvl[0:7] = 2 # MAXVL + svstate.vl[0:7] = 4 # VL + svstate.maxvl[0:7] = 4 # MAXVL print ("SVSTATE", bin(svstate.spr.asint())) with Program(lst, bigendian=False) as program: sim = self.run_tst_program(program, svstate=svstate, initial_fprs=fprs) - self.assertEqual(sim.fpr(12), SelectableInt(0xC028000000000000, 64)) - self.assertEqual(sim.fpr(13), SelectableInt(0xC049F33320000000, 64)) - self.assertEqual(sim.fpr(14), SelectableInt(0x4030000000000000, 64)) - self.assertEqual(sim.fpr(15), SelectableInt(0xc029666640000000, 64)) + # confirm that the results are as expected + for i, (t, u) in enumerate(res): + 
self.assertEqual(sim.fpr(i+2), t) + self.assertEqual(sim.fpr(i+6), u) def run_tst_program(self, prog, initial_regs=None, svstate=None, diff --git a/src/openpower/decoder/power_decoder2.py b/src/openpower/decoder/power_decoder2.py index ccf9fef1..290feea9 100644 --- a/src/openpower/decoder/power_decoder2.py +++ b/src/openpower/decoder/power_decoder2.py @@ -453,6 +453,7 @@ class DecodeOut2(Elaboratable): self.lk = Signal(reset_less=True) self.insn_in = Signal(32, reset_less=True) self.reg_out = Data(5, "reg_o2") + self.fp_madd_en = Signal(reset_less=True) # FFT instruction detected self.fast_out = Data(3, "fast_o2") self.fast_out3 = Data(3, "fast_o3") @@ -494,6 +495,7 @@ class DecodeOut2(Elaboratable): with m.If(self.svp64_fft_mode): comb += self.reg_out.data.eq(self.dec.FRT) comb += self.reg_out.ok.eq(1) + comb += self.fp_madd_en.eq(1) return m @@ -1250,27 +1252,34 @@ class PowerDecode2(PowerDecodeSubset): # registers a, b, c and out and out2 (LD/ST EA) sv_etype = self.op_get("SV_Etype") - for to_reg, fromreg, svdec, out in ( - (e.read_reg1, dec_a.reg_out, in1_svdec, False), - (e.read_reg2, dec_b.reg_out, in2_svdec, False), - (e.read_reg3, dec_c.reg_out, in3_svdec, False), - (e.write_reg, dec_o.reg_out, o_svdec, True), - (e.write_ea, dec_o2.reg_out, o2_svdec, True)): + for rname, to_reg, fromreg, svdec, out in ( + ("RA", e.read_reg1, dec_a.reg_out, in1_svdec, False), + ("RB", e.read_reg2, dec_b.reg_out, in2_svdec, False), + ("RC", e.read_reg3, dec_c.reg_out, in3_svdec, False), + ("RT", e.write_reg, dec_o.reg_out, o_svdec, True), + ("EA", e.write_ea, dec_o2.reg_out, o2_svdec, True)): comb += svdec.extra.eq(extra) # EXTRA field of SVP64 RM comb += svdec.etype.eq(sv_etype) # EXTRA2/3 for this insn comb += svdec.reg_in.eq(fromreg.data) # 3-bit (CR0/BC/BFA) comb += to_reg.ok.eq(fromreg.ok) + # *screaam* FFT mode needs an extra offset for RB + # similar to FRS/FRT (below). 
all of this needs cleanup + offs = Signal(7, name="offs_"+rname, reset_less=True) + comb += offs.eq(0) + if rname == 'RB': + with m.If(dec_o2.reg_out.ok & dec_o2.fp_madd_en): + comb += offs.eq(vl) # detect if Vectorised: add srcstep/dststep if yes. # to_reg is 7-bits, outs get dststep added, ins get srcstep with m.If(svdec.isvec): step = dststep if out else srcstep # reverse gear goes the opposite way with m.If(self.rm_dec.reverse_gear): - comb += to_reg.data.eq(svdec.reg_out+(vl-1-step)) + comb += to_reg.data.eq(offs+svdec.reg_out+(vl-1-step)) with m.Else(): - comb += to_reg.data.eq(step+svdec.reg_out) + comb += to_reg.data.eq(offs+step+svdec.reg_out) with m.Else(): - comb += to_reg.data.eq(svdec.reg_out) + comb += to_reg.data.eq(offs+svdec.reg_out) # SVP64 in/out fields comb += in1_svdec.idx.eq(self.op_get("sv_in1")) # reg #1 (in1_sel) @@ -1291,8 +1300,9 @@ class PowerDecode2(PowerDecodeSubset): # urrr... don't ask... the implicit register FRS in FFT mode # "tracks" FRT exactly except it's offset by VL. rather than - # mess up the above with if-statements, override it here - with m.If(dec_o2.reg_out.ok & self.use_svp64_fft): + # mess up the above with if-statements, override it here. + # same trick is applied to FRB, above, but it's a lot cleaner, there + with m.If(dec_o2.reg_out.ok & dec_o2.fp_madd_en): svdec = o_svdec # yes take source as o_svdec... with m.If(svdec.isvec): # reverse gear goes the opposite way -- 2.30.2