small inner DCT butterfly test, fix up order of fdmadds
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.power_decoder import (create_pdecode)
6 from openpower.simulator.program import Program
7 from openpower.decoder.isa.caller import SVP64State
8 from openpower.decoder.selectable_int import SelectableInt
9 from openpower.decoder.isa.test_caller import run_tst
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from copy import deepcopy
12 from openpower.decoder.helpers import fp64toselectable, SINGLE
13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
14 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
15 iterate_dct_inner_butterfly_indices)
16
17
def transform_inner_radix2(vec, ctable):
    """Python reference model of the DCT inner butterfly used by the tests.

    The input is first re-ordered with ``halfrev2(vec, False)`` and then
    permuted into bit-reversed index order.  After that, for every pair
    of index tuples produced in lock-step by two
    ``iterate_dct_inner_butterfly_indices`` schedules (identical except
    for ``skip``: 0b00 for "j", 0b01 for "j+halfstep") the two selected
    elements are replaced by their sum and by their difference multiplied
    by the reciprocal of the k-th coefficient.

    Parameters:
        vec: sequence of numbers to transform.  Its length is presumably
            expected to be a power of two -- TODO confirm against
            reverse_bits / the index iterator.
        ctable: indexable table of coefficients, read as ``ctable[k]``
            where k is the running iteration count; each entry is
            inverted (``1.0/coeff``) before being applied.

    Returns:
        the list holding the butterfly results (the permuted working
        copy, not the ``vec`` object that was passed in).

    Progress is printed to stdout on every iteration.
    """
    # Initialization
    size = len(vec)
    print ()
    print ("transform2", size)
    nbits = size.bit_length() - 1

    # table of bit-reversed positions, looked up through an identity list
    identity = list(range(size))
    bitrev = [identity[reverse_bits(pos, nbits)] for pos in range(size)]

    # pretend the data was LDed in half-swapped *and* bit-reversed order
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    data = [swapped[bitrev[pos]] for pos in range(size)]

    ################
    # INNER butterfly
    ################
    xdim, ydim, zdim = size, 0, 0

    class SVSHAPE:
        pass

    def make_shape(skip):
        # the two schedules are identical apart from "skip"
        shape = SVSHAPE()
        shape.lims = [xdim, ydim, zdim]
        shape.order = [0, 1, 2]     # experiment with permutations here
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0            # experiment with different offset here
        shape.invxyz = [1, 0, 0]    # inversion if desired
        return shape

    shape_j = make_shape(0b00)          # j schedule
    shape_jhalf = make_shape(0b01)      # j+halfstep schedule

    # walk both schedules together, obtaining fresh indices each step
    lower_it = iterate_dct_inner_butterfly_indices(shape_j)
    upper_it = iterate_dct_inner_butterfly_indices(shape_jhalf)
    for step, (lower, upper) in enumerate(zip(lower_it, upper_it)):
        jl, jl_end = lower
        jh, jh_end = upper
        first, second = data[jl], data[jh]
        coefficient = ctable[step]
        data[jl] = first + second
        delta = first - second
        data[jh] = delta * (1.0/coefficient)
        print ("coeff", "ci", step,
               "jl", jl, "jh", jh,
               "i/n", (step+0.5), 1.0/coefficient,
               "t1, t2", first, second, "res", data[jl], data[jh],
               "end", bin(jl_end), bin(jh_end))
        # all three loop-end flags set: the schedule is finished
        if jl_end == 0b111:
            break

    return data
81
82
class DCTTestCase(FHDLTestCase):
    """Simulator tests for the SVP64 ``sv.fdmadds`` twin mul-add/sub.

    Each test assembles a short SVP64 program, runs it through
    ``run_tst`` with pre-loaded floating-point registers, and compares
    the resulting FPRs against values computed in plain Python.
    """

    def _check_regs(self, sim, expected):
        """Assert that GPRs 0..31 of *sim* equal *expected* (64-bit)."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    def test_sv_ffadds_dct(self):
        """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ]
        four in-place vector adds, four in-place vector mul-subs

        Register layout used by this test (VL=4):
            FPR 0..3   a (first half of the array)
            FPR 4..7   b (second half of the array)
            FPR 8..11  c (coefficients)

        What is asserted after execution, for i in 0..3:
            FPR[i]   == single(single(a - b) * c)
            FPR[i+4] == single(a + b)

        NOTE(review): an earlier version of this docstring described the
        add as landing in FRT and the subtract in FRT+vl; the assertions
        below check the opposite placement, so the description above
        follows the assertions.  Confirm against the ISA pseudocode.
        """
        lst = SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # cheat here with these values, they're selected so that
        # rounding errors do not occur.  sigh.
        fprs = [0] * 32
        av = [7.0, -0.8, 2.0, -2.3]     # first half of array 0..3
        bv = [-2.0, 2.0, -0.8, 1.4]     # second half of array 4..7
        cv = [-1.0, 0.5, 2.5, -0.25]    # coefficients
        res = []    # one (mul-sub result, add result) pair per element
        # work out the results with the twin add-sub
        for i, (a, b, c) in enumerate(zip(av, bv, cv)):
            fprs[i+0] = fp64toselectable(a)
            fprs[i+4] = fp64toselectable(b)
            fprs[i+8] = fp64toselectable(c)
            # this isn't quite a perfect replication of the
            # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
            # and FPSUB32 directly to be honest.
            t = a + b
            diff = (a - b)
            diff = DOUBLE2SINGLE(fp64toselectable(diff))  # FP32 round
            diff = float(diff)
            u = diff * c
            tc = DOUBLE2SINGLE(fp64toselectable(t))  # convert to Power single
            uc = DOUBLE2SINGLE(fp64toselectable(u))  # from double
            res.append((uc, tc))
            print ("DCT", i, "in", a, b, "c", c, "res", t, u)

        # SVSTATE (in this case, VL=4)
        svstate = SVP64State()
        svstate.vl = 4      # VL
        svstate.maxvl = 4   # MAXVL
        print ("SVSTATE", bin(svstate.asint()))

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            # dump actual register contents next to the expected pairs
            for i, (mulsub, total) in enumerate(res):
                a = float(sim.fpr(i+0))
                b = float(sim.fpr(i+4))
                print ("DCT", i, "in", a, b,
                       "res", float(mulsub), float(total))
            # confirm that the results are as expected
            for i, (mulsub, total) in enumerate(res):
                self.assertEqual(sim.fpr(i+0), mulsub)
                self.assertEqual(sim.fpr(i+4), total)

    def test_sv_remap_fpmadds_dct_4(self):
        """>>> lst = ["svshape 4, 1, 1, 2, 0",
                         "svremap 27, 1, 0, 2, 0, 1, 0",
                         "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ]
        runs a full in-place 4-long O(N log2 N) inner butterfly schedule
        for DCT

        SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
        (3 inputs, 2 outputs)

        Note that the coefficient (FRC) is not on a "schedule", it
        is straight Vectorised (0123...) because DCT coefficients
        cannot be shared between butterfly layers (due to +0.5)
        """
        lst = SVP64Asm(["svshape 4, 1, 1, 2, 0",
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        n = 4
        avi = [7.0, -0.8, 2.0, -2.3]    # input array, natural order
        coe = [-0.25, 0.5, 3.1, 6.2]    # 4 coefficients

        # load the registers in half-swapped, bit-reversed order: the
        # same pre-permutation transform_inner_radix2 applies to its input
        levels = n.bit_length() - 1
        ri = list(range(n))
        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
        av = halfrev2(avi, False)
        av = [av[ri[i]] for i in range(n)]

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(coe):
            fprs[i+8] = fp64toselectable(1.0 / c)   # invert
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            print ("spr svshape0", sim.spr['SVSHAPE0'])
            print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
            print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
            print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
            print ("spr svshape1", sim.spr['SVSHAPE1'])
            print ("spr svshape2", sim.spr['SVSHAPE2'])
            print ("spr svshape3", sim.spr['SVSHAPE3'])

            # work out the results with the twin mul/add-sub
            res = transform_inner_radix2(avi, coe)

            for i, expected in enumerate(res):
                print ("i", i, float(sim.fpr(i)), "expected", expected)
            for i, expected in enumerate(res):
                # convert to Power single
                expected = DOUBLE2SINGLE(fp64toselectable(expected))
                expected = float(expected)
                actual = float(sim.fpr(i))
                # approximate error calculation, good enough test
                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
                # and the rounding is different
                if expected == 0.0:
                    # relative error is undefined against zero: fall back
                    # to the absolute error instead of dividing by zero
                    err = abs(actual)
                else:
                    err = abs((actual - expected) / expected)
                print ("err", i, err)
                self.assertLess(err, 1e-6,
                                "FPR %d: actual %r expected %r" %
                                (i, actual, expected))

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None,
                        initial_mem=None,
                        initial_fprs=None):
        """Run *prog* through ``run_tst`` and return the simulator.

        ``initial_regs`` defaults to 32 zeroed GPRs; ``svstate``,
        ``initial_mem`` and ``initial_fprs`` are handed to ``run_tst``
        unchanged.  The GPR and FPR files are dumped to stdout after
        the run to help with debugging.
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        simulator = run_tst(prog, initial_regs, mem=initial_mem,
                            initial_fprs=initial_fprs,
                            svstate=svstate)

        print ("GPRs")
        simulator.gpr.dump()
        print ("FPRs")
        simulator.fpr.dump()

        return simulator
233
234
# when executed directly as a script, let unittest discover and run
# the test cases defined in this module
if __name__ == "__main__":
    unittest.main()