add DCT outer butterfly iterative overlapping ADD schedule
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.power_decoder import (create_pdecode)
6 from openpower.simulator.program import Program
7 from openpower.decoder.isa.caller import SVP64State
8 from openpower.decoder.selectable_int import SelectableInt
9 from openpower.decoder.isa.test_caller import run_tst
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from copy import deepcopy
12 from openpower.decoder.helpers import fp64toselectable, SINGLE
13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
14 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
15 iterate_dct_inner_butterfly_indices,
16 iterate_dct_outer_butterfly_indices)
17
18
19 def transform_inner_radix2(vec, ctable):
20
21 # Initialization
22 n = len(vec)
23 print ()
24 print ("transform2", n)
25 levels = n.bit_length() - 1
26
27 # reference (read/write) the in-place data in *reverse-bit-order*
28 ri = list(range(n))
29 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
30
31 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
32 # TODO: merge these two
33 vec = halfrev2(vec, False)
34 vec = [vec[ri[i]] for i in range(n)]
35
36 ################
37 # INNER butterfly
38 ################
39 xdim = n
40 ydim = 0
41 zdim = 0
42
43 # set up an SVSHAPE
44 class SVSHAPE:
45 pass
46 # j schedule
47 SVSHAPE0 = SVSHAPE()
48 SVSHAPE0.lims = [xdim, ydim, zdim]
49 SVSHAPE0.order = [0,1,2] # experiment with different permutations, here
50 SVSHAPE0.mode = 0b01
51 SVSHAPE0.submode2 = 0b01
52 SVSHAPE0.skip = 0b00
53 SVSHAPE0.offset = 0 # experiment with different offset, here
54 SVSHAPE0.invxyz = [1,0,0] # inversion if desired
55 # j+halfstep schedule
56 SVSHAPE1 = SVSHAPE()
57 SVSHAPE1.lims = [xdim, ydim, zdim]
58 SVSHAPE1.order = [0,1,2] # experiment with different permutations, here
59 SVSHAPE1.mode = 0b01
60 SVSHAPE1.submode2 = 0b01
61 SVSHAPE1.skip = 0b01
62 SVSHAPE1.offset = 0 # experiment with different offset, here
63 SVSHAPE1.invxyz = [1,0,0] # inversion if desired
64
65 # enumerate over the iterator function, getting new indices
66 i0 = iterate_dct_inner_butterfly_indices(SVSHAPE0)
67 i1 = iterate_dct_inner_butterfly_indices(SVSHAPE1)
68 for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
69 t1, t2 = vec[jl], vec[jh]
70 coeff = ctable[k]
71 vec[jl] = t1 + t2
72 vec[jh] = (t1 - t2) * (1.0/coeff)
73 print ("coeff", "ci", k,
74 "jl", jl, "jh", jh,
75 "i/n", (k+0.5), 1.0/coeff,
76 "t1, t2", t1, t2, "res", vec[jl], vec[jh],
77 "end", bin(jle), bin(jhe))
78 if jle == 0b111: # all loops end
79 break
80
81 return vec
82
83 def transform_outer_radix2(vec):
84
85 # Initialization
86 n = len(vec)
87 print ()
88 print ("transform2", n)
89 levels = n.bit_length() - 1
90
91 # outer butterfly
92 xdim = n
93 ydim = 0
94 zdim = 0
95
96 # j schedule
97 class SVSHAPE:
98 pass
99 SVSHAPE0 = SVSHAPE()
100 SVSHAPE0.lims = [xdim, ydim, zdim]
101 SVSHAPE0.submode2 = 0b100
102 SVSHAPE0.mode = 0b01
103 SVSHAPE0.skip = 0b00
104 SVSHAPE0.offset = 0 # experiment with different offset, here
105 SVSHAPE0.invxyz = [0,0,0] # inversion if desired
106 # j+halfstep schedule
107 SVSHAPE1 = SVSHAPE()
108 SVSHAPE1.lims = [xdim, ydim, zdim]
109 SVSHAPE1.mode = 0b01
110 SVSHAPE1.submode2 = 0b100
111 SVSHAPE1.skip = 0b01
112 SVSHAPE1.offset = 0 # experiment with different offset, here
113 SVSHAPE1.invxyz = [0,0,0] # inversion if desired
114
115 # enumerate over the iterator function, getting new indices
116 i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
117 i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
118 for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
119 print ("itersum jr", jl, jh,
120 "end", bin(jle), bin(jhe))
121 vec[jl] += vec[jh]
122 if jle == 0b111: # all loops end
123 break
124
125 print("transform2 result", vec)
126
127 return vec
128
129
130 class DCTTestCase(FHDLTestCase):
131
132 def _check_regs(self, sim, expected):
133 for i in range(32):
134 self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
135
136 def test_sv_ffadds_dct(self):
137 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
138 ]
139 four in-place vector adds, four in-place vector mul-subs
140
141 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
142 FRS to perform the two multiplies. one add, one subtract.
143
144 sv.fdadds FRT, FRA, FRC, FRB actually does:
145 fadds FRT , FRB, FRA
146 fsubs FRT+vl, FRA, FRB+vl
147 """
148 lst = SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
149 ])
150 lst = list(lst)
151
152 # cheat here with these values, they're selected so that
153 # rounding errors do not occur. sigh.
154 fprs = [0] * 32
155 av = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
156 bv = [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
157 cv = [-1.0, 0.5, 2.5, -0.25] # coefficients
158 res = []
159 # work out the results with the twin add-sub
160 for i, (a, b, c) in enumerate(zip(av, bv, cv)):
161 fprs[i+0] = fp64toselectable(a)
162 fprs[i+4] = fp64toselectable(b)
163 fprs[i+8] = fp64toselectable(c)
164 # this isn't quite a perfect replication of the
165 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
166 # and FPSUB32 directly to be honest.
167 t = a + b
168 diff = (a - b)
169 diff = DOUBLE2SINGLE(fp64toselectable(diff)) # FP32 round
170 diff = float(diff)
171 u = diff * c
172 tc = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
173 uc = DOUBLE2SINGLE(fp64toselectable(u)) # from double
174 res.append((uc, tc))
175 print ("DCT", i, "in", a, b, "c", c, "res", t, u)
176
177 # SVSTATE (in this case, VL=2)
178 svstate = SVP64State()
179 svstate.vl = 4 # VL
180 svstate.maxvl = 4 # MAXVL
181 print ("SVSTATE", bin(svstate.asint()))
182
183 with Program(lst, bigendian=False) as program:
184 sim = self.run_tst_program(program, svstate=svstate,
185 initial_fprs=fprs)
186 # confirm that the results are as expected
187 for i, (t, u) in enumerate(res):
188 a = float(sim.fpr(i+0))
189 b = float(sim.fpr(i+4))
190 t = float(t)
191 u = float(u)
192 print ("DCT", i, "in", a, b, "res", t, u)
193 for i, (t, u) in enumerate(res):
194 self.assertEqual(sim.fpr(i+0), t)
195 self.assertEqual(sim.fpr(i+4), u)
196
197 def test_sv_remap_fpmadds_dct_inner_4(self):
198 """>>> lst = ["svshape 4, 1, 1, 2, 0",
199 "svremap 27, 1, 0, 2, 0, 1, 0",
200 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
201 ]
202 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
203 for DCT
204
205 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
206 (3 inputs, 2 outputs)
207
208 Note that the coefficient (FRC) is not on a "schedule", it
209 is straight Vectorised (0123...) because DCT coefficients
210 cannot be shared between butterfly layers (due to +0.5)
211 """
212 lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0",
213 "svremap 27, 1, 0, 2, 0, 1, 0",
214 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
215 ])
216 lst = list(lst)
217
218 # array and coefficients to test
219 n = 4
220 av = [7.0, -9.8, 3.0, -32.3]
221 coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
222
223 levels = n.bit_length() - 1
224 ri = list(range(n))
225 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
226 avi = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
227 av = halfrev2(avi, False)
228 av = [av[ri[i]] for i in range(n)]
229
230 # store in regfile
231 fprs = [0] * 32
232 for i, c in enumerate(coe):
233 fprs[i+8] = fp64toselectable(1.0 / c) # invert
234 for i, a in enumerate(av):
235 fprs[i+0] = fp64toselectable(a)
236
237 with Program(lst, bigendian=False) as program:
238 sim = self.run_tst_program(program, initial_fprs=fprs)
239 print ("spr svshape0", sim.spr['SVSHAPE0'])
240 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
241 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
242 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
243 print ("spr svshape1", sim.spr['SVSHAPE1'])
244 print ("spr svshape2", sim.spr['SVSHAPE2'])
245 print ("spr svshape3", sim.spr['SVSHAPE3'])
246
247 # work out the results with the twin mul/add-sub
248 res = transform_inner_radix2(avi, coe)
249
250 for i, expected in enumerate(res):
251 print ("i", i, float(sim.fpr(i)), "expected", expected)
252 for i, expected in enumerate(res):
253 # convert to Power single
254 expected = DOUBLE2SINGLE(fp64toselectable(expected))
255 expected = float(expected)
256 actual = float(sim.fpr(i))
257 # approximate error calculation, good enough test
258 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
259 # and the rounding is different
260 err = abs((actual - expected) / expected)
261 print ("err", i, err)
262 self.assertTrue(err < 1e-6)
263
264 def test_sv_remap_fpmadds_dct_outer_8(self):
265 """>>> lst = ["svshape 8, 1, 1, 3, 0",
266 "svremap 27, 1, 0, 2, 0, 1, 0",
267 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
268 ]
269 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
270 for DCT, does the iterative overlapped ADDs
271
272 SVP64 "REMAP" in Butterfly Mode.
273 """
274 lst = SVP64Asm( ["svshape 8, 1, 1, 3, 0",
275 "svremap 27, 1, 0, 2, 0, 1, 0",
276 "sv.fadds 0.v, 0.v, 0.v"
277 ])
278 lst = list(lst)
279
280 # array and coefficients to test
281 av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
282
283 # store in regfile
284 fprs = [0] * 32
285 for i, a in enumerate(av):
286 fprs[i+0] = fp64toselectable(a)
287
288 with Program(lst, bigendian=False) as program:
289 sim = self.run_tst_program(program, initial_fprs=fprs)
290 print ("spr svshape0", sim.spr['SVSHAPE0'])
291 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
292 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
293 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
294 print ("spr svshape1", sim.spr['SVSHAPE1'])
295 print ("spr svshape2", sim.spr['SVSHAPE2'])
296 print ("spr svshape3", sim.spr['SVSHAPE3'])
297
298 # outer iterative sum
299 res = transform_outer_radix2(av)
300
301 for i, expected in enumerate(res):
302 print ("i", i, float(sim.fpr(i)), "expected", expected)
303 for i, expected in enumerate(res):
304 # convert to Power single
305 expected = DOUBLE2SINGLE(fp64toselectable(expected))
306 expected = float(expected)
307 actual = float(sim.fpr(i))
308 # approximate error calculation, good enough test
309 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
310 # and the rounding is different
311 err = abs((actual - expected) / expected)
312 print ("err", i, err)
313 self.assertTrue(err < 1e-6)
314
315 def run_tst_program(self, prog, initial_regs=None,
316 svstate=None,
317 initial_mem=None,
318 initial_fprs=None):
319 if initial_regs is None:
320 initial_regs = [0] * 32
321 simulator = run_tst(prog, initial_regs, mem=initial_mem,
322 initial_fprs=initial_fprs,
323 svstate=svstate)
324
325 print ("GPRs")
326 simulator.gpr.dump()
327 print ("FPRs")
328 simulator.fpr.dump()
329
330 return simulator
331
332
333 if __name__ == "__main__":
334 unittest.main()