add DCT unit test combining DCT inner and outer butterfly
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.power_decoder import (create_pdecode)
5 from openpower.simulator.program import Program
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.selectable_int import SelectableInt
8 from openpower.decoder.isa.test_caller import run_tst
9 from openpower.sv.trans.svp64 import SVP64Asm
10 from copy import deepcopy
11 from openpower.decoder.helpers import fp64toselectable, SINGLE
12 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
13 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
14 iterate_dct_inner_butterfly_indices,
15 iterate_dct_outer_butterfly_indices,
16 transform2)
17 import unittest
18 import math
19
20
def transform_inner_radix2(vec, ctable):
    """Pure-python model of the DCT inner butterfly, used as a reference.

    The input is half-swapped with ``halfrev2`` and then re-ordered by
    bit-reversed index, mimicking data that was loaded in that order.
    The butterfly is applied to that freshly built list, which is returned.

    vec    -- input samples (its length sets the X dimension of the schedule)
    ctable -- one coefficient per butterfly step, indexed by step number;
              each difference is multiplied by the *reciprocal* coefficient

    Returns the re-ordered list after all butterfly steps.
    """
    size = len(vec)
    print()
    print("transform2", size)
    nbits = size.bit_length() - 1

    # table of bit-reversed positions, looked up through an identity list
    identity = list(range(size))
    bitrev = [identity[reverse_bits(pos, nbits)] for pos in identity]

    # pretend the data was LDed half-swapped *and* bit-reversed
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    data = [swapped[src] for src in bitrev]

    # bare attribute container handed to the index iterator
    class SVSHAPE:
        pass

    def build_shape(skip):
        # the two schedules only differ in "skip":
        # 0b00 is the j schedule, 0b01 the j+halfstep schedule
        shape = SVSHAPE()
        shape.lims = [size, 0, 0]      # xdim, ydim, zdim
        shape.order = [0, 1, 2]        # experiment with permutations here
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0               # experiment with offsets here
        shape.invxyz = [1, 0, 0]       # inversion if desired
        return shape

    lower_iter = iterate_dct_inner_butterfly_indices(build_shape(0b00))
    upper_iter = iterate_dct_inner_butterfly_indices(build_shape(0b01))

    # walk both schedules in lock-step; "step" selects the coefficient
    step = 0
    for (lo, lo_end), (hi, hi_end) in zip(lower_iter, upper_iter):
        first, second = data[lo], data[hi]
        scale = 1.0 / ctable[step]
        data[lo] = first + second
        data[hi] = (first - second) * scale
        print("coeff", "ci", step,
              "jl", lo, "jh", hi,
              "i/n", (step + 0.5), scale,
              "t1, t2", first, second, "res", data[lo], data[hi],
              "end", bin(lo_end), bin(hi_end))
        if lo_end == 0b111:  # all loops end
            break
        step += 1

    return data
84
def transform_outer_radix2(vec):
    """Pure-python model of the DCT outer butterfly (iterative sums).

    Walks the "j" and "j+halfstep" index schedules produced by
    ``iterate_dct_outer_butterfly_indices`` in lock-step and, for each
    pair, adds ``vec[jh]`` into ``vec[jl]``.

    vec -- list of samples; its length sets the X dimension of the
           schedule.  NOTE: the list is modified *in place*.

    Returns the same (now modified) list object that was passed in.
    """
    n = len(vec)
    print()
    print("transform2", n)

    # bare attribute container handed to the index iterator
    class SVSHAPE:
        pass

    def make_shape(skip):
        # the two schedules only differ in "skip":
        # 0b00 is the j schedule, 0b01 the j+halfstep schedule
        shape = SVSHAPE()
        shape.lims = [n, 0, 0]         # xdim, ydim, zdim
        shape.mode = 0b01
        shape.submode2 = 0b100
        shape.skip = skip
        shape.offset = 0               # experiment with different offset
        shape.invxyz = [0, 0, 0]       # inversion if desired
        return shape

    # enumerate over the iterator function, getting new indices
    i0 = iterate_dct_outer_butterfly_indices(make_shape(0b00))
    i1 = iterate_dct_outer_butterfly_indices(make_shape(0b01))
    for (jl, jle), (jh, jhe) in zip(i0, i1):
        print("itersum jr", jl, jh,
              "end", bin(jle), bin(jhe))
        vec[jl] += vec[jh]
        if jle == 0b111:  # all loops end
            break

    print("transform2 result", vec)

    return vec
130
131
class DCTTestCase(FHDLTestCase):
    """SVP64 DCT unit tests.

    Each test assembles a short SVP64 program, preloads the FP register
    file, runs the program through ``run_tst`` and compares the resulting
    FPRs against a pure-python model.
    """

    def _check_regs(self, sim, expected):
        """Assert that all 32 GPRs of *sim* match *expected* (as 64-bit)."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    @staticmethod
    def _dct_load_order(values):
        """Return *values* half-swapped (halfrev2) then bit-reversed.

        This is the order in which the tests place the input data in the
        register file before running the inner butterfly.
        """
        n = len(values)
        levels = n.bit_length() - 1
        ri = list(range(n))
        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
        swapped = halfrev2(values, False)
        return [swapped[ri[i]] for i in range(n)]

    @staticmethod
    def _dump_svshapes(sim):
        """Print the four SVSHAPE SPRs of *sim* (debug aid only)."""
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

    def _assert_fprs_close(self, sim, res, tolerance):
        """Check FPR i of *sim* against res[i], within a relative error.

        sim       -- simulator returned by run_tst_program
        res       -- expected values (python floats), one per FPR from 0
        tolerance -- upper bound (exclusive) on the relative error

        An approximate comparison is used because the python model and
        the simulated instructions do not round identically.  When an
        expected value is exactly zero the absolute error is used
        instead, to avoid dividing by zero.
        """
        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            if expected == 0.0:
                err = abs(actual)
            else:
                err = abs((actual - expected) / expected)
            print("err", i, err)
            self.assertLess(err, tolerance,
                            "FPR %d: got %r, expected %r"
                            % (i, actual, expected))

    def test_sv_ffadds_dct(self):
        """Twin add / mul-sub on two 4-element halves, no REMAP.

        >>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                  ]

        four in-place vector adds, four in-place vector mul-subs

        SVP64 "DCT" mode will *automatically* offset FRB and an implicit
        FRS to perform the two multiplies. one add, one subtract.

        The checks below expect (a - b) * c in FPRs 0..3 and a + b in
        FPRs 4..7, where a, b, c were preloaded at FPRs 0.., 4.., 8..
        """
        lst = SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # cheat here with these values, they're selected so that
        # rounding errors do not occur. sigh.
        fprs = [0] * 32
        av = [7.0, -0.8, 2.0, -2.3]  # first half of array 0..3
        bv = [-2.0, 2.0, -0.8, 1.4]  # second half of array 4..7
        cv = [-1.0, 0.5, 2.5, -0.25]  # coefficients
        res = []
        # work out the results with the twin add-sub
        for i, (a, b, c) in enumerate(zip(av, bv, cv)):
            fprs[i+0] = fp64toselectable(a)
            fprs[i+4] = fp64toselectable(b)
            fprs[i+8] = fp64toselectable(c)
            # this isn't quite a perfect replication of the
            # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
            # and FPSUB32 directly to be honest.
            t = a + b
            diff = (a - b)
            diff = DOUBLE2SINGLE(fp64toselectable(diff))  # FP32 round
            diff = float(diff)
            u = diff * c
            tc = DOUBLE2SINGLE(fp64toselectable(t))  # convert to Power single
            uc = DOUBLE2SINGLE(fp64toselectable(u))  # from double
            # stored as (expected FPR i, expected FPR i+4)
            res.append((uc, tc))
            print("DCT", i, "in", a, b, "c", c, "res", t, u)

        # SVSTATE (in this case, VL=4)
        svstate = SVP64State()
        svstate.vl = 4  # VL
        svstate.maxvl = 4  # MAXVL
        print("SVSTATE", bin(svstate.asint()))

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            # show actual register contents next to the expected values
            for i, (exp_lo, exp_hi) in enumerate(res):
                a = float(sim.fpr(i+0))
                b = float(sim.fpr(i+4))
                print("DCT", i, "in", a, b,
                      "res", float(exp_lo), float(exp_hi))
            # confirm that the results are as expected (exact match)
            for i, (exp_lo, exp_hi) in enumerate(res):
                self.assertEqual(sim.fpr(i+0), exp_lo)
                self.assertEqual(sim.fpr(i+4), exp_hi)

    def test_sv_remap_fpmadds_dct_inner_4(self):
        """4-long in-place DCT inner butterfly under REMAP.

        >>> lst = ["svshape 4, 1, 1, 2, 0",
                   "svremap 27, 1, 0, 2, 0, 1, 0",
                   "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                  ]

        runs a full in-place 4-long O(N log2 N) inner butterfly schedule
        for DCT

        SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
        (3 inputs, 2 outputs)

        Note that the coefficient (FRC) is not on a "schedule", it
        is straight Vectorised (0123...) because DCT coefficients
        cannot be shared between butterfly layers (due to +0.5)
        """
        lst = SVP64Asm(["svshape 4, 1, 1, 2, 0",
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -0.8, 2.0, -2.3]
        coe = [-0.25, 0.5, 3.1, 6.2]  # 4 coefficients

        # data goes into the regfile half-swapped and bit-reversed
        av = self._dct_load_order(avi)

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(coe):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # work out the results with the twin mul/add-sub
            res = transform_inner_radix2(avi, coe)

            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            self._assert_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_outer_8(self):
        """8-long in-place DCT outer butterfly under REMAP.

        >>> lst = ["svshape 8, 1, 1, 3, 0",
                   "svremap 27, 1, 0, 2, 0, 1, 0",
                   "sv.fadds 0.v, 0.v, 0.v"
                  ]

        runs a full in-place 8-long O(N log2 N) outer butterfly schedule
        for DCT, does the iterative overlapped ADDs

        SVP64 "REMAP" in Butterfly Mode.
        """
        lst = SVP64Asm(["svshape 8, 1, 1, 3, 0",
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fadds 0.v, 0.v, 0.v"
                        ])
        lst = list(lst)

        # array to test
        av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # outer iterative sum (modifies av in place; the register
            # image in fprs has already been built from it above)
            res = transform_outer_radix2(av)

            # approximate error calculation, good enough test
            self._assert_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_8(self):
        """8-long in-place DCT: inner butterfly followed by outer butterfly.

        >>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
                   "svshape 8, 1, 1, 2, 0",
                   "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                   "svshape 8, 1, 1, 3, 0",
                   "sv.fadds 0.v, 0.v, 0.v"
                  ]

        runs the inner butterfly schedule (twin +/- FMAC) and then the
        outer butterfly schedule (iterative overlapped ADDs), and checks
        the result against transform2.

        SVP64 "REMAP" in Butterfly Mode.
        """
        lst = SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
                        "svshape 8, 1, 1, 2, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                        "svshape 8, 1, 1, 3, 0",
                        "sv.fadds 0.v, 0.v, 0.v"
                        ])
        lst = list(lst)

        # array to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
        n = len(avi)

        # data goes into the regfile half-swapped and bit-reversed
        av = self._dct_load_order(avi)

        # coefficient table: for each layer (size = n, n/2, ... 2) the
        # half-size run of 2*cos((ci+0.5)*pi/size) is repeated n/size times
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            for _ in range(n // size):
                for ci in range(halfsize):
                    ctable.append(
                        math.cos((ci + 0.5) * math.pi / size) * 2.0)
            size //= 2

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # reference result
            res = transform2(avi)

            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            self._assert_fprs_close(sim, res, 1e-5)

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None,
                        initial_mem=None,
                        initial_fprs=None):
        """Run *prog* through run_tst, dump GPRs and FPRs, return the sim.

        initial_regs defaults to 32 zeros when not given; the remaining
        arguments are passed through to run_tst unchanged.
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        simulator = run_tst(prog, initial_regs, mem=initial_mem,
                            initial_fprs=initial_fprs,
                            svstate=svstate)

        print("GPRs")
        simulator.gpr.dump()
        print("FPRs")
        simulator.fpr.dump()

        return simulator
403
404
# run every test in this module when the file is executed as a script
if __name__ == "__main__":
    unittest.main()