comments
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.power_decoder import (create_pdecode)
5 from openpower.simulator.program import Program
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.selectable_int import SelectableInt
8 from openpower.decoder.isa.test_caller import run_tst
9 from openpower.sv.trans.svp64 import SVP64Asm
10 from copy import deepcopy
11 from openpower.decoder.helpers import fp64toselectable, SINGLE
12 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
13 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
14 iterate_dct_inner_butterfly_indices,
15 iterate_dct_outer_butterfly_indices,
16 transform2)
17 import unittest
18 import math
19
20
def transform_inner_radix2(vec, ctable):
    """Software reference for the DCT inner butterfly.

    A working copy of ``vec`` is made in half-swapped, bit-reversed order
    and then updated pairwise: for the k-th index pair (jl, jh) produced by
    the two inner-butterfly index iterators, element jl becomes the sum of
    the pair and element jh becomes the difference multiplied by
    ``1.0 / ctable[k]``.

    vec:    input samples (length assumed to be a power of two -- TODO
            confirm against reverse_bits/halfrev2)
    ctable: one coefficient per butterfly step, indexed in iteration order

    Returns the re-ordered, transformed list.
    """

    class SVSHAPE:
        # plain attribute holder handed to the index iterators
        pass

    def make_shape(skip):
        # both schedules are identical apart from "skip"
        shape = SVSHAPE()
        shape.lims = [count, 0, 0]      # xdim, ydim, zdim
        shape.order = [0, 1, 2]         # experiment with permutations here
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0                # experiment with offsets here
        shape.invxyz = [1, 0, 0]        # inversion if desired
        return shape

    # Initialization
    count = len(vec)
    print()
    print("transform2", count)
    nbits = count.bit_length() - 1

    # the in-place data is referenced (read/write) in *reverse-bit-order*
    positions = range(count)
    bitrev = [positions[reverse_bits(pos, nbits)] for pos in positions]

    # and pretend the data was LDed in half-swapped *and* bit-reversed
    # order as well
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    vec = [swapped[bitrev[pos]] for pos in positions]

    ################
    # INNER butterfly
    ################
    shape_j = make_shape(0b00)          # j schedule
    shape_jh = make_shape(0b01)         # j+halfstep schedule

    # walk both index streams in lock-step
    low_stream = iterate_dct_inner_butterfly_indices(shape_j)
    high_stream = iterate_dct_inner_butterfly_indices(shape_jh)
    for k, (low, high) in enumerate(zip(low_stream, high_stream)):
        jl, jle = low
        jh, jhe = high
        t1 = vec[jl]
        t2 = vec[jh]
        coeff = ctable[k]
        vec[jl] = t1 + t2
        vec[jh] = (t1 - t2) * (1.0/coeff)
        print("coeff", "ci", k,
              "jl", jl, "jh", jh,
              "i/n", (k+0.5), 1.0/coeff,
              "t1, t2", t1, t2, "res", vec[jl], vec[jh],
              "end", bin(jle), bin(jhe))
        if jle == 0b111:  # all loops end
            break

    return vec
84
def transform_outer_radix2(vec):
    """Software reference for the DCT outer butterfly (iterative sums).

    For every index pair (jl, jh) produced by the two outer-butterfly
    index iterators, ``vec[jh]`` is added into ``vec[jl]``.

    NOTE: ``vec`` is modified *in place*; the same list object is also
    returned for convenience.

    vec: list of samples to accumulate over

    Returns ``vec`` after all overlapping additions have been applied.
    """

    class SVSHAPE:
        # plain attribute holder handed to the index iterators
        pass

    def make_shape(skip):
        # both schedules are identical apart from "skip"
        shape = SVSHAPE()
        shape.lims = [n, 0, 0]          # xdim, ydim, zdim
        shape.submode2 = 0b100
        shape.mode = 0b01
        shape.skip = skip
        shape.offset = 0                # experiment with offsets here
        shape.invxyz = [0, 0, 0]        # inversion if desired
        return shape

    # Initialization
    n = len(vec)
    print()
    print("transform2", n)

    # outer butterfly
    shape_j = make_shape(0b00)          # j schedule
    shape_jh = make_shape(0b01)         # j+halfstep schedule

    # walk both index streams in lock-step, getting new indices
    i0 = iterate_dct_outer_butterfly_indices(shape_j)
    i1 = iterate_dct_outer_butterfly_indices(shape_jh)
    for (jl, jle), (jh, jhe) in zip(i0, i1):
        print("itersum    jr", jl, jh,
              "end", bin(jle), bin(jhe))
        vec[jl] += vec[jh]
        if jle == 0b111:  # all loops end
            break

    print("transform2 result", vec)

    return vec
130
131
class DCTTestCase(FHDLTestCase):
    """Simulator tests for the SVP64 DCT twin-butterfly and REMAP schedules.

    Each test assembles a short SVP64 program, pre-loads the floating-point
    registers, runs the program through run_tst, and compares the resulting
    FPRs against a software reference computed in Python.
    """

    def _check_regs(self, sim, expected):
        """Assert that all 32 GPRs of sim equal expected[i] as 64-bit values."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    @staticmethod
    def _halfswap_bitreverse(vec):
        """Return a copy of vec in half-swapped, bit-reversed order.

        This is the order in which the tests place the input data in the
        register file before running an inner butterfly.
        """
        n = len(vec)
        levels = n.bit_length() - 1
        ri = list(range(n))
        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
        swapped = halfrev2(vec, False)
        return [swapped[ri[i]] for i in range(n)]

    @staticmethod
    def _print_svshapes(sim):
        """Dump the four SVSHAPE SPRs of sim (debug aid only)."""
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

    def _assert_fprs_close(self, sim, res, tolerance):
        """Assert FPR[i] of sim is within relative ``tolerance`` of res[i].

        sim:       simulator whose FPRs 0..len(res)-1 hold the results
        res:       reference values as Python floats
        tolerance: maximum permitted relative error
        """
        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = DOUBLE2SINGLE(fp64toselectable(expected))
            expected = float(expected)
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different.
            # an exactly-zero reference has no meaningful relative error:
            # fall back to the absolute error instead of dividing by zero
            scale = abs(expected) or 1.0
            err = abs(actual - expected) / scale
            print("err", i, err)
            self.assertLess(err, tolerance)

    def test_sv_ffadds_dct(self):
        """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                     ]
            four in-place vector adds, four in-place vector mul-subs

            SVP64 "DCT" mode will *automatically* offset FRB and an implicit
            FRS to perform the two multiplies.  one add, one subtract.

            sv.fdmadds FRT, FRA, FRC, FRB actually does:
                fadds  FRT   , FRB, FRA
                fsubs  FRT+vl, FRA, FRB+vl
        """
        lst = list(SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"]))

        # cheat here with these values, they're selected so that
        # rounding errors do not occur. sigh.
        fprs = [0] * 32
        av = [7.0, -0.8, 2.0, -2.3]    # first half of array 0..3
        bv = [-2.0, 2.0, -0.8, 1.4]    # second half of array 4..7
        cv = [-1.0, 0.5, 2.5, -0.25]   # coefficients
        res = []
        # work out the results with the twin add-sub
        for i, (a, b, c) in enumerate(zip(av, bv, cv)):
            fprs[i+0] = fp64toselectable(a)
            fprs[i+4] = fp64toselectable(b)
            fprs[i+8] = fp64toselectable(c)
            # this isn't quite a perfect replication of the
            # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
            # and FPSUB32 directly to be honest.
            t = a + b
            diff = (a - b)
            diff = DOUBLE2SINGLE(fp64toselectable(diff))  # FP32 round
            diff = float(diff)
            u = diff * c
            tc = DOUBLE2SINGLE(fp64toselectable(t))  # convert to Power single
            uc = DOUBLE2SINGLE(fp64toselectable(u))  # from double
            # stored as (mul-sub result, sum result): the checks below
            # compare the first against FPR[i] and the second against
            # FPR[i+4]
            res.append((uc, tc))
            print("DCT", i, "in", a, b, "c", c, "res", t, u)

        # SVSTATE (in this case, VL=4)
        svstate = SVP64State()
        svstate.vl = 4      # VL
        svstate.maxvl = 4   # MAXVL
        print("SVSTATE", bin(svstate.asint()))

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            # confirm that the results are as expected
            for i, (first, second) in enumerate(res):
                a = float(sim.fpr(i+0))
                b = float(sim.fpr(i+4))
                print("DCT", i, "in", a, b, "res",
                      float(first), float(second))
            for i, (first, second) in enumerate(res):
                self.assertEqual(sim.fpr(i+0), first)
                self.assertEqual(sim.fpr(i+4), second)

    def test_sv_remap_fpmadds_dct_inner_4(self):
        """>>> lst = ["svshape 4, 1, 1, 2, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                     ]
            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
            for DCT

            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
            (3 inputs, 2 outputs)

            Note that the coefficient (FRC) is not on a "schedule", it
            is straight Vectorised (0123...) because DCT coefficients
            cannot be shared between butterfly layers (due to +0.5)
        """
        lst = list(SVP64Asm(["svshape 4, 1, 1, 2, 0",
                             "svremap 27, 1, 0, 2, 0, 1, 0",
                             "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                             ]))

        # array and coefficients to test
        avi = [7.0, -0.8, 2.0, -2.3]     # input samples
        coe = [-0.25, 0.5, 3.1, 6.2]     # 4 coefficients

        # the register file gets the data in half-swapped, bit-reversed order
        av = self._halfswap_bitreverse(avi)

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(coe):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._print_svshapes(sim)

            # work out the results with the twin mul/add-sub
            res = transform_inner_radix2(avi, coe)

            self._assert_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_outer_8(self):
        """>>> lst = ["svshape 8, 1, 1, 3, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                     ]
            runs a full in-place 8-long O(N log2 N) outer butterfly schedule
            for DCT, does the iterative overlapped ADDs

            SVP64 "REMAP" in Butterfly Mode.
        """
        lst = list(SVP64Asm(["svshape 8, 1, 1, 3, 0",
                             "svremap 27, 1, 0, 2, 0, 1, 0",
                             "sv.fadds 0.v, 0.v, 0.v"
                             ]))

        # array to test
        av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._print_svshapes(sim)

            # outer iterative sum.  this updates av in place, which is
            # harmless here: the register file was loaded from it earlier
            res = transform_outer_radix2(av)

            self._assert_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_8(self):
        """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
                     "svshape 8, 1, 1, 2, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                     "svshape 8, 1, 1, 3, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                     ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules.
        """
        lst = list(SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
                             "svshape 8, 1, 1, 2, 0",
                             "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                             "svshape 8, 1, 1, 3, 0",
                             "sv.fadds 0.v, 0.v, 0.v"
                             ]))

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
        n = len(avi)
        av = self._halfswap_bitreverse(avi)

        # one coefficient per inner-butterfly step, layer by layer:
        # 2*cos((ci+0.5)*pi/size) for each block of each layer
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            for _ in range(n//size):
                for ci in range(halfsize):
                    ctable.append(
                        (math.cos((ci + 0.5) * math.pi / size) * 2.0))
            size //= 2

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._print_svshapes(sim)

            # reference result from the imported transform2
            res = transform2(avi)

            self._assert_fprs_close(sim, res, 1e-5)

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None,
                        initial_mem=None,
                        initial_fprs=None):
        """Run prog through run_tst, dump GPRs and FPRs, return the simulator.

        prog:         program to execute
        initial_regs: initial GPR values (defaults to 32 zeros)
        svstate:      optional initial SVSTATE, passed through to run_tst
        initial_mem:  optional initial memory, passed through as ``mem``
        initial_fprs: optional initial FPR values, passed through
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        simulator = run_tst(prog, initial_regs, mem=initial_mem,
                            initial_fprs=initial_fprs,
                            svstate=svstate)

        print("GPRs")
        simulator.gpr.dump()
        print("FPRs")
        simulator.fpr.dump()

        return simulator
402
403
if __name__ == "__main__":
    # executed as a script: hand over to the unittest command-line runner
    unittest.main()