add inverse DCT in-place unit test with bit-reversed half-swap LD
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.power_decoder import (create_pdecode)
5 from openpower.simulator.program import Program
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.selectable_int import SelectableInt
8 from openpower.decoder.isa.test_caller import run_tst
9 from openpower.sv.trans.svp64 import SVP64Asm
10 from copy import deepcopy
11 from openpower.decoder.helpers import fp64toselectable, SINGLE
12 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
13 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
14 iterate_dct_inner_butterfly_indices,
15 iterate_dct_outer_butterfly_indices,
16 transform2, inverse_transform2)
17 from openpower.decoder.isa.fastdctlee import inverse_transform_iter
18 import unittest
19 import math
20
21
22 def transform_inner_radix2_dct(vec, ctable):
23
24 # Initialization
25 n = len(vec)
26 print ()
27 print ("transform2", n)
28 levels = n.bit_length() - 1
29
30 # reference (read/write) the in-place data in *reverse-bit-order*
31 ri = list(range(n))
32 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
33
34 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
35 # TODO: merge these two
36 vec = halfrev2(vec, False)
37 vec = [vec[ri[i]] for i in range(n)]
38
39 ################
40 # INNER butterfly
41 ################
42 xdim = n
43 ydim = 0
44 zdim = 0
45
46 # set up an SVSHAPE
47 class SVSHAPE:
48 pass
49 # j schedule
50 SVSHAPE0 = SVSHAPE()
51 SVSHAPE0.lims = [xdim, 2, zdim]
52 SVSHAPE0.mode = 0b01
53 SVSHAPE0.submode2 = 0b01
54 SVSHAPE0.skip = 0b00
55 SVSHAPE0.offset = 0 # experiment with different offset, here
56 SVSHAPE0.invxyz = [1,0,0] # inversion if desired
57 # j+halfstep schedule
58 SVSHAPE1 = SVSHAPE()
59 SVSHAPE1.lims = [xdim, 2, zdim]
60 SVSHAPE1.mode = 0b01
61 SVSHAPE1.submode2 = 0b01
62 SVSHAPE1.skip = 0b01
63 SVSHAPE1.offset = 0 # experiment with different offset, here
64 SVSHAPE1.invxyz = [1,0,0] # inversion if desired
65
66 # enumerate over the iterator function, getting new indices
67 i0 = iterate_dct_inner_butterfly_indices(SVSHAPE0)
68 i1 = iterate_dct_inner_butterfly_indices(SVSHAPE1)
69 for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
70 t1, t2 = vec[jl], vec[jh]
71 coeff = ctable[k]
72 vec[jl] = t1 + t2
73 vec[jh] = (t1 - t2) * (1.0/coeff)
74 print ("coeff", "ci", k,
75 "jl", jl, "jh", jh,
76 "i/n", (k+0.5), 1.0/coeff,
77 "t1, t2", t1, t2, "res", vec[jl], vec[jh],
78 "end", bin(jle), bin(jhe))
79 if jle == 0b111: # all loops end
80 break
81
82 return vec
83
84
85 def transform_outer_radix2_dct(vec):
86
87 # Initialization
88 n = len(vec)
89 print ()
90 print ("transform2", n)
91 levels = n.bit_length() - 1
92
93 # outer butterfly
94 xdim = n
95 ydim = 0
96 zdim = 0
97
98 # j schedule
99 class SVSHAPE:
100 pass
101 SVSHAPE0 = SVSHAPE()
102 SVSHAPE0.lims = [xdim, 3, zdim]
103 SVSHAPE0.submode2 = 0b100
104 SVSHAPE0.mode = 0b01
105 SVSHAPE0.skip = 0b00
106 SVSHAPE0.offset = 0 # experiment with different offset, here
107 SVSHAPE0.invxyz = [0,0,0] # inversion if desired
108 # j+halfstep schedule
109 SVSHAPE1 = SVSHAPE()
110 SVSHAPE1.lims = [xdim, 3, zdim]
111 SVSHAPE1.mode = 0b01
112 SVSHAPE1.submode2 = 0b100
113 SVSHAPE1.skip = 0b01
114 SVSHAPE1.offset = 0 # experiment with different offset, here
115 SVSHAPE1.invxyz = [0,0,0] # inversion if desired
116
117 # enumerate over the iterator function, getting new indices
118 i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
119 i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
120 for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
121 print ("itersum jr", jl, jh,
122 "end", bin(jle), bin(jhe))
123 vec[jl] += vec[jh]
124 if jle == 0b111: # all loops end
125 break
126
127 print("transform2 result", vec)
128
129 return vec
130
131
132 def transform_inner_radix2_idct(vec, ctable):
133
134 # Initialization
135 n = len(vec)
136 print ()
137 print ("transform2", n)
138 levels = n.bit_length() - 1
139
140 # pretend we LDed data in half-swapped order
141 vec = halfrev2(vec, False)
142
143 ################
144 # INNER butterfly
145 ################
146 xdim = n
147 ydim = 0
148 zdim = 0
149
150 # set up an SVSHAPE
151 class SVSHAPE:
152 pass
153 # j schedule
154 SVSHAPE0 = SVSHAPE()
155 SVSHAPE0.lims = [xdim, 0b000001, 0]
156 SVSHAPE0.mode = 0b11
157 SVSHAPE0.submode2 = 0b11
158 SVSHAPE0.skip = 0b00
159 SVSHAPE0.offset = 0 # experiment with different offset, here
160 SVSHAPE0.invxyz = [0,0,0] # inversion if desired
161 # j+halfstep schedule
162 SVSHAPE1 = SVSHAPE()
163 SVSHAPE1.lims = [xdim, 0b000001, 0]
164 SVSHAPE1.mode = 0b11
165 SVSHAPE1.submode2 = 0b11
166 SVSHAPE1.skip = 0b01
167 SVSHAPE1.offset = 0 # experiment with different offset, here
168 SVSHAPE1.invxyz = [0,0,0] # inversion if desired
169
170 # enumerate over the iterator function, getting new indices
171 i0 = iterate_dct_inner_butterfly_indices(SVSHAPE0)
172 i1 = iterate_dct_inner_butterfly_indices(SVSHAPE1)
173 for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
174 t1, t2 = vec[jl], vec[jh]
175 coeff = ctable[k]
176 vec[jl] = t1 + t2/coeff
177 vec[jh] = t1 - t2/coeff
178 print ("coeff", "ci", k,
179 "jl", jl, "jh", jh,
180 "i/n", (k+0.5), 1.0/coeff,
181 "t1, t2", t1, t2, "res", vec[jl], vec[jh],
182 "end", bin(jle), bin(jhe))
183 if jle == 0b111: # all loops end
184 break
185
186 return vec
187
188
189 def transform_outer_radix2_idct(vec):
190
191 # Initialization
192 n = len(vec)
193 print ()
194 print ("transform2-inv", n)
195 levels = n.bit_length() - 1
196
197 # outer butterfly
198 xdim = n
199 ydim = 0
200 zdim = 0
201
202 # reference (read/write) the in-place data in *reverse-bit-order*
203 ri = list(range(n))
204 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
205
206 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
207 # TODO: merge these two
208 vec = [vec[ri[i]] for i in range(n)]
209 vec = halfrev2(vec, True)
210
211 # j schedule
212 class SVSHAPE:
213 pass
214 SVSHAPE0 = SVSHAPE()
215 SVSHAPE0.lims = [xdim, 2, zdim]
216 SVSHAPE0.submode2 = 0b011
217 SVSHAPE0.mode = 0b11
218 SVSHAPE0.skip = 0b00
219 SVSHAPE0.offset = 0 # experiment with different offset, here
220 SVSHAPE0.invxyz = [1,0,1] # inversion if desired
221 # j+halfstep schedule
222 SVSHAPE1 = SVSHAPE()
223 SVSHAPE1.lims = [xdim, 2, zdim]
224 SVSHAPE1.mode = 0b11
225 SVSHAPE1.submode2 = 0b011
226 SVSHAPE1.skip = 0b01
227 SVSHAPE1.offset = 0 # experiment with different offset, here
228 SVSHAPE1.invxyz = [1,0,1] # inversion if desired
229
230 # enumerate over the iterator function, getting new indices
231 i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
232 i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
233 for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
234 print ("itersum jr", jl, jh,
235 "end", bin(jle), bin(jhe))
236 vec[jh] += vec[jl]
237 if jle == 0b111: # all loops end
238 break
239
240 print("transform2-inv result", vec)
241
242 return vec
243
244
245 class DCTTestCase(FHDLTestCase):
246
247 def _check_regs(self, sim, expected):
248 for i in range(32):
249 self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
250
251 def test_sv_ffadds_dct(self):
252 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
253 ]
254 four in-place vector adds, four in-place vector mul-subs
255
256 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
257 FRS to perform the two multiplies. one add, one subtract.
258
259 sv.fdadds FRT, FRA, FRC, FRB actually does:
260 fadds FRT , FRB, FRA
261 fsubs FRT+vl, FRA, FRB+vl
262 """
263 lst = SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
264 ])
265 lst = list(lst)
266
267 # cheat here with these values, they're selected so that
268 # rounding errors do not occur. sigh.
269 fprs = [0] * 32
270 av = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
271 bv = [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
272 cv = [-1.0, 0.5, 2.5, -0.25] # coefficients
273 res = []
274 # work out the results with the twin add-sub
275 for i, (a, b, c) in enumerate(zip(av, bv, cv)):
276 fprs[i+0] = fp64toselectable(a)
277 fprs[i+4] = fp64toselectable(b)
278 fprs[i+8] = fp64toselectable(c)
279 # this isn't quite a perfect replication of the
280 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
281 # and FPSUB32 directly to be honest.
282 t = a + b
283 diff = (a - b)
284 diff = DOUBLE2SINGLE(fp64toselectable(diff)) # FP32 round
285 diff = float(diff)
286 u = diff * c
287 tc = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
288 uc = DOUBLE2SINGLE(fp64toselectable(u)) # from double
289 res.append((uc, tc))
290 print ("DCT", i, "in", a, b, "c", c, "res", t, u)
291
292 # SVSTATE (in this case, VL=2)
293 svstate = SVP64State()
294 svstate.vl = 4 # VL
295 svstate.maxvl = 4 # MAXVL
296 print ("SVSTATE", bin(svstate.asint()))
297
298 with Program(lst, bigendian=False) as program:
299 sim = self.run_tst_program(program, svstate=svstate,
300 initial_fprs=fprs)
301 # confirm that the results are as expected
302 for i, (t, u) in enumerate(res):
303 a = float(sim.fpr(i+0))
304 b = float(sim.fpr(i+4))
305 t = float(t)
306 u = float(u)
307 print ("DCT", i, "in", a, b, "res", t, u)
308 for i, (t, u) in enumerate(res):
309 self.assertEqual(sim.fpr(i+0), t)
310 self.assertEqual(sim.fpr(i+4), u)
311
312 def test_sv_remap_fpmadds_dct_inner_4(self):
313 """>>> lst = ["svshape 4, 1, 1, 2, 0",
314 "svremap 27, 1, 0, 2, 0, 1, 0",
315 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
316 ]
317 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
318 for DCT
319
320 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
321 (3 inputs, 2 outputs)
322
323 Note that the coefficient (FRC) is not on a "schedule", it
324 is straight Vectorised (0123...) because DCT coefficients
325 cannot be shared between butterfly layers (due to +0.5)
326 """
327 lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0",
328 "svremap 27, 1, 0, 2, 0, 1, 0",
329 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
330 ])
331 lst = list(lst)
332
333 # array and coefficients to test
334 n = 4
335 av = [7.0, -9.8, 3.0, -32.3]
336 coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
337
338 levels = n.bit_length() - 1
339 ri = list(range(n))
340 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
341 avi = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
342 av = halfrev2(avi, False)
343 av = [av[ri[i]] for i in range(n)]
344
345 # store in regfile
346 fprs = [0] * 32
347 for i, c in enumerate(coe):
348 fprs[i+8] = fp64toselectable(1.0 / c) # invert
349 for i, a in enumerate(av):
350 fprs[i+0] = fp64toselectable(a)
351
352 with Program(lst, bigendian=False) as program:
353 sim = self.run_tst_program(program, initial_fprs=fprs)
354 print ("spr svshape0", sim.spr['SVSHAPE0'])
355 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
356 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
357 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
358 print ("spr svshape1", sim.spr['SVSHAPE1'])
359 print ("spr svshape2", sim.spr['SVSHAPE2'])
360 print ("spr svshape3", sim.spr['SVSHAPE3'])
361
362 # work out the results with the twin mul/add-sub
363 res = transform_inner_radix2_dct(avi, coe)
364
365 for i, expected in enumerate(res):
366 print ("i", i, float(sim.fpr(i)), "expected", expected)
367 for i, expected in enumerate(res):
368 # convert to Power single
369 expected = DOUBLE2SINGLE(fp64toselectable(expected))
370 expected = float(expected)
371 actual = float(sim.fpr(i))
372 # approximate error calculation, good enough test
373 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
374 # and the rounding is different
375 err = abs((actual - expected) / expected)
376 print ("err", i, err)
377 self.assertTrue(err < 1e-6)
378
379 def test_sv_remap_fpmadds_idct_inner_4(self):
380 """>>> lst = ["svshape 4, 1, 1, 10, 0",
381 "svremap 27, 0, 1, 2, 1, 0, 0",
382 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
383 ]
384 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
385 for inverse-DCT
386
387 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
388 (3 inputs, 2 outputs)
389
390 Note that the coefficient (FRC) is not on a "schedule", it
391 is straight Vectorised (0123...) because DCT coefficients
392 cannot be shared between butterfly layers (due to +0.5)
393 """
394 lst = SVP64Asm( ["svshape 4, 1, 1, 10, 0",
395 "svremap 27, 0, 1, 2, 1, 0, 0",
396 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
397 ])
398 lst = list(lst)
399
400 # array and coefficients to test
401 n = 4
402 levels = n.bit_length() - 1
403 coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
404 avi = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
405 av = halfrev2(avi, False)
406
407 # store in regfile
408 fprs = [0] * 32
409 for i, c in enumerate(coe):
410 fprs[i+8] = fp64toselectable(1.0 / c) # invert
411 for i, a in enumerate(av):
412 fprs[i+0] = fp64toselectable(a)
413
414 with Program(lst, bigendian=False) as program:
415 sim = self.run_tst_program(program, initial_fprs=fprs)
416 print ("spr svshape0", sim.spr['SVSHAPE0'])
417 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
418 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
419 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
420 print ("spr svshape1", sim.spr['SVSHAPE1'])
421 print ("spr svshape2", sim.spr['SVSHAPE2'])
422 print ("spr svshape3", sim.spr['SVSHAPE3'])
423
424 # work out the results with the twin mul/add-sub
425 res = transform_inner_radix2_idct(avi, coe)
426
427 for i, expected in enumerate(res):
428 print ("i", i, float(sim.fpr(i)), "expected", expected)
429 for i, expected in enumerate(res):
430 # convert to Power single
431 expected = DOUBLE2SINGLE(fp64toselectable(expected))
432 expected = float(expected)
433 actual = float(sim.fpr(i))
434 # approximate error calculation, good enough test
435 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
436 # and the rounding is different
437 err = abs((actual - expected) / expected)
438 print ("err", i, err)
439 self.assertTrue(err < 1e-6)
440
441 def test_sv_remap_fpmadds_idct_outer_8(self):
442 """>>> lst = ["svshape 8, 1, 1, 11, 0",
443 "svremap 27, 0, 1, 2, 1, 0, 0",
444 "sv.fadds 0.v, 0.v, 0.v"
445 ]
446 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
447 for inverse-DCT, does the iterative overlapped ADDs
448
449 SVP64 "REMAP" in Butterfly Mode.
450 """
451 lst = SVP64Asm( ["svshape 8, 1, 1, 11, 0", # outer butterfly
452 "svremap 27, 0, 1, 2, 1, 0, 0",
453 "sv.fadds 0.v, 0.v, 0.v"
454 ])
455 lst = list(lst)
456
457 # array and coefficients to test
458 avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
459
460 n = len(avi)
461 levels = n.bit_length() - 1
462 ri = list(range(n))
463 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
464 av = [avi[ri[i]] for i in range(n)]
465 av = halfrev2(av, True)
466
467 # store in regfile
468 fprs = [0] * 32
469 for i, a in enumerate(av):
470 fprs[i+0] = fp64toselectable(a)
471
472 with Program(lst, bigendian=False) as program:
473 sim = self.run_tst_program(program, initial_fprs=fprs)
474 print ("spr svshape0", sim.spr['SVSHAPE0'])
475 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
476 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
477 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
478 print ("spr svshape1", sim.spr['SVSHAPE1'])
479 print ("spr svshape2", sim.spr['SVSHAPE2'])
480 print ("spr svshape3", sim.spr['SVSHAPE3'])
481
482 # outer iterative sum
483 res = transform_outer_radix2_idct(avi)
484
485 for i, expected in enumerate(res):
486 print ("i", i, float(sim.fpr(i)), "expected", expected)
487 for i, expected in enumerate(res):
488 # convert to Power single
489 expected = DOUBLE2SINGLE(fp64toselectable(expected))
490 expected = float(expected)
491 actual = float(sim.fpr(i))
492 # approximate error calculation, good enough test
493 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
494 # and the rounding is different
495 err = abs((actual - expected) / expected)
496 print ("err", i, err)
497 self.assertTrue(err < 1e-6)
498
499 def test_sv_remap_fpmadds_dct_outer_8(self):
500 """>>> lst = ["svshape 8, 1, 1, 3, 0",
501 "svremap 27, 1, 0, 2, 0, 1, 0",
502 "sv.fadds 0.v, 0.v, 0.v"
503 ]
504 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
505 for DCT, does the iterative overlapped ADDs
506
507 SVP64 "REMAP" in Butterfly Mode.
508 """
509 lst = SVP64Asm( ["svshape 8, 1, 1, 3, 0",
510 "svremap 27, 1, 0, 2, 0, 1, 0",
511 "sv.fadds 0.v, 0.v, 0.v"
512 ])
513 lst = list(lst)
514
515 # array and coefficients to test
516 av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
517
518 # store in regfile
519 fprs = [0] * 32
520 for i, a in enumerate(av):
521 fprs[i+0] = fp64toselectable(a)
522
523 with Program(lst, bigendian=False) as program:
524 sim = self.run_tst_program(program, initial_fprs=fprs)
525 print ("spr svshape0", sim.spr['SVSHAPE0'])
526 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
527 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
528 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
529 print ("spr svshape1", sim.spr['SVSHAPE1'])
530 print ("spr svshape2", sim.spr['SVSHAPE2'])
531 print ("spr svshape3", sim.spr['SVSHAPE3'])
532
533 # outer iterative sum
534 res = transform_outer_radix2_dct(av)
535
536 for i, expected in enumerate(res):
537 print ("i", i, float(sim.fpr(i)), "expected", expected)
538 for i, expected in enumerate(res):
539 # convert to Power single
540 expected = DOUBLE2SINGLE(fp64toselectable(expected))
541 expected = float(expected)
542 actual = float(sim.fpr(i))
543 # approximate error calculation, good enough test
544 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
545 # and the rounding is different
546 err = abs((actual - expected) / expected)
547 print ("err", i, err)
548 self.assertTrue(err < 1e-6)
549
550 def test_sv_remap_fpmadds_idct_8(self):
551 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
552 "svshape 8, 1, 1, 11, 0",
553 "sv.fadds 0.v, 0.v, 0.v",
554 "svshape 8, 1, 1, 10, 0",
555 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
556 ]
557 runs a full in-place 8-long O(N log2 N) inverse-DCT, both
558 inner and outer butterfly "REMAP" schedules.
559 """
560 lst = SVP64Asm( ["svremap 27, 0, 1, 2, 1, 0, 1",
561 "svshape 8, 1, 1, 11, 0",
562 "sv.fadds 0.v, 0.v, 0.v",
563 "svshape 8, 1, 1, 10, 0",
564 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
565 ])
566 lst = list(lst)
567
568 # array and coefficients to test
569 avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
570 n = len(avi)
571 levels = n.bit_length() - 1
572 ri = list(range(n))
573 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
574 av = [avi[ri[i]] for i in range(n)]
575 av = halfrev2(av, True)
576
577 # divide first value by 2.0, manually. rev and halfrev should
578 # not have moved it
579 av[0] /= 2.0
580 #avi[0] /= 2.0
581
582 print ("input data pre idct", av)
583
584 ctable = []
585 size = 2
586 while size <= n:
587 halfsize = size // 2
588 for i in range(n//size):
589 for ci in range(halfsize):
590 ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
591 size *= 2
592
593 # store in regfile
594 fprs = [0] * 32
595 for i, a in enumerate(av):
596 fprs[i+0] = fp64toselectable(a)
597 for i, c in enumerate(ctable):
598 fprs[i+8] = fp64toselectable(1.0 / c) # invert
599
600 with Program(lst, bigendian=False) as program:
601 sim = self.run_tst_program(program, initial_fprs=fprs)
602 print ("spr svshape0", sim.spr['SVSHAPE0'])
603 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
604 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
605 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
606 print ("spr svshape1", sim.spr['SVSHAPE1'])
607 print ("spr svshape2", sim.spr['SVSHAPE2'])
608 print ("spr svshape3", sim.spr['SVSHAPE3'])
609
610 # inverse DCT
611 expected = [-15.793373940443367, 27.46969091937703,
612 -24.712331606496313, 27.03601462756265]
613
614 #res = inverse_transform_iter(avi)
615 res = inverse_transform2(avi)
616 #res = transform_outer_radix2_idct(avi)
617
618 for i, expected in enumerate(res):
619 print ("i", i, float(sim.fpr(i)), "expected", expected)
620 for i, expected in enumerate(res):
621 # convert to Power single
622 expected = DOUBLE2SINGLE(fp64toselectable(expected))
623 expected = float(expected)
624 actual = float(sim.fpr(i))
625 # approximate error calculation, good enough test
626 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
627 # and the rounding is different
628 err = abs((actual - expected) / expected)
629 print ("err", i, err)
630 self.assertTrue(err < 1e-5)
631
632 def test_sv_remap_fpmadds_dct_8(self):
633 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
634 "svshape 8, 1, 1, 2, 0",
635 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
636 "svshape 8, 1, 1, 3, 0",
637 "sv.fadds 0.v, 0.v, 0.v"
638 ]
639 runs a full in-place 8-long O(N log2 N) DCT, both
640 inner and outer butterfly "REMAP" schedules.
641 """
642 lst = SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
643 "svshape 8, 1, 1, 2, 0",
644 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
645 "svshape 8, 1, 1, 3, 0",
646 "sv.fadds 0.v, 0.v, 0.v"
647 ])
648 lst = list(lst)
649
650 # array and coefficients to test
651 avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
652 n = len(avi)
653 levels = n.bit_length() - 1
654 ri = list(range(n))
655 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
656 av = halfrev2(avi, False)
657 av = [av[ri[i]] for i in range(n)]
658 ctable = []
659 size = n
660 while size >= 2:
661 halfsize = size // 2
662 for i in range(n//size):
663 for ci in range(halfsize):
664 ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
665 size //= 2
666
667 # store in regfile
668 fprs = [0] * 32
669 for i, a in enumerate(av):
670 fprs[i+0] = fp64toselectable(a)
671 for i, c in enumerate(ctable):
672 fprs[i+8] = fp64toselectable(1.0 / c) # invert
673
674 with Program(lst, bigendian=False) as program:
675 sim = self.run_tst_program(program, initial_fprs=fprs)
676 print ("spr svshape0", sim.spr['SVSHAPE0'])
677 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
678 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
679 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
680 print ("spr svshape1", sim.spr['SVSHAPE1'])
681 print ("spr svshape2", sim.spr['SVSHAPE2'])
682 print ("spr svshape3", sim.spr['SVSHAPE3'])
683
684 # outer iterative sum
685 res = transform2(avi)
686
687 for i, expected in enumerate(res):
688 print ("i", i, float(sim.fpr(i)), "expected", expected)
689 for i, expected in enumerate(res):
690 # convert to Power single
691 expected = DOUBLE2SINGLE(fp64toselectable(expected))
692 expected = float(expected)
693 actual = float(sim.fpr(i))
694 # approximate error calculation, good enough test
695 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
696 # and the rounding is different
697 err = abs((actual - expected) / expected)
698 print ("err", i, err)
699 self.assertTrue(err < 1e-5)
700
701 def test_sv_remap_dct_cos_precompute_8(self):
702 """pre-computes a DCT COS table, deliberately using a lot of
703 registers so as to be able to see what is going on (dumping all
704 regs after the run).
705
706 the simpler (scalar) version is in test_caller_transcendentals.py
707 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
708 need the new version of fcfids which doesn't spam memory with
709 LD/STs.
710 """
711 lst = SVP64Asm(["svshape 8, 1, 1, 2, 0",
712 "svremap 0, 0, 0, 2, 0, 1, 1",
713 "sv.svstep 4.v, 4, 1", # svstep get vector of ci
714 "sv.svstep 16.v, 3, 1", # svstep get vector of step
715 "addi 1, 0, 0x0000",
716 "setvl 0, 0, 12, 0, 1, 1",
717 "sv.std 4.v, 0(1)",
718 "sv.lfd 64.v, 0(1)",
719 "sv.fcfids 48.v, 64.v",
720 "addi 1, 0, 0x0060",
721 "sv.std 16.v, 0(1)",
722 "sv.lfd 12.v, 0(1)",
723 "sv.fcfids 24.v, 12.v",
724 "sv.fadds 0.v, 24.v, 43", # plus 0.5
725 "sv.fmuls 0.v, 0.v, 41", # times PI
726 "sv.fdivs 0.v, 0.v, 48.v", # div size
727 "sv.fcoss 80.v, 0.v",
728 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
729 ])
730 lst = list(lst)
731
732 gprs = [0] * 32
733 fprs = [0] * 128
734 # constants
735 fprs[43] = fp64toselectable(0.5) # 0.5
736 fprs[41] = fp64toselectable(math.pi) # pi
737 fprs[44] = fp64toselectable(2.0) # 2.0
738
739 n = 8
740
741 ctable = []
742 size = n
743 while size >= 2:
744 halfsize = size // 2
745 for i in range(n//size):
746 for ci in range(halfsize):
747 ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
748 size //= 2
749
750 with Program(lst, bigendian=False) as program:
751 sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
752 print ("MEM")
753 sim.mem.dump()
754 print ("ci FP")
755 for i in range(len(ctable)):
756 actual = float(sim.fpr(i+24))
757 print ("i", i, actual)
758 print ("size FP")
759 for i in range(len(ctable)):
760 actual = float(sim.fpr(i+48))
761 print ("i", i, actual)
762 print ("temps")
763 for i in range(len(ctable)):
764 actual = float(sim.fpr(i))
765 print ("i", i, actual)
766 for i in range(len(ctable)):
767 expected = 1.0/ctable[i]
768 actual = float(sim.fpr(i+80))
769 err = abs((actual - expected) / expected)
770 print ("i", i, actual, "1/expect", 1/expected,
771 "expected", expected,
772 "err", err)
773 self.assertTrue(err < 1e-6)
774
775 def test_sv_remap_dct_cos_precompute_inner_8(self):
776 """pre-computes a DCT COS table, using the shorter costable
777 indices schedule. turns out, some COS values are repeated
778 in each layer of the DCT butterfly.
779
780 the simpler (scalar) version is in test_caller_transcendentals.py
781 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
782 need the new version of fcfids which doesn't spam memory with
783 LD/STs.
784 """
785 lst = SVP64Asm(["svshape 8, 1, 1, 5, 0",
786 "svremap 0, 0, 0, 2, 0, 1, 1",
787 "sv.svstep 4.v, 3, 1", # svstep get vector of ci
788 "sv.svstep 16.v, 2, 1", # svstep get vector of step
789 "addi 1, 0, 0x0000",
790 "setvl 0, 0, 7, 0, 1, 1",
791 "sv.std 4.v, 0(1)",
792 "sv.lfd 64.v, 0(1)",
793 "sv.fcfids 48.v, 64.v",
794 "addi 1, 0, 0x0060",
795 "sv.std 16.v, 0(1)",
796 "sv.lfd 12.v, 0(1)",
797 "sv.fcfids 24.v, 12.v",
798 "sv.fadds 0.v, 24.v, 43", # plus 0.5
799 "sv.fmuls 0.v, 0.v, 41", # times PI
800 "sv.fdivs 0.v, 0.v, 48.v", # div size
801 "sv.fcoss 80.v, 0.v",
802 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
803 ])
804 lst = list(lst)
805
806 gprs = [0] * 32
807 fprs = [0] * 128
808 # constants
809 fprs[43] = fp64toselectable(0.5) # 0.5
810 fprs[41] = fp64toselectable(math.pi) # pi
811 fprs[44] = fp64toselectable(2.0) # 2.0
812
813 n = 8
814
815 ctable = []
816 size = n
817 while size >= 2:
818 halfsize = size // 2
819 for ci in range(halfsize):
820 coeff = math.cos((ci + 0.5) * math.pi / size) * 2.0
821 ctable.append(coeff)
822 print ("coeff", "ci", ci, "size", size,
823 "i/n", (ci+0.5), 1.0/coeff)
824 size //= 2
825
826 with Program(lst, bigendian=False) as program:
827 sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
828 print ("MEM")
829 sim.mem.dump()
830 print ("ci FP")
831 for i in range(len(ctable)):
832 actual = float(sim.fpr(i+24))
833 print ("i", i, actual)
834 print ("size FP")
835 for i in range(len(ctable)):
836 actual = float(sim.fpr(i+48))
837 print ("i", i, actual)
838 print ("temps")
839 for i in range(len(ctable)):
840 actual = float(sim.fpr(i))
841 print ("i", i, actual)
842 for i in range(len(ctable)):
843 expected = 1.0/ctable[i]
844 actual = float(sim.fpr(i+80))
845 err = abs((actual - expected) / expected)
846 print ("i", i, actual, "1/expect", 1/expected,
847 "expected", expected,
848 "err", err)
849 self.assertTrue(err < 1e-6)
850
851 def test_sv_remap_fpmadds_dct_8_mode_4(self):
852 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
853 "svshape 8, 1, 1, 4, 0",
854 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
855 "svshape 8, 1, 1, 3, 0",
856 "sv.fadds 0.v, 0.v, 0.v"
857 ]
858 runs a full in-place 8-long O(N log2 N) DCT, both
859 inner and outer butterfly "REMAP" schedules.
860 uses shorter tables: FRC also needs to be on a Schedule
861 """
862 lst = SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
863 "svshape 8, 1, 1, 4, 0",
864 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
865 "svshape 8, 1, 1, 3, 0",
866 "sv.fadds 0.v, 0.v, 0.v"
867 ])
868 lst = list(lst)
869
870 # array and coefficients to test
871 avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
872 n = len(avi)
873 levels = n.bit_length() - 1
874 ri = list(range(n))
875 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
876 av = halfrev2(avi, False)
877 av = [av[ri[i]] for i in range(n)]
878 ctable = []
879 size = n
880 while size >= 2:
881 halfsize = size // 2
882 for ci in range(halfsize):
883 ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
884 size //= 2
885
886 # store in regfile
887 fprs = [0] * 32
888 for i, a in enumerate(av):
889 fprs[i+0] = fp64toselectable(a)
890 for i, c in enumerate(ctable):
891 fprs[i+8] = fp64toselectable(1.0 / c) # invert
892
893 with Program(lst, bigendian=False) as program:
894 sim = self.run_tst_program(program, initial_fprs=fprs)
895 print ("spr svshape0", sim.spr['SVSHAPE0'])
896 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
897 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
898 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
899 print ("spr svshape1", sim.spr['SVSHAPE1'])
900 print ("spr svshape2", sim.spr['SVSHAPE2'])
901 print ("spr svshape3", sim.spr['SVSHAPE3'])
902
903 # outer iterative sum
904 res = transform2(avi)
905
906 for i, expected in enumerate(res):
907 print ("i", i, float(sim.fpr(i)), "expected", expected)
908 for i, expected in enumerate(res):
909 # convert to Power single
910 expected = DOUBLE2SINGLE(fp64toselectable(expected))
911 expected = float(expected)
912 actual = float(sim.fpr(i))
913 # approximate error calculation, good enough test
914 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
915 # and the rounding is different
916 err = abs((actual - expected) / expected)
917 print ("err", i, err)
918 self.assertTrue(err < 1e-5)
919
920 def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self):
921 """>>> lst = [# LOAD bit-reversed with half-swap
922 "svshape 8, 1, 1, 6, 0",
923 "svremap 1, 0, 0, 0, 0, 0, 0, 0",
924 "sv.lfssh 0.v, 4(1), 2",
925 # Inner butterfly, twin +/- MUL-ADD-SUB
926 "svremap 31, 1, 0, 2, 0, 1, 1",
927 "svshape 8, 1, 1, 4, 0",
928 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
929 # Outer butterfly, iterative sum
930 "svshape 8, 1, 1, 3, 0",
931 "sv.fadds 0.v, 0.v, 0.v"
932 ]
933 runs a full in-place 8-long O(N log2 N) DCT, both
934 inner and outer butterfly "REMAP" schedules, and using
935 bit-reversed half-swapped LDs.
936 uses shorter pre-loaded COS tables: FRC also needs to be on a
937 Schedule
938 """
939 lst = SVP64Asm( ["addi 1, 0, 0x000",
940 "svshape 8, 1, 1, 6, 0",
941 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
942 "sv.lfssh 0.v, 4(1), 2",
943 "svremap 31, 1, 0, 2, 0, 1, 1",
944 "svshape 8, 1, 1, 4, 0",
945 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
946 "svshape 8, 1, 1, 3, 0",
947 "sv.fadds 0.v, 0.v, 0.v"
948 ])
949 lst = list(lst)
950
951 # array and coefficients to test
952 avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
953
954 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
955 # LD will bring them in, in the correct order.
956 mem = {}
957 val = 0
958 for i, a in enumerate(avi):
959 a = SINGLE(fp64toselectable(a)).value
960 shift = (i % 2) == 1
961 if shift == 0:
962 val = a # accumulate for next iteration
963 else:
964 mem[(i//2)*8] = val | (a << 32) # even and odd 4-byte in same 8
965
966 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
967 n = len(avi)
968 ctable = []
969 size = n
970 while size >= 2:
971 halfsize = size // 2
972 for ci in range(halfsize):
973 ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
974 size //= 2
975
976 # store in regfile
977 fprs = [0] * 32
978 for i, c in enumerate(ctable):
979 fprs[i+8] = fp64toselectable(1.0 / c) # invert
980
981 with Program(lst, bigendian=False) as program:
982 sim = self.run_tst_program(program, initial_fprs=fprs,
983 initial_mem=mem)
984 print ("spr svshape0", sim.spr['SVSHAPE0'])
985 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
986 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
987 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
988 print ("spr svshape1", sim.spr['SVSHAPE1'])
989 print ("spr svshape2", sim.spr['SVSHAPE2'])
990 print ("spr svshape3", sim.spr['SVSHAPE3'])
991
992 # outer iterative sum
993 res = transform2(avi)
994
995 for i, expected in enumerate(res):
996 print ("i", i, float(sim.fpr(i)), "expected", expected)
997
998 for i, expected in enumerate(res):
999 # convert to Power single
1000 expected = DOUBLE2SINGLE(fp64toselectable(expected))
1001 expected = float(expected)
1002 actual = float(sim.fpr(i))
1003 # approximate error calculation, good enough test
1004 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
1005 # and the rounding is different
1006 err = abs((actual - expected) / expected)
1007 print ("err", i, err)
1008 self.assertTrue(err < 1e-5)
1009
1010 def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self):
1011 """>>> lst = [# LOAD bit-reversed with half-swap
1012 "svshape 8, 1, 1, 14, 0",
1013 "svremap 1, 0, 0, 0, 0, 0, 0, 0",
1014 "sv.lfssh 0.v, 4(1), 2",
1015 # Outer butterfly, iterative sum
1016 "svremap 31, 0, 1, 2, 1, 0, 1",
1017 "svshape 8, 1, 1, 11, 0",
1018 "sv.fadds 0.v, 0.v, 0.v",
1019 # Inner butterfly, twin +/- MUL-ADD-SUB
1020 "svshape 8, 1, 1, 10, 0",
1021 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
1022 ]
1023 runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
1024 inner and outer butterfly "REMAP" schedules, and using
1025 bit-reversed half-swapped LDs.
1026 uses shorter pre-loaded COS tables: FRC also needs to be on a
1027 Schedule in the sv.ffmadds instruction
1028 """
1029 lst = SVP64Asm( ["addi 1, 0, 0x000",
1030 "svshape 8, 1, 1, 14, 0",
1031 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
1032 "sv.lfssh 0.v, 4(1), 2",
1033 "svremap 31, 0, 1, 2, 1, 0, 1",
1034 "svshape 8, 1, 1, 11, 0",
1035 "sv.fadds 0.v, 0.v, 0.v",
1036 "svshape 8, 1, 1, 12, 0",
1037 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
1038 ])
1039 lst = list(lst)
1040
1041 # array and coefficients to test
1042 avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
1043
1044 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
1045 # LD will bring them in, in the correct order.
1046 mem = {}
1047 val = 0
1048 for i, a in enumerate(avi):
1049 if i == 0: # first element, divide by 2
1050 a /= 2.0
1051 a = SINGLE(fp64toselectable(a)).value
1052 shift = (i % 2) == 1
1053 if shift == 0:
1054 val = a # accumulate for next iteration
1055 else:
1056 mem[(i//2)*8] = val | (a << 32) # even and odd 4-byte in same 8
1057
1058 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
1059 n = len(avi)
1060 ctable = []
1061 size = 2
1062 while size <= n:
1063 halfsize = size // 2
1064 for ci in range(halfsize):
1065 ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
1066 size *= 2
1067
1068 # store in regfile
1069 fprs = [0] * 32
1070 for i, c in enumerate(ctable):
1071 fprs[i+8] = fp64toselectable(1.0 / c) # invert
1072
1073 with Program(lst, bigendian=False) as program:
1074 sim = self.run_tst_program(program, initial_fprs=fprs,
1075 initial_mem=mem)
1076 print ("spr svshape0", sim.spr['SVSHAPE0'])
1077 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
1078 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
1079 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
1080 print ("spr svshape1", sim.spr['SVSHAPE1'])
1081 print ("spr svshape2", sim.spr['SVSHAPE2'])
1082 print ("spr svshape3", sim.spr['SVSHAPE3'])
1083
1084 # outer iterative sum
1085 res = inverse_transform2(avi)
1086
1087 for i, expected in enumerate(res):
1088 print ("i", i, float(sim.fpr(i)), "expected", expected)
1089
1090 for i, expected in enumerate(res):
1091 # convert to Power single
1092 expected = DOUBLE2SINGLE(fp64toselectable(expected))
1093 expected = float(expected)
1094 actual = float(sim.fpr(i))
1095 # approximate error calculation, good enough test
1096 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
1097 # and the rounding is different
1098 err = abs((actual - expected) / expected)
1099 print ("err", i, err)
1100 self.assertTrue(err < 1e-5)
1101
1102 def run_tst_program(self, prog, initial_regs=None,
1103 svstate=None,
1104 initial_mem=None,
1105 initial_fprs=None):
1106 if initial_regs is None:
1107 initial_regs = [0] * 32
1108 simulator = run_tst(prog, initial_regs, mem=initial_mem,
1109 initial_fprs=initial_fprs,
1110 svstate=svstate)
1111
1112 print ("GPRs")
1113 simulator.gpr.dump()
1114 print ("FPRs")
1115 simulator.fpr.dump()
1116
1117 return simulator
1118
1119
1120 if __name__ == "__main__":
1121 unittest.main()