convert test_caller_svp64_dct.py to new vector numbering convention
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.power_decoder import (create_pdecode)
5 from openpower.simulator.program import Program
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.selectable_int import SelectableInt
8 from openpower.decoder.isa.test_caller import run_tst
9 from openpower.sv.trans.svp64 import SVP64Asm
10 from copy import deepcopy
11 from openpower.decoder.helpers import fp64toselectable, SINGLE
12 from openpower.decoder.isafunctions.double2single import ISACallerFnHelper
13 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
14 iterate_dct_inner_butterfly_indices,
15 iterate_dct_outer_butterfly_indices,
16 transform2, inverse_transform2)
17 from openpower.decoder.isa.fastdctlee import inverse_transform_iter
18 import unittest
19 import math
20
# HACK: the DOUBLE2SINGLE function is auto-generated from pseudo-code and
# is reached here through an ISACallerFnHelper instance.  One module-wide
# helper is created so the tests below can call fph.DOUBLE2SINGLE when
# converting their expected values to Power single precision.
fph = ISACallerFnHelper(XLEN=64)
24
25
def transform_inner_radix2_dct(vec, ctable):
    """Reference model of the DCT inner butterfly, used to check the simulator.

    The input is re-ordered first (half-swap through halfrev2, then
    bit-reversed indexing).  Two index schedules produced by
    iterate_dct_inner_butterfly_indices are then walked in lock-step; at
    step k the pair (vec[jl], vec[jh]) is replaced by
    (t1 + t2, (t1 - t2) * (1.0 / ctable[k])).

    vec:    sequence of numbers; its length is the x-dimension of the
            schedule (presumably a power of two - TODO confirm)
    ctable: coefficients, indexed by butterfly step number

    Returns the re-ordered and transformed values as a list built inside
    this function.  Progress is printed to stdout.
    """
    size = len(vec)
    print()
    print("transform2", size)
    nbits = size.bit_length() - 1

    # bit-reversed source index for every destination slot
    bitrev = [reverse_bits(idx, nbits) for idx in range(size)]

    # pretend the data was LDed half-swapped *and* bit-reversed
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    vec = [swapped[src] for src in bitrev]

    ################
    # INNER butterfly
    ################

    class SVSHAPE:
        pass

    def make_shape(skip):
        # the two schedules differ only in "skip"
        shape = SVSHAPE()
        shape.lims = [size, 2, 0]
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0          # experiment with different offset, here
        shape.invxyz = [1, 0, 0]  # inversion if desired
        return shape

    # j schedule and j+halfstep schedule
    lo_steps = iterate_dct_inner_butterfly_indices(make_shape(0b00))
    hi_steps = iterate_dct_inner_butterfly_indices(make_shape(0b01))

    # walk both schedules together, getting new indices each step
    for step, ((jl, jl_end), (jh, jh_end)) in enumerate(zip(lo_steps,
                                                            hi_steps)):
        t1 = vec[jl]
        t2 = vec[jh]
        coeff = ctable[step]
        vec[jl] = t1 + t2
        inv_coeff = 1.0/coeff
        vec[jh] = (t1 - t2) * inv_coeff
        print("coeff", "ci", step,
              "jl", jl, "jh", jh,
              "i/n", (step+0.5), inv_coeff,
              "t1, t2", t1, t2, "res", vec[jl], vec[jh],
              "end", bin(jl_end), bin(jh_end))
        if jl_end == 0b111:  # all loops end
            break

    return vec
87
88
def transform_outer_radix2_dct(vec):
    """Reference model of the DCT outer butterfly (iterative sums).

    Two index schedules from iterate_dct_outer_butterfly_indices are
    walked in lock-step and, for each index pair, vec[jh] is added into
    vec[jl].

    vec: mutable sequence of numbers; it is updated in place and its
         length is used as the x-dimension of the schedule

    Returns the same (modified) vec object.  Progress is printed to stdout.
    """
    size = len(vec)
    print()
    print("transform2", size)

    class SVSHAPE:
        pass

    def make_shape(skip):
        # the two schedules differ only in "skip"
        shape = SVSHAPE()
        shape.lims = [size, 3, 0]
        shape.submode2 = 0b100
        shape.mode = 0b01
        shape.skip = skip
        shape.offset = 0          # experiment with different offset, here
        shape.invxyz = [0, 0, 0]  # inversion if desired
        return shape

    # outer butterfly: j schedule and j+halfstep schedule
    lo_steps = iterate_dct_outer_butterfly_indices(make_shape(0b00))
    hi_steps = iterate_dct_outer_butterfly_indices(make_shape(0b01))

    for (jl, jl_end), (jh, jh_end) in zip(lo_steps, hi_steps):
        print("itersum jr", jl, jh,
              "end", bin(jl_end), bin(jh_end))
        vec[jl] += vec[jh]
        if jl_end == 0b111:  # all loops end
            break

    print("transform2 result", vec)

    return vec
134
135
def transform_inner_radix2_idct(vec, ctable):
    """Reference model of the inverse-DCT inner butterfly.

    The input is half-swapped through halfrev2, then two index schedules
    from iterate_dct_inner_butterfly_indices are walked in lock-step.  At
    step k the pair (vec[jl], vec[jh]) is replaced by
    (t1 + t2/ctable[k], t1 - t2/ctable[k]).

    vec:    sequence of numbers; its length is the x-dimension of the
            schedule
    ctable: coefficients, indexed by butterfly step number

    Returns the list obtained from halfrev2 after the butterfly has been
    applied to it.  Progress is printed to stdout.
    """
    size = len(vec)
    print()
    print("transform2", size)

    # pretend we LDed data in half-swapped order
    vec = halfrev2(vec, False)

    ################
    # INNER butterfly
    ################

    class SVSHAPE:
        pass

    def make_shape(skip):
        # the two schedules differ only in "skip"
        shape = SVSHAPE()
        shape.lims = [size, 0b000001, 0]
        shape.mode = 0b11
        shape.submode2 = 0b11
        shape.skip = skip
        shape.offset = 0          # experiment with different offset, here
        shape.invxyz = [0, 0, 0]  # inversion if desired
        return shape

    # j schedule and j+halfstep schedule
    lo_steps = iterate_dct_inner_butterfly_indices(make_shape(0b00))
    hi_steps = iterate_dct_inner_butterfly_indices(make_shape(0b01))

    for step, ((jl, jl_end), (jh, jh_end)) in enumerate(zip(lo_steps,
                                                            hi_steps)):
        t1 = vec[jl]
        t2 = vec[jh]
        coeff = ctable[step]
        scaled = t2/coeff
        vec[jl] = t1 + scaled
        vec[jh] = t1 - scaled
        print("coeff", "ci", step,
              "jl", jl, "jh", jh,
              "i/n", (step+0.5), 1.0/coeff,
              "t1, t2", t1, t2, "res", vec[jl], vec[jh],
              "end", bin(jl_end), bin(jh_end))
        if jl_end == 0b111:  # all loops end
            break

    return vec
191
192
def transform_outer_radix2_idct(vec):
    """Reference model of the inverse-DCT outer butterfly (iterative sums).

    The input is re-ordered first (bit-reversed indexing, then half-swap
    through halfrev2 with its second argument True).  Two index schedules
    from iterate_dct_outer_butterfly_indices are then walked in lock-step
    and, for each index pair, vec[jl] is added into vec[jh].

    vec: sequence of numbers; its length is the x-dimension of the
         schedule (presumably a power of two - TODO confirm)

    Returns the re-ordered, summed values.  Progress is printed to stdout.
    """
    size = len(vec)
    print()
    print("transform2-inv", size)
    nbits = size.bit_length() - 1

    # bit-reversed source index for every destination slot
    bitrev = [reverse_bits(idx, nbits) for idx in range(size)]

    # pretend the data was LDed bit-reversed *and* half-swapped
    # TODO: merge these two
    vec = [vec[src] for src in bitrev]
    vec = halfrev2(vec, True)

    class SVSHAPE:
        pass

    def make_shape(skip):
        # the two schedules differ only in "skip"
        shape = SVSHAPE()
        shape.lims = [size, 2, 0]
        shape.submode2 = 0b011
        shape.mode = 0b11
        shape.skip = skip
        shape.offset = 0          # experiment with different offset, here
        shape.invxyz = [1, 0, 1]  # inversion if desired
        return shape

    # outer butterfly: j schedule and j+halfstep schedule
    lo_steps = iterate_dct_outer_butterfly_indices(make_shape(0b00))
    hi_steps = iterate_dct_outer_butterfly_indices(make_shape(0b01))

    for (jl, jl_end), (jh, jh_end) in zip(lo_steps, hi_steps):
        print("itersum jr", jl, jh,
              "end", bin(jl_end), bin(jh_end))
        vec[jh] += vec[jl]
        if jl_end == 0b111:  # all loops end
            break

    print("transform2-inv result", vec)

    return vec
247
248
249 class DCTTestCase(FHDLTestCase):
250
def _check_regs(self, sim, expected):
    """Assert that GPRs 0..31 of *sim* equal the values in *expected*.

    expected[regnum] is wrapped in a 64-bit SelectableInt before it is
    compared with sim.gpr(regnum).
    """
    for regnum in range(32):
        have = sim.gpr(regnum)
        want = SelectableInt(expected[regnum], 64)
        self.assertEqual(have, want)
254
def test_sv_ffadds_dct(self):
    """>>> lst = ["sv.fdmadds *0, *0, *0, *8"
                 ]

    Runs one vectorised sv.fdmadds with VL=MAXVL=4 and no REMAP.

    FPRs 0-3 are loaded from av, FPRs 4-7 from bv and FPRs 8-11 from the
    coefficients cv.  For each element the test computes a+b and
    (a-b)*c (the difference is rounded through DOUBLE2SINGLE before the
    multiply) and converts both with DOUBLE2SINGLE.  After the run it
    asserts that FPR i holds the converted (a-b)*c value and FPR i+4 the
    converted a+b value.
    """
    lst = list(SVP64Asm(["sv.fdmadds *0, *0, *0, *8"
                         ]))

    # cheat here with these values, they're selected so that
    # rounding errors do not occur.  sigh.
    av = [7.0, -0.8, 2.0, -2.3]    # goes into FPRs 0..3
    bv = [-2.0, 2.0, -0.8, 1.4]    # goes into FPRs 4..7
    cv = [-1.0, 0.5, 2.5, -0.25]   # coefficients, FPRs 8..11

    fprs = [0] * 32
    res = []
    for idx, (a, b, c) in enumerate(zip(av, bv, cv)):
        fprs[idx+0] = fp64toselectable(a)
        fprs[idx+4] = fp64toselectable(b)
        fprs[idx+8] = fp64toselectable(c)
        # this isn't quite a perfect replication of the FP32
        # mul-add-sub.  better really to use FPMUL32, FPADD32 and
        # FPSUB32 directly to be honest.
        total = a + b
        # FP32-round the difference before it is multiplied
        diff = float(fph.DOUBLE2SINGLE(fp64toselectable(a - b)))
        prod = diff * c
        # convert both results to Power single (from double)
        total_sp = fph.DOUBLE2SINGLE(fp64toselectable(total))
        prod_sp = fph.DOUBLE2SINGLE(fp64toselectable(prod))
        res.append((prod_sp, total_sp))
        print("DCT", idx, "in", a, b, "c", c, "res", total, prod)

    # SVSTATE (in this case, VL=4)
    svstate = SVP64State()
    svstate.vl = 4      # VL
    svstate.maxvl = 4   # MAXVL
    print("SVSTATE", bin(svstate.asint()))

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, svstate=svstate,
                                   initial_fprs=fprs)
        # dump what the simulator produced next to what is expected
        for idx, (first, second) in enumerate(res):
            got_lo = float(sim.fpr(idx+0))
            got_hi = float(sim.fpr(idx+4))
            print("DCT", idx, "in", got_lo, got_hi,
                  "res", float(first), float(second))
        # confirm that the results are as expected
        for idx, (first, second) in enumerate(res):
            self.assertEqual(sim.fpr(idx+0), first)
            self.assertEqual(sim.fpr(idx+4), second)
315
def test_sv_remap_fpmadds_dct_inner_4(self):
    """>>> lst = ["svshape 4, 1, 1, 2, 0",
                  "svremap 27, 1, 0, 2, 0, 1, 0",
                  "sv.fdmadds *0, *0, *0, *8"
                 ]

    runs a full in-place 4-long O(N log2 N) inner butterfly schedule
    for DCT

    SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
    (3 inputs, 2 outputs)

    Note that the coefficient (FRC) is not on a "schedule", it
    is straight Vectorised (0123...) because DCT coefficients
    cannot be shared between butterfly layers (due to +0.5)

    The data goes into FPRs 0-3 half-swapped and bit-reversed, the
    inverted coefficients into FPRs 8-11.  FPRs 0-3 are then compared
    against transform_inner_radix2_dct with a relative tolerance of
    1e-6.
    """
    lst = list(SVP64Asm(["svshape 4, 1, 1, 2, 0",
                         "svremap 27, 1, 0, 2, 0, 1, 0",
                         "sv.fdmadds *0, *0, *0, *8"
                         ]))

    # array and coefficients to test
    n = 4
    coe = [-0.25, 0.5, 3.1, 6.2]    # 4 coefficients
    avi = [7.0, -0.8, 2.0, -2.3]    # input data, natural order

    # register order: half-swapped, then bit-reversed
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2(avi, False)
    av = [av[ri[i]] for i in range(n)]

    # store in regfile
    fprs = [0] * 32
    for i, c in enumerate(coe):
        fprs[i+8] = fp64toselectable(1.0 / c)  # invert
    for i, a in enumerate(av):
        fprs[i+0] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # work out the results with the twin mul/add-sub
        res = transform_inner_radix2_dct(avi, coe)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-6)
382
def test_sv_remap_fpmadds_idct_inner_4(self):
    """>>> lst = ["svshape 4, 1, 1, 10, 0",
                  "svremap 27, 0, 1, 2, 1, 0, 0",
                  "sv.ffmadds *0, *0, *0, *8"
                 ]

    runs a full in-place 4-long O(N log2 N) inner butterfly schedule
    for inverse-DCT

    SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
    (3 inputs, 2 outputs)

    Note that the coefficient (FRC) is not on a "schedule", it
    is straight Vectorised (0123...) because DCT coefficients
    cannot be shared between butterfly layers (due to +0.5)

    The data goes into FPRs 0-3 half-swapped, the inverted coefficients
    into FPRs 8-11.  FPRs 0-3 are then compared against
    transform_inner_radix2_idct with a relative tolerance of 1e-6.
    """
    lst = list(SVP64Asm(["svshape 4, 1, 1, 10, 0",
                         "svremap 27, 0, 1, 2, 1, 0, 0",
                         "sv.ffmadds *0, *0, *0, *8"
                         ]))

    # array and coefficients to test
    coe = [-0.25, 0.5, 3.1, 6.2]    # 4 coefficients
    avi = [7.0, -0.8, 2.0, -2.3]    # input data, natural order
    av = halfrev2(avi, False)       # register order: half-swapped

    # store in regfile
    fprs = [0] * 32
    for i, c in enumerate(coe):
        fprs[i+8] = fp64toselectable(1.0 / c)  # invert
    for i, a in enumerate(av):
        fprs[i+0] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # work out the results with the twin mul/add-sub
        res = transform_inner_radix2_idct(avi, coe)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-6)
444
def test_sv_remap_fpmadds_idct_outer_8(self):
    """>>> lst = ["svshape 8, 1, 1, 11, 0",
                  "svremap 27, 0, 1, 2, 1, 0, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    runs a full in-place 8-long O(N log2 N) outer butterfly schedule
    for inverse-DCT, does the iterative overlapped ADDs

    SVP64 "REMAP" in Butterfly Mode.

    The data goes into FPRs 0-7 bit-reversed and then half-swapped.
    FPRs 0-7 are then compared against transform_outer_radix2_idct with
    a relative tolerance of 1e-6.
    """
    lst = list(SVP64Asm(["svshape 8, 1, 1, 11, 0",  # outer butterfly
                         "svremap 27, 0, 1, 2, 1, 0, 0",
                         "sv.fadds *0, *0, *0"
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # register order: bit-reversed, then half-swapped
    n = len(avi)
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = [avi[ri[i]] for i in range(n)]
    av = halfrev2(av, True)

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i+0] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # outer iterative sum
        res = transform_outer_radix2_idct(avi)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # relative-error comparison rather than exact equality
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-6)
502
def test_sv_remap_fpmadds_dct_outer_8(self):
    """>>> lst = ["svshape 8, 1, 1, 3, 0",
                  "svremap 27, 1, 0, 2, 0, 1, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    runs a full in-place 8-long O(N log2 N) outer butterfly schedule
    for DCT, does the iterative overlapped ADDs

    SVP64 "REMAP" in Butterfly Mode.

    The data goes into FPRs 0-7 in natural order.  FPRs 0-7 are then
    compared against transform_outer_radix2_dct with a relative
    tolerance of 1e-6.
    """
    lst = list(SVP64Asm(["svshape 8, 1, 1, 3, 0",
                         "svremap 27, 1, 0, 2, 0, 1, 0",
                         "sv.fadds *0, *0, *0"
                         ]))

    # array to test
    av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i+0] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # outer iterative sum (the reference model updates av in place;
        # the register image was already built from it above)
        res = transform_outer_radix2_dct(av)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # relative-error comparison rather than exact equality
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-6)
553
def test_sv_remap_fpmadds_idct_8(self):
    """>>> lst = ["svremap 27, 0, 1, 2, 1, 0, 1",
                  "svshape 8, 1, 1, 11, 0",
                  "sv.fadds *0, *0, *0",
                  "svshape 8, 1, 1, 10, 0",
                  "sv.ffmadds *0, *0, *0, *8"
                 ]

    runs a full in-place 8-long O(N log2 N) inverse-DCT, both
    inner and outer butterfly "REMAP" schedules.

    The data goes into FPRs 0-7 bit-reversed and half-swapped, with
    element 0 halved; the inverted cosine table goes into FPRs 8
    onwards.  FPRs 0-7 are then compared against inverse_transform2
    with a relative tolerance of 1e-5.
    """
    lst = list(SVP64Asm(["svremap 27, 0, 1, 2, 1, 0, 1",
                         "svshape 8, 1, 1, 11, 0",
                         "sv.fadds *0, *0, *0",
                         "svshape 8, 1, 1, 10, 0",
                         "sv.ffmadds *0, *0, *0, *8"
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # register order: bit-reversed, then half-swapped
    n = len(avi)
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = [avi[ri[i]] for i in range(n)]
    av = halfrev2(av, True)

    # divide first value by 2.0, manually.  rev and halfrev should
    # not have moved it
    av[0] /= 2.0

    print("input data pre idct", av)

    # cosine table, smallest butterfly size first; each size's
    # coefficients are repeated once per group of that size
    ctable = []
    size = 2
    while size <= n:
        halfsize = size // 2
        for _ in range(n//size):
            for ci in range(halfsize):
                ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
        size *= 2

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i+0] = fp64toselectable(a)
    for i, c in enumerate(ctable):
        fprs[i+8] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # inverse DCT reference
        res = inverse_transform2(avi)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-5)
635
def test_sv_remap_fpmadds_dct_8(self):
    """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
                  "svshape 8, 1, 1, 2, 0",
                  "sv.fdmadds *0, *0, *0, *8",
                  "svshape 8, 1, 1, 3, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    runs a full in-place 8-long O(N log2 N) DCT, both
    inner and outer butterfly "REMAP" schedules.

    The data goes into FPRs 0-7 half-swapped and bit-reversed; the
    inverted cosine table goes into FPRs 8 onwards.  FPRs 0-7 are then
    compared against transform2 with a relative tolerance of 1e-5.
    """
    lst = list(SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
                         "svshape 8, 1, 1, 2, 0",
                         "sv.fdmadds *0, *0, *0, *8",
                         "svshape 8, 1, 1, 3, 0",
                         "sv.fadds *0, *0, *0"
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # register order: half-swapped, then bit-reversed
    n = len(avi)
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2(avi, False)
    av = [av[ri[i]] for i in range(n)]

    # cosine table, largest butterfly size first; each size's
    # coefficients are repeated once per group of that size
    ctable = []
    size = n
    while size >= 2:
        halfsize = size // 2
        for _ in range(n//size):
            for ci in range(halfsize):
                ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
        size //= 2

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i+0] = fp64toselectable(a)
    for i, c in enumerate(ctable):
        fprs[i+8] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # full DCT reference
        res = transform2(avi)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-5)
704
def test_sv_remap_dct_cos_precompute_8(self):
    """pre-computes a DCT COS table, deliberately using a lot of
    registers so as to be able to see what is going on (dumping all
    regs after the run).

    the simpler (scalar) version is in test_caller_transcendentals.py
    (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
    need the new version of fcfids which doesn't spam memory with
    LD/STs.

    The reference table is 2*cos((ci + 0.5) * pi / size) with every
    size's coefficients repeated once per group; FPRs 80 onwards are
    expected to hold the reciprocal of each entry, within a relative
    tolerance of 1e-6.
    """
    lst = list(SVP64Asm(["svshape 8, 1, 1, 2, 0",
                         "svremap 0, 0, 0, 2, 0, 1, 1",
                         "sv.svstep *4, 4, 1",   # svstep get vector of ci
                         "sv.svstep *16, 3, 1",  # svstep get vector of step
                         "addi 1, 0, 0x0000",
                         "setvl 0, 0, 12, 0, 1, 1",
                         "sv.std *4, 0(1)",
                         "sv.lfd *64, 0(1)",
                         "sv.fcfids *48, *64",
                         "addi 1, 0, 0x0060",
                         "sv.std *16, 0(1)",
                         "sv.lfd *12, 0(1)",
                         "sv.fcfids *24, *12",
                         "sv.fadds *0, *24, 43",    # plus 0.5
                         "sv.fmuls *0, *0, 41",     # times PI
                         "sv.fdivs *0, *0, *48",    # div size
                         "sv.fcoss *80, *0",
                         "sv.fdivs *80, 43, *80",   # div 0.5 / x
                         ]))

    gprs = [0] * 32
    fprs = [0] * 128
    # constants
    fprs[43] = fp64toselectable(0.5)        # 0.5
    fprs[41] = fp64toselectable(math.pi)    # pi
    fprs[44] = fp64toselectable(2.0)        # 2.0

    n = 8

    # reference cosine table, largest butterfly size first
    ctable = []
    size = n
    while size >= 2:
        halfsize = size // 2
        for _ in range(n//size):
            for ci in range(halfsize):
                ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
        size //= 2

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
        print("MEM")
        sim.mem.dump()
        # dump the intermediate register banks, one value per table entry
        for title, base in (("ci FP", 24), ("size FP", 48), ("temps", 0)):
            print(title)
            for i in range(len(ctable)):
                print("i", i, float(sim.fpr(i+base)))
        for i, coeff in enumerate(ctable):
            expected = 1.0/coeff
            actual = float(sim.fpr(i+80))
            err = abs((actual - expected) / expected)
            print("i", i, actual, "1/expect", 1/expected,
                  "expected", expected,
                  "err", err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-6)
778
def test_sv_remap_dct_cos_precompute_inner_8(self):
    """pre-computes a DCT COS table, using the shorter costable
    indices schedule.  turns out, some COS values are repeated
    in each layer of the DCT butterfly.

    the simpler (scalar) version is in test_caller_transcendentals.py
    (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
    need the new version of fcfids which doesn't spam memory with
    LD/STs.

    The reference table is 2*cos((ci + 0.5) * pi / size), one run of
    coefficients per butterfly size with no repeats; FPRs 80 onwards
    are expected to hold the reciprocal of each entry, within a
    relative tolerance of 1e-6.
    """
    lst = list(SVP64Asm(["svshape 8, 1, 1, 5, 0",
                         "svremap 0, 0, 0, 2, 0, 1, 1",
                         "sv.svstep *4, 3, 1",   # svstep get vector of ci
                         "sv.svstep *16, 2, 1",  # svstep get vector of step
                         "addi 1, 0, 0x0000",
                         "setvl 0, 0, 7, 0, 1, 1",
                         "sv.std *4, 0(1)",
                         "sv.lfd *64, 0(1)",
                         "sv.fcfids *48, *64",
                         "addi 1, 0, 0x0060",
                         "sv.std *16, 0(1)",
                         "sv.lfd *12, 0(1)",
                         "sv.fcfids *24, *12",
                         "sv.fadds *0, *24, 43",    # plus 0.5
                         "sv.fmuls *0, *0, 41",     # times PI
                         "sv.fdivs *0, *0, *48",    # div size
                         "sv.fcoss *80, *0",
                         "sv.fdivs *80, 43, *80",   # div 0.5 / x
                         ]))

    gprs = [0] * 32
    fprs = [0] * 128
    # constants
    fprs[43] = fp64toselectable(0.5)        # 0.5
    fprs[41] = fp64toselectable(math.pi)    # pi
    fprs[44] = fp64toselectable(2.0)        # 2.0

    n = 8

    # reference cosine table, largest butterfly size first
    ctable = []
    size = n
    while size >= 2:
        halfsize = size // 2
        for ci in range(halfsize):
            coeff = math.cos((ci + 0.5) * math.pi / size) * 2.0
            ctable.append(coeff)
            print("coeff", "ci", ci, "size", size,
                  "i/n", (ci+0.5), 1.0/coeff)
        size //= 2

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
        print("MEM")
        sim.mem.dump()
        # dump the intermediate register banks, one value per table entry
        for title, base in (("ci FP", 24), ("size FP", 48), ("temps", 0)):
            print(title)
            for i in range(len(ctable)):
                print("i", i, float(sim.fpr(i+base)))
        for i, coeff in enumerate(ctable):
            expected = 1.0/coeff
            actual = float(sim.fpr(i+80))
            err = abs((actual - expected) / expected)
            print("i", i, actual, "1/expect", 1/expected,
                  "expected", expected,
                  "err", err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-6)
854
def test_sv_remap_fpmadds_dct_8_mode_4(self):
    """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
                  "svshape 8, 1, 1, 4, 0",
                  "sv.fdmadds *0, *0, *0, *8",
                  "svshape 8, 1, 1, 3, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    runs a full in-place 8-long O(N log2 N) DCT, both
    inner and outer butterfly "REMAP" schedules.
    uses shorter tables: FRC also needs to be on a Schedule

    The data goes into FPRs 0-7 half-swapped and bit-reversed; the
    inverted short cosine table goes into FPRs 8 onwards.  FPRs 0-7
    are then compared against transform2 with a relative tolerance of
    1e-5.
    """
    lst = list(SVP64Asm(["svremap 31, 1, 0, 2, 0, 1, 1",
                         "svshape 8, 1, 1, 4, 0",
                         "sv.fdmadds *0, *0, *0, *8",
                         "svshape 8, 1, 1, 3, 0",
                         "sv.fadds *0, *0, *0"
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # register order: half-swapped, then bit-reversed
    n = len(avi)
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2(avi, False)
    av = [av[ri[i]] for i in range(n)]

    # short cosine table: one run of coefficients per butterfly size
    ctable = []
    size = n
    while size >= 2:
        halfsize = size // 2
        for ci in range(halfsize):
            ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
        size //= 2

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i+0] = fp64toselectable(a)
    for i, c in enumerate(ctable):
        fprs[i+8] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # full DCT reference
        res = transform2(avi)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-5)
923
def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self):
    """>>> lst = [# base address for the LOAD
                  "addi 1, 0, 0x000",
                  # LOAD bit-reversed with half-swap
                  "svshape 8, 1, 1, 6, 0",
                  "svremap 1, 0, 0, 0, 0, 0, 0",
                  "sv.lfssh *0, 4(1), 2",
                  # Inner butterfly, twin +/- MUL-ADD-SUB
                  "svremap 31, 1, 0, 2, 0, 1, 1",
                  "svshape 8, 1, 1, 4, 0",
                  "sv.fdmadds *0, *0, *0, *8",
                  # Outer butterfly, iterative sum
                  "svshape 8, 1, 1, 3, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    runs a full in-place 8-long O(N log2 N) DCT, both
    inner and outer butterfly "REMAP" schedules, and using
    bit-reversed half-swapped LDs.
    uses shorter pre-loaded COS tables: FRC also needs to be on a
    Schedule

    The input is placed in memory as FP32 values in natural order, two
    per 8-byte word; the inverted short cosine table goes into FPRs 8
    onwards.  FPRs 0-7 are then compared against transform2 with a
    relative tolerance of 1e-5.
    """
    lst = list(SVP64Asm(["addi 1, 0, 0x000",
                         "svshape 8, 1, 1, 6, 0",
                         "svremap 1, 0, 0, 0, 0, 0, 0",
                         "sv.lfssh *0, 4(1), 2",
                         "svremap 31, 1, 0, 2, 0, 1, 1",
                         "svshape 8, 1, 1, 4, 0",
                         "sv.fdmadds *0, *0, *0, *8",
                         "svshape 8, 1, 1, 3, 0",
                         "sv.fadds *0, *0, *0"
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # store in memory, in standard (expected) order, FP32s (2 per
    # 8-bytes).  LD will bring them in, in the correct order.
    words = [SINGLE(fp64toselectable(a)).value for a in avi]
    mem = {}
    for pair, (even, odd) in enumerate(zip(words[0::2], words[1::2])):
        # even element in the low 4 bytes, odd element in the high 4
        mem[pair * 8] = even | (odd << 32)

    # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
    n = len(avi)
    ctable = []
    size = n
    while size >= 2:
        halfsize = size // 2
        for ci in range(halfsize):
            ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
        size //= 2

    # store in regfile
    fprs = [0] * 32
    for i, c in enumerate(ctable):
        fprs[i+8] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs,
                                   initial_mem=mem)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # full DCT reference
        res = transform2(avi)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)

        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports the offending error value on failure
            self.assertLess(err, 1e-5)
1013
def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self):
    """>>> lst = [# LOAD bit-reversed with half-swap
                  "svshape 8, 1, 1, 14, 0",
                  "svremap 1, 0, 0, 0, 0, 0, 0",
                  "sv.lfssh *0, 4(1), 2",
                  # Outer butterfly, iterative sum
                  "svremap 31, 0, 1, 2, 1, 0, 1",
                  "svshape 8, 1, 1, 11, 0",
                  "sv.fadds *0, *0, *0",
                  # Inner butterfly, twin +/- MUL-ADD-SUB
                  "svshape 8, 1, 1, 12, 0",
                  "sv.ffmadds *0, *0, *0, *8"
                 ]
        runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
        inner and outer butterfly "REMAP" schedules, and using
        bit-reversed half-swapped LDs.
        uses shorter pre-loaded COS tables: FRC also needs to be on a
        Schedule in the sv.ffmadds instruction

        The program actually executed is the listing above preceded by
        "addi 1, 0, 0x000", which zeroes the base address register used
        by the load.  The FPR results are compared against
        inverse_transform2() using a relative-error bound of 1e-5.
    """
    lst = SVP64Asm(["addi 1, 0, 0x000",
                    "svshape 8, 1, 1, 14, 0",
                    "svremap 1, 0, 0, 0, 0, 0, 0",
                    "sv.lfssh *0, 4(1), 2",
                    "svremap 31, 0, 1, 2, 1, 0, 1",
                    "svshape 8, 1, 1, 11, 0",
                    "sv.fadds *0, *0, *0",
                    "svshape 8, 1, 1, 12, 0",
                    "sv.ffmadds *0, *0, *0, *8"
                    ])
    lst = list(lst)

    # array and coefficients to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
    # LD will bring them in, in the correct order.
    mem = {}
    low_word = 0
    for i, a in enumerate(avi):
        if i == 0:  # first element, divide by 2
            # rebinding the loop variable leaves avi itself untouched, so
            # the reference computation below still sees the original data
            a /= 2.0
        a = SINGLE(fp64toselectable(a)).value
        if i % 2 == 0:
            low_word = a  # hold the even element until its partner arrives
        else:
            # even and odd 4-byte in same 8
            mem[(i // 2) * 8] = low_word | (a << 32)

    # calculate the (shortened) COS tables.  size runs 2, 4, ... n, so
    # the table holds 1 + 2 + 4 entries (for n=8), smallest size first.
    n = len(avi)
    ctable = []
    size = 2
    while size <= n:
        halfsize = size // 2
        for ci in range(halfsize):
            ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
        size *= 2

    # store the inverted coefficients in the regfile, starting at FPR 8
    # (the "*8" operand of sv.ffmadds)
    fprs = [0] * 32
    for i, c in enumerate(ctable):
        fprs[i+8] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs,
                                   initial_mem=mem)
        print ("spr svshape0", sim.spr['SVSHAPE0'])
        print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print ("spr svshape1", sim.spr['SVSHAPE1'])
        print ("spr svshape2", sim.spr['SVSHAPE2'])
        print ("spr svshape3", sim.spr['SVSHAPE3'])

        # expected values, computed by inverse_transform2 on the
        # original (un-halved) input
        res = inverse_transform2(avi)

        for i, expected in enumerate(res):
            print ("i", i, float(sim.fpr(i)), "expected", expected)

        for i, expected in enumerate(res):
            # convert to Power single
            expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
            expected = float(expected)
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print ("err", i, err)
            # assertLess reports both operands on failure, unlike
            # assertTrue(err < 1e-5) which only says "False is not true"
            self.assertLess(err, 1e-5,
                            "FPR %d: actual %r, expected %r"
                            % (i, actual, expected))
1105
def run_tst_program(self, prog, initial_regs=None,
                    svstate=None,
                    initial_mem=None,
                    initial_fprs=None):
    """Run prog via run_tst, dump the register files, return the simulator.

    initial_regs defaults to a list of 32 zeros when not supplied;
    svstate, initial_mem and initial_fprs are handed to run_tst unchanged.
    The GPR and FPR files of the resulting simulator are dumped (preceded
    by a "GPRs" / "FPRs" heading) before the simulator is returned.
    """
    gprs = [0] * 32 if initial_regs is None else initial_regs
    sim = run_tst(prog, gprs,
                  mem=initial_mem,
                  initial_fprs=initial_fprs,
                  svstate=svstate)

    # heading first, then look up and dump the matching register file
    for heading, regfile_name in (("GPRs", "gpr"), ("FPRs", "fpr")):
        print (heading)
        getattr(sim, regfile_name).dump()

    return sim
1122
1123
# Run every test case in this module when the file is executed directly
# as a script (nothing happens on import).
if __name__ == "__main__":
    unittest.main()