1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
16 transform2
, inverse_transform2
)
17 from openpower
.decoder
.isa
.fastdctlee
import inverse_transform_iter
22 def transform_inner_radix2_dct(vec
, ctable
):
27 print ("transform2", n
)
28 levels
= n
.bit_length() - 1
30 # reference (read/write) the in-place data in *reverse-bit-order*
32 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
34 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
35 # TODO: merge these two
36 vec
= halfrev2(vec
, False)
37 vec
= [vec
[ri
[i
]] for i
in range(n
)]
51 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
53 SVSHAPE0
.submode2
= 0b01
55 SVSHAPE0
.offset
= 0 # experiment with different offset, here
56 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
59 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
61 SVSHAPE1
.submode2
= 0b01
63 SVSHAPE1
.offset
= 0 # experiment with different offset, here
64 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
66 # enumerate over the iterator function, getting new indices
67 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
68 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
69 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
70 t1
, t2
= vec
[jl
], vec
[jh
]
73 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
74 print ("coeff", "ci", k
,
76 "i/n", (k
+0.5), 1.0/coeff
,
77 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
78 "end", bin(jle
), bin(jhe
))
79 if jle
== 0b111: # all loops end
85 def transform_outer_radix2_dct(vec
):
90 print ("transform2", n
)
91 levels
= n
.bit_length() - 1
102 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
103 SVSHAPE0
.submode2
= 0b100
106 SVSHAPE0
.offset
= 0 # experiment with different offset, here
107 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
108 # j+halfstep schedule
110 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
112 SVSHAPE1
.submode2
= 0b100
114 SVSHAPE1
.offset
= 0 # experiment with different offset, here
115 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
117 # enumerate over the iterator function, getting new indices
118 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
119 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
120 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
121 print ("itersum jr", jl
, jh
,
122 "end", bin(jle
), bin(jhe
))
124 if jle
== 0b111: # all loops end
127 print("transform2 result", vec
)
132 def transform_inner_radix2_idct(vec
, ctable
):
137 print ("transform2", n
)
138 levels
= n
.bit_length() - 1
140 # pretend we LDed data in half-swapped order
141 vec
= halfrev2(vec
, False)
155 SVSHAPE0
.lims
= [xdim
, 0b000001, 0]
157 SVSHAPE0
.submode2
= 0b11
159 SVSHAPE0
.offset
= 0 # experiment with different offset, here
160 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
161 # j+halfstep schedule
163 SVSHAPE1
.lims
= [xdim
, 0b000001, 0]
165 SVSHAPE1
.submode2
= 0b11
167 SVSHAPE1
.offset
= 0 # experiment with different offset, here
168 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
170 # enumerate over the iterator function, getting new indices
171 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
172 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
173 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
174 t1
, t2
= vec
[jl
], vec
[jh
]
176 vec
[jl
] = t1
+ t2
/coeff
177 vec
[jh
] = t1
- t2
/coeff
178 print ("coeff", "ci", k
,
180 "i/n", (k
+0.5), 1.0/coeff
,
181 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
182 "end", bin(jle
), bin(jhe
))
183 if jle
== 0b111: # all loops end
189 def transform_outer_radix2_idct(vec
):
194 print ("transform2-inv", n
)
195 levels
= n
.bit_length() - 1
202 # reference (read/write) the in-place data in *reverse-bit-order*
204 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
206 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
207 # TODO: merge these two
208 vec
= [vec
[ri
[i
]] for i
in range(n
)]
209 vec
= halfrev2(vec
, True)
215 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
216 SVSHAPE0
.submode2
= 0b011
219 SVSHAPE0
.offset
= 0 # experiment with different offset, here
220 SVSHAPE0
.invxyz
= [1,0,1] # inversion if desired
221 # j+halfstep schedule
223 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
225 SVSHAPE1
.submode2
= 0b011
227 SVSHAPE1
.offset
= 0 # experiment with different offset, here
228 SVSHAPE1
.invxyz
= [1,0,1] # inversion if desired
230 # enumerate over the iterator function, getting new indices
231 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
232 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
233 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
234 print ("itersum jr", jl
, jh
,
235 "end", bin(jle
), bin(jhe
))
237 if jle
== 0b111: # all loops end
240 print("transform2-inv result", vec
)
245 class DCTTestCase(FHDLTestCase
):
247 def _check_regs(self
, sim
, expected
):
249 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
251 def test_sv_ffadds_dct(self
):
252 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
254 four in-place vector adds, four in-place vector mul-subs
256 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
257 FRS to perform the two multiplies. one add, one subtract.
259 sv.fdadds FRT, FRA, FRC, FRB actually does:
261 fsubs FRT+vl, FRA, FRB+vl
263 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
267 # cheat here with these values, they're selected so that
268 # rounding errors do not occur. sigh.
270 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
271 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
272 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
274 # work out the results with the twin add-sub
275 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
276 fprs
[i
+0] = fp64toselectable(a
)
277 fprs
[i
+4] = fp64toselectable(b
)
278 fprs
[i
+8] = fp64toselectable(c
)
279 # this isn't quite a perfect replication of the
280 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
281 # and FPSUB32 directly to be honest.
284 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
287 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
288 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
290 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
292 # SVSTATE (in this case, VL=2)
293 svstate
= SVP64State()
295 svstate
.maxvl
= 4 # MAXVL
296 print ("SVSTATE", bin(svstate
.asint()))
298 with
Program(lst
, bigendian
=False) as program
:
299 sim
= self
.run_tst_program(program
, svstate
=svstate
,
301 # confirm that the results are as expected
302 for i
, (t
, u
) in enumerate(res
):
303 a
= float(sim
.fpr(i
+0))
304 b
= float(sim
.fpr(i
+4))
307 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
308 for i
, (t
, u
) in enumerate(res
):
309 self
.assertEqual(sim
.fpr(i
+0), t
)
310 self
.assertEqual(sim
.fpr(i
+4), u
)
312 def test_sv_remap_fpmadds_dct_inner_4(self
):
313 """>>> lst = ["svshape 4, 1, 1, 2, 0",
314 "svremap 27, 1, 0, 2, 0, 1, 0",
315 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
317 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
320 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
321 (3 inputs, 2 outputs)
323 Note that the coefficient (FRC) is not on a "schedule", it
324 is straight Vectorised (0123...) because DCT coefficients
325 cannot be shared between butterfly layers (due to +0.5)
327 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
328 "svremap 27, 1, 0, 2, 0, 1, 0",
329 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
333 # array and coefficients to test
335 av
= [7.0, -9.8, 3.0, -32.3]
336 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
338 levels
= n
.bit_length() - 1
340 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
341 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
342 av
= halfrev2(avi
, False)
343 av
= [av
[ri
[i
]] for i
in range(n
)]
347 for i
, c
in enumerate(coe
):
348 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
349 for i
, a
in enumerate(av
):
350 fprs
[i
+0] = fp64toselectable(a
)
352 with
Program(lst
, bigendian
=False) as program
:
353 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
354 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
355 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
356 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
357 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
358 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
359 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
360 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
362 # work out the results with the twin mul/add-sub
363 res
= transform_inner_radix2_dct(avi
, coe
)
365 for i
, expected
in enumerate(res
):
366 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
367 for i
, expected
in enumerate(res
):
368 # convert to Power single
369 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
370 expected
= float(expected
)
371 actual
= float(sim
.fpr(i
))
372 # approximate error calculation, good enough test
373 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
374 # and the rounding is different
375 err
= abs((actual
- expected
) / expected
)
376 print ("err", i
, err
)
377 self
.assertTrue(err
< 1e-6)
379 def test_sv_remap_fpmadds_idct_inner_4(self
):
380 """>>> lst = ["svshape 4, 1, 1, 10, 0",
381 "svremap 27, 0, 1, 2, 1, 0, 0",
382 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
384 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
387 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
388 (3 inputs, 2 outputs)
390 Note that the coefficient (FRC) is not on a "schedule", it
391 is straight Vectorised (0123...) because DCT coefficients
392 cannot be shared between butterfly layers (due to +0.5)
394 lst
= SVP64Asm( ["svshape 4, 1, 1, 10, 0",
395 "svremap 27, 0, 1, 2, 1, 0, 0",
396 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
400 # array and coefficients to test
402 levels
= n
.bit_length() - 1
403 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
404 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
405 av
= halfrev2(avi
, False)
409 for i
, c
in enumerate(coe
):
410 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
411 for i
, a
in enumerate(av
):
412 fprs
[i
+0] = fp64toselectable(a
)
414 with
Program(lst
, bigendian
=False) as program
:
415 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
416 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
417 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
418 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
419 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
420 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
421 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
422 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
424 # work out the results with the twin mul/add-sub
425 res
= transform_inner_radix2_idct(avi
, coe
)
427 for i
, expected
in enumerate(res
):
428 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
429 for i
, expected
in enumerate(res
):
430 # convert to Power single
431 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
432 expected
= float(expected
)
433 actual
= float(sim
.fpr(i
))
434 # approximate error calculation, good enough test
435 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
436 # and the rounding is different
437 err
= abs((actual
- expected
) / expected
)
438 print ("err", i
, err
)
439 self
.assertTrue(err
< 1e-6)
441 def test_sv_remap_fpmadds_idct_outer_8(self
):
442 """>>> lst = ["svshape 8, 1, 1, 11, 0",
443 "svremap 27, 0, 1, 2, 1, 0, 0",
444 "sv.fadds 0.v, 0.v, 0.v"
446 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
447 for inverse-DCT, does the iterative overlapped ADDs
449 SVP64 "REMAP" in Butterfly Mode.
451 lst
= SVP64Asm( ["svshape 8, 1, 1, 11, 0", # outer butterfly
452 "svremap 27, 0, 1, 2, 1, 0, 0",
453 "sv.fadds 0.v, 0.v, 0.v"
457 # array and coefficients to test
458 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
461 levels
= n
.bit_length() - 1
463 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
464 av
= [avi
[ri
[i
]] for i
in range(n
)]
465 av
= halfrev2(av
, True)
469 for i
, a
in enumerate(av
):
470 fprs
[i
+0] = fp64toselectable(a
)
472 with
Program(lst
, bigendian
=False) as program
:
473 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
474 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
475 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
476 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
477 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
478 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
479 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
480 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
482 # outer iterative sum
483 res
= transform_outer_radix2_idct(avi
)
485 for i
, expected
in enumerate(res
):
486 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
487 for i
, expected
in enumerate(res
):
488 # convert to Power single
489 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
490 expected
= float(expected
)
491 actual
= float(sim
.fpr(i
))
492 # approximate error calculation, good enough test
493 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
494 # and the rounding is different
495 err
= abs((actual
- expected
) / expected
)
496 print ("err", i
, err
)
497 self
.assertTrue(err
< 1e-6)
499 def test_sv_remap_fpmadds_dct_outer_8(self
):
500 """>>> lst = ["svshape 8, 1, 1, 3, 0",
501 "svremap 27, 1, 0, 2, 0, 1, 0",
502 "sv.fadds 0.v, 0.v, 0.v"
504 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
505 for DCT, does the iterative overlapped ADDs
507 SVP64 "REMAP" in Butterfly Mode.
509 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
510 "svremap 27, 1, 0, 2, 0, 1, 0",
511 "sv.fadds 0.v, 0.v, 0.v"
515 # array and coefficients to test
516 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
520 for i
, a
in enumerate(av
):
521 fprs
[i
+0] = fp64toselectable(a
)
523 with
Program(lst
, bigendian
=False) as program
:
524 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
525 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
526 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
527 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
528 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
529 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
530 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
531 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
533 # outer iterative sum
534 res
= transform_outer_radix2_dct(av
)
536 for i
, expected
in enumerate(res
):
537 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
538 for i
, expected
in enumerate(res
):
539 # convert to Power single
540 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
541 expected
= float(expected
)
542 actual
= float(sim
.fpr(i
))
543 # approximate error calculation, good enough test
544 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
545 # and the rounding is different
546 err
= abs((actual
- expected
) / expected
)
547 print ("err", i
, err
)
548 self
.assertTrue(err
< 1e-6)
550 def test_sv_remap_fpmadds_idct_8(self
):
551 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
552 "svshape 8, 1, 1, 11, 0",
553 "sv.fadds 0.v, 0.v, 0.v",
554 "svshape 8, 1, 1, 10, 0",
555 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
557 runs a full in-place 8-long O(N log2 N) inverse-DCT, both
558 inner and outer butterfly "REMAP" schedules.
560 lst
= SVP64Asm( ["svremap 27, 0, 1, 2, 1, 0, 1",
561 "svshape 8, 1, 1, 11, 0",
562 "sv.fadds 0.v, 0.v, 0.v",
563 "svshape 8, 1, 1, 10, 0",
564 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
568 # array and coefficients to test
569 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
571 levels
= n
.bit_length() - 1
573 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
574 av
= [avi
[ri
[i
]] for i
in range(n
)]
575 av
= halfrev2(av
, True)
577 # divide first value by 2.0, manually. rev and halfrev should
582 print ("input data pre idct", av
)
588 for i
in range(n
//size
):
589 for ci
in range(halfsize
):
590 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
595 for i
, a
in enumerate(av
):
596 fprs
[i
+0] = fp64toselectable(a
)
597 for i
, c
in enumerate(ctable
):
598 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
600 with
Program(lst
, bigendian
=False) as program
:
601 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
602 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
603 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
604 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
605 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
606 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
607 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
608 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
611 expected
= [-15.793373940443367, 27.46969091937703,
612 -24.712331606496313, 27.03601462756265]
614 #res = inverse_transform_iter(avi)
615 res
= inverse_transform2(avi
)
616 #res = transform_outer_radix2_idct(avi)
618 for i
, expected
in enumerate(res
):
619 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
620 for i
, expected
in enumerate(res
):
621 # convert to Power single
622 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
623 expected
= float(expected
)
624 actual
= float(sim
.fpr(i
))
625 # approximate error calculation, good enough test
626 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
627 # and the rounding is different
628 err
= abs((actual
- expected
) / expected
)
629 print ("err", i
, err
)
630 self
.assertTrue(err
< 1e-5)
632 def test_sv_remap_fpmadds_dct_8(self
):
633 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
634 "svshape 8, 1, 1, 2, 0",
635 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
636 "svshape 8, 1, 1, 3, 0",
637 "sv.fadds 0.v, 0.v, 0.v"
639 runs a full in-place 8-long O(N log2 N) DCT, both
640 inner and outer butterfly "REMAP" schedules.
642 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
643 "svshape 8, 1, 1, 2, 0",
644 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
645 "svshape 8, 1, 1, 3, 0",
646 "sv.fadds 0.v, 0.v, 0.v"
650 # array and coefficients to test
651 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
653 levels
= n
.bit_length() - 1
655 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
656 av
= halfrev2(avi
, False)
657 av
= [av
[ri
[i
]] for i
in range(n
)]
662 for i
in range(n
//size
):
663 for ci
in range(halfsize
):
664 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
669 for i
, a
in enumerate(av
):
670 fprs
[i
+0] = fp64toselectable(a
)
671 for i
, c
in enumerate(ctable
):
672 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
674 with
Program(lst
, bigendian
=False) as program
:
675 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
676 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
677 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
678 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
679 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
680 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
681 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
682 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
684 # outer iterative sum
685 res
= transform2(avi
)
687 for i
, expected
in enumerate(res
):
688 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
689 for i
, expected
in enumerate(res
):
690 # convert to Power single
691 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
692 expected
= float(expected
)
693 actual
= float(sim
.fpr(i
))
694 # approximate error calculation, good enough test
695 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
696 # and the rounding is different
697 err
= abs((actual
- expected
) / expected
)
698 print ("err", i
, err
)
699 self
.assertTrue(err
< 1e-5)
701 def test_sv_remap_dct_cos_precompute_8(self
):
702 """pre-computes a DCT COS table, deliberately using a lot of
703 registers so as to be able to see what is going on (dumping all
706 the simpler (scalar) version is in test_caller_transcendentals.py
707 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
708 need the new version of fcfids which doesn't spam memory with
711 lst
= SVP64Asm(["svshape 8, 1, 1, 2, 0",
712 "svremap 0, 0, 0, 2, 0, 1, 1",
713 "sv.svstep 4.v, 4, 1", # svstep get vector of ci
714 "sv.svstep 16.v, 3, 1", # svstep get vector of step
716 "setvl 0, 0, 12, 0, 1, 1",
719 "sv.fcfids 48.v, 64.v",
723 "sv.fcfids 24.v, 12.v",
724 "sv.fadds 0.v, 24.v, 43", # plus 0.5
725 "sv.fmuls 0.v, 0.v, 41", # times PI
726 "sv.fdivs 0.v, 0.v, 48.v", # div size
727 "sv.fcoss 80.v, 0.v",
728 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
735 fprs
[43] = fp64toselectable(0.5) # 0.5
736 fprs
[41] = fp64toselectable(math
.pi
) # pi
737 fprs
[44] = fp64toselectable(2.0) # 2.0
745 for i
in range(n
//size
):
746 for ci
in range(halfsize
):
747 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
750 with
Program(lst
, bigendian
=False) as program
:
751 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
755 for i
in range(len(ctable
)):
756 actual
= float(sim
.fpr(i
+24))
757 print ("i", i
, actual
)
759 for i
in range(len(ctable
)):
760 actual
= float(sim
.fpr(i
+48))
761 print ("i", i
, actual
)
763 for i
in range(len(ctable
)):
764 actual
= float(sim
.fpr(i
))
765 print ("i", i
, actual
)
766 for i
in range(len(ctable
)):
767 expected
= 1.0/ctable
[i
]
768 actual
= float(sim
.fpr(i
+80))
769 err
= abs((actual
- expected
) / expected
)
770 print ("i", i
, actual
, "1/expect", 1/expected
,
771 "expected", expected
,
773 self
.assertTrue(err
< 1e-6)
775 def test_sv_remap_dct_cos_precompute_inner_8(self
):
776 """pre-computes a DCT COS table, using the shorter costable
777 indices schedule. turns out, some COS values are repeated
778 in each layer of the DCT butterfly.
780 the simpler (scalar) version is in test_caller_transcendentals.py
781 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
782 need the new version of fcfids which doesn't spam memory with
785 lst
= SVP64Asm(["svshape 8, 1, 1, 5, 0",
786 "svremap 0, 0, 0, 2, 0, 1, 1",
787 "sv.svstep 4.v, 3, 1", # svstep get vector of ci
788 "sv.svstep 16.v, 2, 1", # svstep get vector of step
790 "setvl 0, 0, 7, 0, 1, 1",
793 "sv.fcfids 48.v, 64.v",
797 "sv.fcfids 24.v, 12.v",
798 "sv.fadds 0.v, 24.v, 43", # plus 0.5
799 "sv.fmuls 0.v, 0.v, 41", # times PI
800 "sv.fdivs 0.v, 0.v, 48.v", # div size
801 "sv.fcoss 80.v, 0.v",
802 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
809 fprs
[43] = fp64toselectable(0.5) # 0.5
810 fprs
[41] = fp64toselectable(math
.pi
) # pi
811 fprs
[44] = fp64toselectable(2.0) # 2.0
819 for ci
in range(halfsize
):
820 coeff
= math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0
822 print ("coeff", "ci", ci
, "size", size
,
823 "i/n", (ci
+0.5), 1.0/coeff
)
826 with
Program(lst
, bigendian
=False) as program
:
827 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
831 for i
in range(len(ctable
)):
832 actual
= float(sim
.fpr(i
+24))
833 print ("i", i
, actual
)
835 for i
in range(len(ctable
)):
836 actual
= float(sim
.fpr(i
+48))
837 print ("i", i
, actual
)
839 for i
in range(len(ctable
)):
840 actual
= float(sim
.fpr(i
))
841 print ("i", i
, actual
)
842 for i
in range(len(ctable
)):
843 expected
= 1.0/ctable
[i
]
844 actual
= float(sim
.fpr(i
+80))
845 err
= abs((actual
- expected
) / expected
)
846 print ("i", i
, actual
, "1/expect", 1/expected
,
847 "expected", expected
,
849 self
.assertTrue(err
< 1e-6)
851 def test_sv_remap_fpmadds_dct_8_mode_4(self
):
852 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
853 "svshape 8, 1, 1, 4, 0",
854 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
855 "svshape 8, 1, 1, 3, 0",
856 "sv.fadds 0.v, 0.v, 0.v"
858 runs a full in-place 8-long O(N log2 N) DCT, both
859 inner and outer butterfly "REMAP" schedules.
860 uses shorter tables: FRC also needs to be on a Schedule
862 lst
= SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
863 "svshape 8, 1, 1, 4, 0",
864 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
865 "svshape 8, 1, 1, 3, 0",
866 "sv.fadds 0.v, 0.v, 0.v"
870 # array and coefficients to test
871 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
873 levels
= n
.bit_length() - 1
875 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
876 av
= halfrev2(avi
, False)
877 av
= [av
[ri
[i
]] for i
in range(n
)]
882 for ci
in range(halfsize
):
883 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
888 for i
, a
in enumerate(av
):
889 fprs
[i
+0] = fp64toselectable(a
)
890 for i
, c
in enumerate(ctable
):
891 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
893 with
Program(lst
, bigendian
=False) as program
:
894 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
895 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
896 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
897 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
898 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
899 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
900 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
901 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
903 # outer iterative sum
904 res
= transform2(avi
)
906 for i
, expected
in enumerate(res
):
907 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
908 for i
, expected
in enumerate(res
):
909 # convert to Power single
910 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
911 expected
= float(expected
)
912 actual
= float(sim
.fpr(i
))
913 # approximate error calculation, good enough test
914 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
915 # and the rounding is different
916 err
= abs((actual
- expected
) / expected
)
917 print ("err", i
, err
)
918 self
.assertTrue(err
< 1e-5)
920 def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self
):
921 """>>> lst = [# LOAD bit-reversed with half-swap
922 "svshape 8, 1, 1, 6, 0",
923 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
924 "sv.lfssh 0.v, 4(1), 2",
925 # Inner butterfly, twin +/- MUL-ADD-SUB
926 "svremap 31, 1, 0, 2, 0, 1, 1",
927 "svshape 8, 1, 1, 4, 0",
928 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
929 # Outer butterfly, iterative sum
930 "svshape 8, 1, 1, 3, 0",
931 "sv.fadds 0.v, 0.v, 0.v"
933 runs a full in-place 8-long O(N log2 N) DCT, both
934 inner and outer butterfly "REMAP" schedules, and using
935 bit-reversed half-swapped LDs.
936 uses shorter pre-loaded COS tables: FRC also needs to be on a
939 lst
= SVP64Asm( ["addi 1, 0, 0x000",
940 "svshape 8, 1, 1, 6, 0",
941 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
942 "sv.lfssh 0.v, 4(1), 2",
943 "svremap 31, 1, 0, 2, 0, 1, 1",
944 "svshape 8, 1, 1, 4, 0",
945 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
946 "svshape 8, 1, 1, 3, 0",
947 "sv.fadds 0.v, 0.v, 0.v"
951 # array and coefficients to test
952 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
954 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
955 # LD will bring them in, in the correct order.
958 for i
, a
in enumerate(avi
):
959 a
= SINGLE(fp64toselectable(a
)).value
962 val
= a
# accumulate for next iteration
964 mem
[(i
//2)*8] = val |
(a
<< 32) # even and odd 4-byte in same 8
966 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
972 for ci
in range(halfsize
):
973 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
978 for i
, c
in enumerate(ctable
):
979 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
981 with
Program(lst
, bigendian
=False) as program
:
982 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
984 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
985 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
986 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
987 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
988 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
989 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
990 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
992 # outer iterative sum
993 res
= transform2(avi
)
995 for i
, expected
in enumerate(res
):
996 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
998 for i
, expected
in enumerate(res
):
999 # convert to Power single
1000 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
1001 expected
= float(expected
)
1002 actual
= float(sim
.fpr(i
))
1003 # approximate error calculation, good enough test
1004 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
1005 # and the rounding is different
1006 err
= abs((actual
- expected
) / expected
)
1007 print ("err", i
, err
)
1008 self
.assertTrue(err
< 1e-5)
1010 def run_tst_program(self
, prog
, initial_regs
=None,
1014 if initial_regs
is None:
1015 initial_regs
= [0] * 32
1016 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
1017 initial_fprs
=initial_fprs
,
1021 simulator
.gpr
.dump()
1023 simulator
.fpr
.dump()
1028 if __name__
== "__main__":