4 from nmutil
.formaltest
import FHDLTestCase
5 from openpower
.decoder
.helpers
import SINGLE
, fp64toselectable
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.isa
.remap_dct_yield
import (
8 halfrev2
, inverse_transform2
, iterate_dct_inner_butterfly_indices
,
9 iterate_dct_outer_butterfly_indices
, reverse_bits
, transform2
)
10 from openpower
.decoder
.isa
.test_caller
import run_tst
11 from openpower
.decoder
.isafunctions
.double2single
import (
12 ISACallerFnHelper_double2single
)
13 from openpower
.decoder
.selectable_int
import SelectableInt
14 from openpower
.simulator
.program
import Program
15 from openpower
.insndb
.asm
import SVP64Asm
17 # really bad hack. need to access the DOUBLE2SINGLE function auto-generated
19 fph
= ISACallerFnHelper_double2single(XLEN
=64, FPSCR
=None)
20 fph
.namespace
= {'FPSCR': fph
.FPSCR
,
28 def transform_inner_radix2_dct(vec
, ctable
):
33 print("transform2", n
)
34 levels
= n
.bit_length() - 1
36 # reference (read/write) the in-place data in *reverse-bit-order*
38 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
40 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
41 # TODO: merge these two
42 vec
= halfrev2(vec
, False)
43 vec
= [vec
[ri
[i
]] for i
in range(n
)]
57 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
59 SVSHAPE0
.submode2
= 0b01
61 SVSHAPE0
.offset
= 0 # experiment with different offset, here
62 SVSHAPE0
.invxyz
= [1, 0, 0] # inversion if desired
65 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
67 SVSHAPE1
.submode2
= 0b01
69 SVSHAPE1
.offset
= 0 # experiment with different offset, here
70 SVSHAPE1
.invxyz
= [1, 0, 0] # inversion if desired
72 # enumerate over the iterator function, getting new indices
73 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
74 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
75 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
76 t1
, t2
= vec
[jl
], vec
[jh
]
79 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
80 print("coeff", "ci", k
,
82 "i/n", (k
+0.5), 1.0/coeff
,
83 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
84 "end", bin(jle
), bin(jhe
))
85 if jle
== 0b111: # all loops end
91 def transform_outer_radix2_dct(vec
):
96 print("transform2", n
)
97 levels
= n
.bit_length() - 1
108 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
109 SVSHAPE0
.submode2
= 0b100
112 SVSHAPE0
.offset
= 0 # experiment with different offset, here
113 SVSHAPE0
.invxyz
= [0, 0, 0] # inversion if desired
114 # j+halfstep schedule
116 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
118 SVSHAPE1
.submode2
= 0b100
120 SVSHAPE1
.offset
= 0 # experiment with different offset, here
121 SVSHAPE1
.invxyz
= [0, 0, 0] # inversion if desired
123 # enumerate over the iterator function, getting new indices
124 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
125 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
126 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
127 print("itersum jr", jl
, jh
,
128 "end", bin(jle
), bin(jhe
))
130 if jle
== 0b111: # all loops end
133 print("transform2 result", vec
)
138 def transform_inner_radix2_idct(vec
, ctable
):
143 print("transform2", n
)
144 levels
= n
.bit_length() - 1
146 # pretend we LDed data in half-swapped order
147 vec
= halfrev2(vec
, False)
161 SVSHAPE0
.lims
= [xdim
, 0b000001, 1]
163 SVSHAPE0
.submode2
= 0b11
165 SVSHAPE0
.offset
= 0 # experiment with different offset, here
166 SVSHAPE0
.invxyz
= [0, 0, 0] # inversion if desired
167 # j+halfstep schedule
169 SVSHAPE1
.lims
= [xdim
, 0b000001, 1]
171 SVSHAPE1
.submode2
= 0b11
173 SVSHAPE1
.offset
= 0 # experiment with different offset, here
174 SVSHAPE1
.invxyz
= [0, 0, 0] # inversion if desired
176 # enumerate over the iterator function, getting new indices
177 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
178 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
179 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
180 t1
, t2
= vec
[jl
], vec
[jh
]
182 vec
[jl
] = t1
+ t2
/coeff
183 vec
[jh
] = t1
- t2
/coeff
184 print("coeff", "ci", k
,
186 "i/n", (k
+0.5), 1.0/coeff
,
187 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
188 "end", bin(jle
), bin(jhe
))
189 if jle
== 0b111: # all loops end
195 def transform_outer_radix2_idct(vec
):
200 print("transform2-inv", n
)
201 levels
= n
.bit_length() - 1
208 # reference (read/write) the in-place data in *reverse-bit-order*
210 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
212 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
213 # TODO: merge these two
214 vec
= [vec
[ri
[i
]] for i
in range(n
)]
215 vec
= halfrev2(vec
, True)
221 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
222 SVSHAPE0
.submode2
= 0b011
225 SVSHAPE0
.offset
= 0 # experiment with different offset, here
226 SVSHAPE0
.invxyz
= [1, 0, 1] # inversion if desired
227 # j+halfstep schedule
229 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
231 SVSHAPE1
.submode2
= 0b011
233 SVSHAPE1
.offset
= 0 # experiment with different offset, here
234 SVSHAPE1
.invxyz
= [1, 0, 1] # inversion if desired
236 # enumerate over the iterator function, getting new indices
237 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
238 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
239 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
240 print("itersum jr", jl
, jh
,
241 "end", bin(jle
), bin(jhe
))
243 if jle
== 0b111: # all loops end
246 print("transform2-inv result", vec
)
251 class DCTTestCase(FHDLTestCase
):
253 def _check_regs(self
, sim
, expected
):
255 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
257 def test_sv_ffadds_dct(self
):
258 """>>> lst = ["sv.fdmadds *0, *0, *0, *8"
260 four in-place vector adds, four in-place vector mul-subs
262 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
263 FRS to perform the two multiplies. one add, one subtract.
265 sv.fdadds FRT, FRA, FRC, FRB actually does:
267 fsubs FRT+vl, FRA, FRB+vl
269 lst
= SVP64Asm(["sv.fdmadds *0, *8, *0"
273 # cheat here with these values, they're selected so that
274 # rounding errors do not occur. sigh.
276 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
277 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
278 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
280 # work out the results with the twin add-sub
281 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
282 fprs
[i
+0] = fp64toselectable(a
)
283 fprs
[i
+4] = fp64toselectable(b
)
284 fprs
[i
+8] = fp64toselectable(c
)
285 # this isn't quite a perfect replication of the
286 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
287 # and FPSUB32 directly to be honest.
290 diff
= fph
.DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
293 tc
= fph
.DOUBLE2SINGLE(fp64toselectable(t
)) # cvt to Power single
294 uc
= fph
.DOUBLE2SINGLE(fp64toselectable(u
)) # from double
296 print("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
298 # SVSTATE (in this case, VL=2)
299 svstate
= SVP64State()
301 svstate
.maxvl
= 4 # MAXVL
302 print("SVSTATE", bin(svstate
.asint()))
304 with
Program(lst
, bigendian
=False) as program
:
305 sim
= self
.run_tst_program(program
, svstate
=svstate
,
307 # confirm that the results are as expected
308 for i
, (t
, u
) in enumerate(res
):
309 a
= float(sim
.fpr(i
+0))
310 b
= float(sim
.fpr(i
+4))
313 print("DCT", i
, "in", a
, b
, "res", t
, u
)
314 for i
, (t
, u
) in enumerate(res
):
315 self
.assertEqual(sim
.fpr(i
+0), t
)
316 self
.assertEqual(sim
.fpr(i
+4), u
)
318 def test_sv_remap_fpmadds_idct_outer_8(self
, stride
=2):
319 """>>> lst = ["svshape 8, 1, 1, 11, 0",
320 "svremap 27, 0, 1, 2, 1, 0, 0",
321 "sv.fadds *0, *0, *0"
323 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
324 for inverse-DCT, does the iterative overlapped ADDs
326 SVP64 "REMAP" in Butterfly Mode.
328 lst
= SVP64Asm(["svshape 8, 1, %d, 11, 0" % stride
, # outer butterfly
329 "svremap 27, 0, 1, 2, 1, 0, 0",
330 "sv.fadds *0, *0, *0"
334 # array and coefficients to test
335 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
338 levels
= n
.bit_length() - 1
340 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
341 av
= [avi
[ri
[i
]] for i
in range(n
)]
342 av
= halfrev2(av
, True)
346 for i
, a
in enumerate(av
):
347 fprs
[i
*stride
+0] = fp64toselectable(a
)
349 with
Program(lst
, bigendian
=False) as program
:
350 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
351 print("spr svshape0", sim
.spr
['SVSHAPE0'])
352 print(" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
353 print(" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
354 print(" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
355 print("spr svshape1", sim
.spr
['SVSHAPE1'])
356 print("spr svshape2", sim
.spr
['SVSHAPE2'])
357 print("spr svshape3", sim
.spr
['SVSHAPE3'])
359 # outer iterative sum
360 res
= transform_outer_radix2_idct(avi
)
362 for i
, expected
in enumerate(res
):
363 print("i", i
*stride
, float(sim
.fpr(i
*stride
)),
364 "expected", expected
)
365 for i
, expected
in enumerate(res
):
366 # convert to Power single
367 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
368 expected
= float(expected
)
369 actual
= float(sim
.fpr(i
*stride
))
370 # approximate error calculation, good enough test
371 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
372 # and the rounding is different
373 err
= abs((actual
- expected
) / expected
)
375 self
.assertTrue(err
< 1e-6)
377 def test_sv_remap_fpmadds_dct_outer_8(self
, stride
=2):
378 """>>> lst = ["svshape 8, 1, 1, 3, 0",
379 "svremap 27, 1, 0, 2, 0, 1, 0",
380 "sv.fadds *0, *0, *0"
382 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
383 for DCT, does the iterative overlapped ADDs
385 SVP64 "REMAP" in Butterfly Mode.
387 lst
= SVP64Asm(["svshape 8, 1, %d, 3, 0" % stride
,
388 "svremap 27, 1, 0, 2, 0, 1, 0",
389 "sv.fadds *0, *0, *0"
393 # array and coefficients to test
394 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
398 for i
, a
in enumerate(av
):
399 fprs
[i
*stride
+0] = fp64toselectable(a
)
401 with
Program(lst
, bigendian
=False) as program
:
402 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
403 print("spr svshape0", sim
.spr
['SVSHAPE0'])
404 print(" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
405 print(" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
406 print(" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
407 print("spr svshape1", sim
.spr
['SVSHAPE1'])
408 print("spr svshape2", sim
.spr
['SVSHAPE2'])
409 print("spr svshape3", sim
.spr
['SVSHAPE3'])
411 # outer iterative sum
412 res
= transform_outer_radix2_dct(av
)
414 for i
, expected
in enumerate(res
):
415 print("i", i
*stride
, float(sim
.fpr(i
*stride
)),
416 "expected", expected
)
417 for i
, expected
in enumerate(res
):
418 # convert to Power single
419 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
420 expected
= float(expected
)
421 actual
= float(sim
.fpr(i
*stride
))
422 # approximate error calculation, good enough test
423 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
424 # and the rounding is different
425 err
= abs((actual
- expected
) / expected
)
427 self
.assertTrue(err
< 1e-6)
429 def test_sv_remap_dct_cos_precompute_inner_8(self
):
430 """pre-computes a DCT COS table, using the shorter costable
431 indices schedule. turns out, some COS values are repeated
432 in each layer of the DCT butterfly.
434 the simpler (scalar) version is in test_caller_transcendentals.py
435 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
436 need the new version of fcfids which doesn't spam memory with
439 lst
= SVP64Asm(["svshape 8, 1, 1, 5, 0",
440 "svremap 0, 0, 0, 2, 0, 1, 1",
441 "sv.svstep *4, 0, 3, 1", # svstep get vector of ci
442 "sv.svstep *16, 0, 2, 1", # svstep get vector of step
444 "setvl 0, 0, 7, 0, 1, 1",
447 "sv.fcfids *48, *64",
451 "sv.fcfids *24, *12",
452 "sv.fadds *0, *24, 43", # plus 0.5
453 "sv.fmuls *0, *0, 41", # times PI
454 "sv.fdivs *0, *0, *48", # div size
456 "sv.fdivs *80, 43, *80", # div 0.5 / x
463 fprs
[43] = fp64toselectable(0.5) # 0.5
464 fprs
[41] = fp64toselectable(math
.pi
) # pi
465 fprs
[44] = fp64toselectable(2.0) # 2.0
473 for ci
in range(halfsize
):
474 coeff
= math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0
476 print("coeff", "ci", ci
, "size", size
,
477 "i/n", (ci
+0.5), 1.0/coeff
)
480 with
Program(lst
, bigendian
=False) as program
:
481 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
485 for i
in range(len(ctable
)):
486 actual
= float(sim
.fpr(i
+24))
487 print("i", i
, actual
)
489 for i
in range(len(ctable
)):
490 actual
= float(sim
.fpr(i
+48))
491 print("i", i
, actual
)
493 for i
in range(len(ctable
)):
494 actual
= float(sim
.fpr(i
))
495 print("i", i
, actual
)
496 for i
in range(len(ctable
)):
497 expected
= 1.0/ctable
[i
]
498 actual
= float(sim
.fpr(i
+80))
499 err
= abs((actual
- expected
) / expected
)
500 print("i", i
, actual
, "1/expect", 1/expected
,
501 "expected", expected
,
503 self
.assertTrue(err
< 1e-6)
505 def test_sv_remap_fpmadds_dct_8_mode_4(self
, stride
=2):
506 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
507 "svshape 8, 1, 1, 4, 0",
508 "sv.fdmadds *0, *16, *0"
509 "svshape 8, 1, 1, 3, 0",
510 "sv.fadds *0, *0, *0"
512 runs a full in-place 8-long O(N log2 N) DCT, both
513 inner and outer butterfly "REMAP" schedules.
514 uses shorter tables: FRC also needs to be on a Schedule
516 lst
= SVP64Asm(["svremap 31, 1, 0, 2, 0, 1, 1",
517 "svshape 8, 1, %d, 4, 0" % stride
,
518 "sv.fdmadds *0, *16, *0",
519 "svshape 8, 1, %d, 3, 0" % stride
,
520 "sv.fadds *0, *0, *0"
524 # array and coefficients to test
525 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
527 levels
= n
.bit_length() - 1
529 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
530 av
= halfrev2(avi
, False)
531 av
= [av
[ri
[i
]] for i
in range(n
)]
536 for ci
in range(halfsize
):
537 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
542 for i
, a
in enumerate(av
):
543 fprs
[i
*stride
+0] = fp64toselectable(a
)
544 for i
, c
in enumerate(ctable
):
545 fprs
[i
+16] = fp64toselectable(1.0 / c
) # invert
547 with
Program(lst
, bigendian
=False) as program
:
548 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
549 print("spr svshape0", sim
.spr
['SVSHAPE0'])
550 print(" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
551 print(" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
552 print(" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
553 print("spr svshape1", sim
.spr
['SVSHAPE1'])
554 print("spr svshape2", sim
.spr
['SVSHAPE2'])
555 print("spr svshape3", sim
.spr
['SVSHAPE3'])
557 # outer iterative sum
558 res
= transform2(avi
)
560 for i
, expected
in enumerate(res
):
561 print("i", i
*stride
, float(sim
.fpr(i
*stride
)),
562 "expected", expected
)
563 for i
, expected
in enumerate(res
):
564 # convert to Power single
565 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
566 expected
= float(expected
)
567 actual
= float(sim
.fpr(i
*stride
))
568 # approximate error calculation, good enough test
569 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
570 # and the rounding is different
571 err
= abs((actual
- expected
) / expected
)
573 self
.assertTrue(err
< 1e-5)
575 def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self
, stride
=1):
576 """>>> lst = [# LOAD bit-reversed with half-swap
577 "svshape 8, 1, 1, 6, 0",
578 "svremap 1, 0, 0, 0, 0, 0, 0",
579 "sv.lfs/els *0, 4(1)",
580 # Inner butterfly, twin +/- MUL-ADD-SUB
581 "svremap 31, 1, 0, 2, 0, 1, 1",
582 "svshape 8, 1, 1, 4, 0",
583 "sv.fdmadds *0, *32, *0"
584 # Outer butterfly, iterative sum
585 "svshape 8, 1, 1, 3, 0",
586 "sv.fadds *0, *0, *0"
588 runs a full in-place 8-long O(N log2 N) DCT, both
589 inner and outer butterfly "REMAP" schedules, and using
590 bit-reversed half-swapped LDs.
591 uses shorter pre-loaded COS tables: FRC also needs to be on a
594 lst
= SVP64Asm(["addi 1, 0, 0x000",
595 "svshape 8, 1, %d, 6, 0" % stride
,
596 "svremap 1, 0, 0, 0, 0, 0, 0",
597 "sv.lfs/els *0, 4(1)",
598 "svremap 31, 1, 0, 2, 0, 1, 1",
599 "svshape 8, 1, %d, 4, 0" % stride
,
600 "sv.fdmadds *0, *32, *0",
601 "svshape 8, 1, %d, 3, 0" % stride
,
602 "sv.fadds *0, *0, *0"
606 # array and coefficients to test
607 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
609 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
610 # LD will bring them in, in the correct order.
613 for i
, a
in enumerate(avi
):
614 a
= SINGLE(fp64toselectable(a
)).value
617 val
= a
# accumulate for next iteration
619 # even and odd 4-byte in same 8
620 mem
[(i
//2)*8] = val |
(a
<< 32)
622 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
628 for ci
in range(halfsize
):
629 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
634 for i
, c
in enumerate(ctable
):
635 fprs
[i
+32] = fp64toselectable(1.0 / c
) # invert
637 with
Program(lst
, bigendian
=False) as program
:
638 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
640 print("spr svshape0", sim
.spr
['SVSHAPE0'])
641 print(" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
642 print(" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
643 print(" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
644 print("spr svshape1", sim
.spr
['SVSHAPE1'])
645 print("spr svshape2", sim
.spr
['SVSHAPE2'])
646 print("spr svshape3", sim
.spr
['SVSHAPE3'])
648 # outer iterative sum
649 res
= transform2(avi
)
651 for i
, expected
in enumerate(res
):
652 print("i", i
*stride
, float(sim
.fpr(i
*stride
)),
653 "expected", expected
)
655 for i
, expected
in enumerate(res
):
656 # convert to Power single
657 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
658 expected
= float(expected
)
659 actual
= float(sim
.fpr(i
*stride
))
660 # approximate error calculation, good enough test
661 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
662 # and the rounding is different
663 err
= abs((actual
- expected
) / expected
)
665 self
.assertTrue(err
< 1e-5)
667 def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self
):
668 """>>> lst = [# LOAD bit-reversed with half-swap
669 "svshape 8, 1, 1, 14, 0",
670 "svremap 1, 0, 0, 0, 0, 0, 0",
671 "sv.lfs/els *0, 4(1)",
672 # Outer butterfly, iterative sum
673 "svremap 31, 0, 1, 2, 1, 0, 1",
674 "svshape 8, 1, 1, 11, 0",
675 "sv.fadds *0, *0, *0",
676 # Inner butterfly, twin +/- MUL-ADD-SUB
677 "svshape 8, 1, 1, 12, 0",
678 "sv.ffmadds *0, *8, *0"
680 runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
681 inner and outer butterfly "REMAP" schedules, and using
682 bit-reversed half-swapped LDs.
683 uses shorter pre-loaded COS tables: FRC also needs to be on a
684 Schedule in the sv.ffmadds instruction
686 lst
= SVP64Asm(["addi 1, 0, 0x000",
687 "svshape 8, 1, 1, 14, 0",
688 "svremap 1, 0, 0, 0, 0, 0, 0",
689 "sv.lfs/els *0, 4(1)",
690 "svremap 31, 0, 1, 2, 1, 0, 1",
691 "svshape 8, 1, 1, 11, 0",
692 "sv.fadds *0, *0, *0",
693 "svshape 8, 1, 1, 12, 0",
694 "sv.ffmadds *0, *8, *0"
698 # array and coefficients to test
699 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
701 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
702 # LD will bring them in, in the correct order.
705 for i
, a
in enumerate(avi
):
706 if i
== 0: # first element, divide by 2
708 a
= SINGLE(fp64toselectable(a
)).value
711 val
= a
# accumulate for next iteration
713 # even and odd 4-byte in same 8
714 mem
[(i
//2)*8] = val |
(a
<< 32)
716 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
722 for ci
in range(halfsize
):
723 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
728 for i
, c
in enumerate(ctable
):
729 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
731 with
Program(lst
, bigendian
=False) as program
:
732 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
734 print("spr svshape0", sim
.spr
['SVSHAPE0'])
735 print(" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
736 print(" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
737 print(" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
738 print("spr svshape1", sim
.spr
['SVSHAPE1'])
739 print("spr svshape2", sim
.spr
['SVSHAPE2'])
740 print("spr svshape3", sim
.spr
['SVSHAPE3'])
742 # outer iterative sum
743 res
= inverse_transform2(avi
)
745 for i
, expected
in enumerate(res
):
746 print("i", i
, float(sim
.fpr(i
)), "expected", expected
)
748 for i
, expected
in enumerate(res
):
749 # convert to Power single
750 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
751 expected
= float(expected
)
752 actual
= float(sim
.fpr(i
))
753 # approximate error calculation, good enough test
754 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
755 # and the rounding is different
756 err
= abs((actual
- expected
) / expected
)
758 self
.assertTrue(err
< 1e-5)
760 def run_tst_program(self
, prog
, initial_regs
=None,
764 if initial_regs
is None:
765 initial_regs
= [0] * 32
766 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
767 initial_fprs
=initial_fprs
,
778 if __name__
== "__main__":