1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
21 def transform_inner_radix2_dct(vec
, ctable
):
26 print ("transform2", n
)
27 levels
= n
.bit_length() - 1
29 # reference (read/write) the in-place data in *reverse-bit-order*
31 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
33 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
34 # TODO: merge these two
35 vec
= halfrev2(vec
, False)
36 vec
= [vec
[ri
[i
]] for i
in range(n
)]
50 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
52 SVSHAPE0
.submode2
= 0b01
54 SVSHAPE0
.offset
= 0 # experiment with different offset, here
55 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
58 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
60 SVSHAPE1
.submode2
= 0b01
62 SVSHAPE1
.offset
= 0 # experiment with different offset, here
63 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
65 # enumerate over the iterator function, getting new indices
66 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
67 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
68 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
69 t1
, t2
= vec
[jl
], vec
[jh
]
72 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
73 print ("coeff", "ci", k
,
75 "i/n", (k
+0.5), 1.0/coeff
,
76 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
77 "end", bin(jle
), bin(jhe
))
78 if jle
== 0b111: # all loops end
84 def transform_outer_radix2_dct(vec
):
89 print ("transform2", n
)
90 levels
= n
.bit_length() - 1
101 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
102 SVSHAPE0
.submode2
= 0b100
105 SVSHAPE0
.offset
= 0 # experiment with different offset, here
106 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
107 # j+halfstep schedule
109 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
111 SVSHAPE1
.submode2
= 0b100
113 SVSHAPE1
.offset
= 0 # experiment with different offset, here
114 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
116 # enumerate over the iterator function, getting new indices
117 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
118 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
119 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
120 print ("itersum jr", jl
, jh
,
121 "end", bin(jle
), bin(jhe
))
123 if jle
== 0b111: # all loops end
126 print("transform2 result", vec
)
131 def transform_inner_radix2_idct(vec
, ctable
):
136 print ("transform2", n
)
137 levels
= n
.bit_length() - 1
139 # pretend we LDed data in half-swapped order
140 vec
= halfrev2(vec
, True)
154 SVSHAPE0
.lims
= [xdim
, 0b000001, 0]
156 SVSHAPE0
.submode2
= 0b11
158 SVSHAPE0
.offset
= 0 # experiment with different offset, here
159 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
160 # j+halfstep schedule
162 SVSHAPE1
.lims
= [xdim
, 0b000001, 0]
164 SVSHAPE1
.submode2
= 0b11
166 SVSHAPE1
.offset
= 0 # experiment with different offset, here
167 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
169 # enumerate over the iterator function, getting new indices
170 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
171 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
172 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
173 t1
, t2
= vec
[jl
], vec
[jh
]
176 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
177 print ("coeff", "ci", k
,
179 "i/n", (k
+0.5), 1.0/coeff
,
180 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
181 "end", bin(jle
), bin(jhe
))
182 if jle
== 0b111: # all loops end
188 def transform_outer_radix2_idct(vec
):
193 print ("transform2-inv", n
)
194 levels
= n
.bit_length() - 1
201 # reference (read/write) the in-place data in *reverse-bit-order*
203 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
205 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
206 # TODO: merge these two
207 vec
= [vec
[ri
[i
]] for i
in range(n
)]
208 vec
= halfrev2(vec
, True)
214 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
215 SVSHAPE0
.submode2
= 0b011
218 SVSHAPE0
.offset
= 0 # experiment with different offset, here
219 SVSHAPE0
.invxyz
= [1,0,1] # inversion if desired
220 # j+halfstep schedule
222 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
224 SVSHAPE1
.submode2
= 0b011
226 SVSHAPE1
.offset
= 0 # experiment with different offset, here
227 SVSHAPE1
.invxyz
= [1,0,1] # inversion if desired
229 # enumerate over the iterator function, getting new indices
230 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
231 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
232 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
233 print ("itersum jr", jl
, jh
,
234 "end", bin(jle
), bin(jhe
))
236 if jle
== 0b111: # all loops end
239 print("transform2-inv result", vec
)
244 class DCTTestCase(FHDLTestCase
):
246 def _check_regs(self
, sim
, expected
):
248 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
250 def test_sv_ffadds_dct(self
):
251 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
253 four in-place vector adds, four in-place vector mul-subs
255 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
256 FRS to perform the two multiplies. one add, one subtract.
258 sv.fdadds FRT, FRA, FRC, FRB actually does:
260 fsubs FRT+vl, FRA, FRB+vl
262 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
266 # cheat here with these values, they're selected so that
267 # rounding errors do not occur. sigh.
269 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
270 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
271 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
273 # work out the results with the twin add-sub
274 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
275 fprs
[i
+0] = fp64toselectable(a
)
276 fprs
[i
+4] = fp64toselectable(b
)
277 fprs
[i
+8] = fp64toselectable(c
)
278 # this isn't quite a perfect replication of the
279 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
280 # and FPSUB32 directly to be honest.
283 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
286 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
287 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
289 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
291 # SVSTATE (in this case, VL=2)
292 svstate
= SVP64State()
294 svstate
.maxvl
= 4 # MAXVL
295 print ("SVSTATE", bin(svstate
.asint()))
297 with
Program(lst
, bigendian
=False) as program
:
298 sim
= self
.run_tst_program(program
, svstate
=svstate
,
300 # confirm that the results are as expected
301 for i
, (t
, u
) in enumerate(res
):
302 a
= float(sim
.fpr(i
+0))
303 b
= float(sim
.fpr(i
+4))
306 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
307 for i
, (t
, u
) in enumerate(res
):
308 self
.assertEqual(sim
.fpr(i
+0), t
)
309 self
.assertEqual(sim
.fpr(i
+4), u
)
311 def test_sv_remap_fpmadds_dct_inner_4(self
):
312 """>>> lst = ["svshape 4, 1, 1, 2, 0",
313 "svremap 27, 1, 0, 2, 0, 1, 0",
314 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
316 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
319 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
320 (3 inputs, 2 outputs)
322 Note that the coefficient (FRC) is not on a "schedule", it
323 is straight Vectorised (0123...) because DCT coefficients
324 cannot be shared between butterfly layers (due to +0.5)
326 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
327 "svremap 27, 1, 0, 2, 0, 1, 0",
328 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
332 # array and coefficients to test
334 av
= [7.0, -9.8, 3.0, -32.3]
335 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
337 levels
= n
.bit_length() - 1
339 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
340 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
341 av
= halfrev2(avi
, False)
342 av
= [av
[ri
[i
]] for i
in range(n
)]
346 for i
, c
in enumerate(coe
):
347 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
348 for i
, a
in enumerate(av
):
349 fprs
[i
+0] = fp64toselectable(a
)
351 with
Program(lst
, bigendian
=False) as program
:
352 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
353 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
354 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
355 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
356 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
357 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
358 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
359 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
361 # work out the results with the twin mul/add-sub
362 res
= transform_inner_radix2_dct(avi
, coe
)
364 for i
, expected
in enumerate(res
):
365 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
366 for i
, expected
in enumerate(res
):
367 # convert to Power single
368 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
369 expected
= float(expected
)
370 actual
= float(sim
.fpr(i
))
371 # approximate error calculation, good enough test
372 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
373 # and the rounding is different
374 err
= abs((actual
- expected
) / expected
)
375 print ("err", i
, err
)
376 self
.assertTrue(err
< 1e-6)
378 def test_sv_remap_fpmadds_idct_outer_8(self
):
379 """>>> lst = ["svshape 8, 1, 1, 11, 0",
380 "svremap 27, 1, 0, 2, 0, 1, 0",
381 "sv.fadds 0.v, 0.v, 0.v"
383 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
384 for inverse-DCT, does the iterative overlapped ADDs
386 SVP64 "REMAP" in Butterfly Mode.
388 lst
= SVP64Asm( ["svshape 8, 1, 1, 11, 0",
389 "svremap 27, 1, 0, 2, 0, 1, 0",
390 "sv.fadds 0.v, 0.v, 0.v"
394 # array and coefficients to test
395 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
398 levels
= n
.bit_length() - 1
400 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
401 av
= [avi
[ri
[i
]] for i
in range(n
)]
402 av
= halfrev2(av
, True)
406 for i
, a
in enumerate(av
):
407 fprs
[i
+0] = fp64toselectable(a
)
409 with
Program(lst
, bigendian
=False) as program
:
410 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
411 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
412 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
413 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
414 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
415 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
416 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
417 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
419 # outer iterative sum
420 res
= transform_outer_radix2_idct(avi
)
422 for i
, expected
in enumerate(res
):
423 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
424 for i
, expected
in enumerate(res
):
425 # convert to Power single
426 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
427 expected
= float(expected
)
428 actual
= float(sim
.fpr(i
))
429 # approximate error calculation, good enough test
430 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
431 # and the rounding is different
432 err
= abs((actual
- expected
) / expected
)
433 print ("err", i
, err
)
434 self
.assertTrue(err
< 1e-6)
436 def test_sv_remap_fpmadds_dct_outer_8(self
):
437 """>>> lst = ["svshape 8, 1, 1, 3, 0",
438 "svremap 27, 1, 0, 2, 0, 1, 0",
439 "sv.fadds 0.v, 0.v, 0.v"
441 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
442 for DCT, does the iterative overlapped ADDs
444 SVP64 "REMAP" in Butterfly Mode.
446 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
447 "svremap 27, 1, 0, 2, 0, 1, 0",
448 "sv.fadds 0.v, 0.v, 0.v"
452 # array and coefficients to test
453 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
457 for i
, a
in enumerate(av
):
458 fprs
[i
+0] = fp64toselectable(a
)
460 with
Program(lst
, bigendian
=False) as program
:
461 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
462 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
463 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
464 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
465 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
466 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
467 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
468 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
470 # outer iterative sum
471 res
= transform_outer_radix2_dct(av
)
473 for i
, expected
in enumerate(res
):
474 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
475 for i
, expected
in enumerate(res
):
476 # convert to Power single
477 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
478 expected
= float(expected
)
479 actual
= float(sim
.fpr(i
))
480 # approximate error calculation, good enough test
481 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
482 # and the rounding is different
483 err
= abs((actual
- expected
) / expected
)
484 print ("err", i
, err
)
485 self
.assertTrue(err
< 1e-6)
487 def test_sv_remap_fpmadds_dct_8(self
):
488 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
489 "svshape 8, 1, 1, 2, 0",
490 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
491 "svshape 8, 1, 1, 3, 0",
492 "sv.fadds 0.v, 0.v, 0.v"
494 runs a full in-place 8-long O(N log2 N) DCT, both
495 inner and outer butterfly "REMAP" schedules.
497 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
498 "svshape 8, 1, 1, 2, 0",
499 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
500 "svshape 8, 1, 1, 3, 0",
501 "sv.fadds 0.v, 0.v, 0.v"
505 # array and coefficients to test
506 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
508 levels
= n
.bit_length() - 1
510 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
511 av
= halfrev2(avi
, False)
512 av
= [av
[ri
[i
]] for i
in range(n
)]
517 for i
in range(n
//size
):
518 for ci
in range(halfsize
):
519 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
524 for i
, a
in enumerate(av
):
525 fprs
[i
+0] = fp64toselectable(a
)
526 for i
, c
in enumerate(ctable
):
527 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
529 with
Program(lst
, bigendian
=False) as program
:
530 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
531 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
532 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
533 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
534 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
535 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
536 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
537 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
539 # outer iterative sum
540 res
= transform2(avi
)
542 for i
, expected
in enumerate(res
):
543 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
544 for i
, expected
in enumerate(res
):
545 # convert to Power single
546 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
547 expected
= float(expected
)
548 actual
= float(sim
.fpr(i
))
549 # approximate error calculation, good enough test
550 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
551 # and the rounding is different
552 err
= abs((actual
- expected
) / expected
)
553 print ("err", i
, err
)
554 self
.assertTrue(err
< 1e-5)
556 def test_sv_remap_dct_cos_precompute_8(self
):
557 """pre-computes a DCT COS table, deliberately using a lot of
558 registers so as to be able to see what is going on (dumping all
561 the simpler (scalar) version is in test_caller_transcendentals.py
562 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
563 need the new version of fcfids which doesn't spam memory with
566 lst
= SVP64Asm(["svshape 8, 1, 1, 2, 0",
567 "svremap 0, 0, 0, 2, 0, 1, 1",
568 "sv.svstep 4.v, 4, 1", # svstep get vector of ci
569 "sv.svstep 16.v, 3, 1", # svstep get vector of step
571 "setvl 0, 0, 12, 0, 1, 1",
574 "sv.fcfids 48.v, 64.v",
578 "sv.fcfids 24.v, 12.v",
579 "sv.fadds 0.v, 24.v, 43", # plus 0.5
580 "sv.fmuls 0.v, 0.v, 41", # times PI
581 "sv.fdivs 0.v, 0.v, 48.v", # div size
582 "sv.fcoss 80.v, 0.v",
583 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
590 fprs
[43] = fp64toselectable(0.5) # 0.5
591 fprs
[41] = fp64toselectable(math
.pi
) # pi
592 fprs
[44] = fp64toselectable(2.0) # 2.0
600 for i
in range(n
//size
):
601 for ci
in range(halfsize
):
602 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
605 with
Program(lst
, bigendian
=False) as program
:
606 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
610 for i
in range(len(ctable
)):
611 actual
= float(sim
.fpr(i
+24))
612 print ("i", i
, actual
)
614 for i
in range(len(ctable
)):
615 actual
= float(sim
.fpr(i
+48))
616 print ("i", i
, actual
)
618 for i
in range(len(ctable
)):
619 actual
= float(sim
.fpr(i
))
620 print ("i", i
, actual
)
621 for i
in range(len(ctable
)):
622 expected
= 1.0/ctable
[i
]
623 actual
= float(sim
.fpr(i
+80))
624 err
= abs((actual
- expected
) / expected
)
625 print ("i", i
, actual
, "1/expect", 1/expected
,
626 "expected", expected
,
628 self
.assertTrue(err
< 1e-6)
630 def test_sv_remap_dct_cos_precompute_inner_8(self
):
631 """pre-computes a DCT COS table, using the shorter costable
632 indices schedule. turns out, some COS values are repeated
633 in each layer of the DCT butterfly.
635 the simpler (scalar) version is in test_caller_transcendentals.py
636 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
637 need the new version of fcfids which doesn't spam memory with
640 lst
= SVP64Asm(["svshape 8, 1, 1, 5, 0",
641 "svremap 0, 0, 0, 2, 0, 1, 1",
642 "sv.svstep 4.v, 3, 1", # svstep get vector of ci
643 "sv.svstep 16.v, 2, 1", # svstep get vector of step
645 "setvl 0, 0, 7, 0, 1, 1",
648 "sv.fcfids 48.v, 64.v",
652 "sv.fcfids 24.v, 12.v",
653 "sv.fadds 0.v, 24.v, 43", # plus 0.5
654 "sv.fmuls 0.v, 0.v, 41", # times PI
655 "sv.fdivs 0.v, 0.v, 48.v", # div size
656 "sv.fcoss 80.v, 0.v",
657 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
664 fprs
[43] = fp64toselectable(0.5) # 0.5
665 fprs
[41] = fp64toselectable(math
.pi
) # pi
666 fprs
[44] = fp64toselectable(2.0) # 2.0
674 for ci
in range(halfsize
):
675 coeff
= math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0
677 print ("coeff", "ci", ci
, "size", size
,
678 "i/n", (ci
+0.5), 1.0/coeff
)
681 with
Program(lst
, bigendian
=False) as program
:
682 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
686 for i
in range(len(ctable
)):
687 actual
= float(sim
.fpr(i
+24))
688 print ("i", i
, actual
)
690 for i
in range(len(ctable
)):
691 actual
= float(sim
.fpr(i
+48))
692 print ("i", i
, actual
)
694 for i
in range(len(ctable
)):
695 actual
= float(sim
.fpr(i
))
696 print ("i", i
, actual
)
697 for i
in range(len(ctable
)):
698 expected
= 1.0/ctable
[i
]
699 actual
= float(sim
.fpr(i
+80))
700 err
= abs((actual
- expected
) / expected
)
701 print ("i", i
, actual
, "1/expect", 1/expected
,
702 "expected", expected
,
704 self
.assertTrue(err
< 1e-6)
706 def test_sv_remap_fpmadds_dct_8_mode_4(self
):
707 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
708 "svshape 8, 1, 1, 4, 0",
709 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
710 "svshape 8, 1, 1, 3, 0",
711 "sv.fadds 0.v, 0.v, 0.v"
713 runs a full in-place 8-long O(N log2 N) DCT, both
714 inner and outer butterfly "REMAP" schedules.
715 uses shorter tables: FRC also needs to be on a Schedule
717 lst
= SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
718 "svshape 8, 1, 1, 4, 0",
719 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
720 "svshape 8, 1, 1, 3, 0",
721 "sv.fadds 0.v, 0.v, 0.v"
725 # array and coefficients to test
726 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
728 levels
= n
.bit_length() - 1
730 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
731 av
= halfrev2(avi
, False)
732 av
= [av
[ri
[i
]] for i
in range(n
)]
737 for ci
in range(halfsize
):
738 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
743 for i
, a
in enumerate(av
):
744 fprs
[i
+0] = fp64toselectable(a
)
745 for i
, c
in enumerate(ctable
):
746 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
748 with
Program(lst
, bigendian
=False) as program
:
749 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
750 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
751 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
752 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
753 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
754 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
755 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
756 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
758 # outer iterative sum
759 res
= transform2(avi
)
761 for i
, expected
in enumerate(res
):
762 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
763 for i
, expected
in enumerate(res
):
764 # convert to Power single
765 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
766 expected
= float(expected
)
767 actual
= float(sim
.fpr(i
))
768 # approximate error calculation, good enough test
769 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
770 # and the rounding is different
771 err
= abs((actual
- expected
) / expected
)
772 print ("err", i
, err
)
773 self
.assertTrue(err
< 1e-5)
775 def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self
):
776 """>>> lst = [# LOAD bit-reversed with half-swap
777 "svshape 8, 1, 1, 6, 0",
778 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
779 "sv.lfsbr 0.v, 4(1), 2",
780 # Inner butterfly, twin +/- MUL-ADD-SUB
781 "svremap 31, 1, 0, 2, 0, 1, 1",
782 "svshape 8, 1, 1, 4, 0",
783 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
784 # Outer butterfly, iterative sum
785 "svshape 8, 1, 1, 3, 0",
786 "sv.fadds 0.v, 0.v, 0.v"
788 runs a full in-place 8-long O(N log2 N) DCT, both
789 inner and outer butterfly "REMAP" schedules, and using
790 bit-reversed half-swapped LDs.
791 uses shorter pre-loaded COS tables: FRC also needs to be on a
794 lst
= SVP64Asm( ["addi 1, 0, 0x000",
795 "svshape 8, 1, 1, 6, 0",
796 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
797 "sv.lfsbr 0.v, 4(1), 2",
798 "svremap 31, 1, 0, 2, 0, 1, 1",
799 "svshape 8, 1, 1, 4, 0",
800 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
801 "svshape 8, 1, 1, 3, 0",
802 "sv.fadds 0.v, 0.v, 0.v"
806 # array and coefficients to test
807 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
809 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
810 # LD will bring them in, in the correct order.
813 for i
, a
in enumerate(avi
):
814 a
= SINGLE(fp64toselectable(a
)).value
817 val
= a
# accumulate for next iteration
819 mem
[(i
//2)*8] = val |
(a
<< 32) # even and odd 4-byte in same 8
821 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
827 for ci
in range(halfsize
):
828 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
833 for i
, c
in enumerate(ctable
):
834 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
836 with
Program(lst
, bigendian
=False) as program
:
837 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
839 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
840 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
841 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
842 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
843 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
844 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
845 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
847 # outer iterative sum
848 res
= transform2(avi
)
850 for i
, expected
in enumerate(res
):
851 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
853 for i
, expected
in enumerate(res
):
854 # convert to Power single
855 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
856 expected
= float(expected
)
857 actual
= float(sim
.fpr(i
))
858 # approximate error calculation, good enough test
859 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
860 # and the rounding is different
861 err
= abs((actual
- expected
) / expected
)
862 print ("err", i
, err
)
863 self
.assertTrue(err
< 1e-5)
865 def run_tst_program(self
, prog
, initial_regs
=None,
869 if initial_regs
is None:
870 initial_regs
= [0] * 32
871 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
872 initial_fprs
=initial_fprs
,
883 if __name__
== "__main__":