1 from nmigen
import Module
, Signal
2 from nmigen
.sim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import ISACallerFnHelper
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
16 transform2
, inverse_transform2
)
17 from openpower
.decoder
.isa
.fastdctlee
import inverse_transform_iter
21 # really bad hack. need to access the DOUBLE2SINGLE function auto-generated
23 fph
= ISACallerFnHelper(XLEN
=64)
26 def transform_inner_radix2_dct(vec
, ctable
):
31 print ("transform2", n
)
32 levels
= n
.bit_length() - 1
34 # reference (read/write) the in-place data in *reverse-bit-order*
36 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
38 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
39 # TODO: merge these two
40 vec
= halfrev2(vec
, False)
41 vec
= [vec
[ri
[i
]] for i
in range(n
)]
55 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
57 SVSHAPE0
.submode2
= 0b01
59 SVSHAPE0
.offset
= 0 # experiment with different offset, here
60 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
63 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
65 SVSHAPE1
.submode2
= 0b01
67 SVSHAPE1
.offset
= 0 # experiment with different offset, here
68 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
70 # enumerate over the iterator function, getting new indices
71 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
72 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
73 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
74 t1
, t2
= vec
[jl
], vec
[jh
]
77 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
78 print ("coeff", "ci", k
,
80 "i/n", (k
+0.5), 1.0/coeff
,
81 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
82 "end", bin(jle
), bin(jhe
))
83 if jle
== 0b111: # all loops end
89 def transform_outer_radix2_dct(vec
):
94 print ("transform2", n
)
95 levels
= n
.bit_length() - 1
106 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
107 SVSHAPE0
.submode2
= 0b100
110 SVSHAPE0
.offset
= 0 # experiment with different offset, here
111 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
112 # j+halfstep schedule
114 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
116 SVSHAPE1
.submode2
= 0b100
118 SVSHAPE1
.offset
= 0 # experiment with different offset, here
119 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
121 # enumerate over the iterator function, getting new indices
122 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
123 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
124 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
125 print ("itersum jr", jl
, jh
,
126 "end", bin(jle
), bin(jhe
))
128 if jle
== 0b111: # all loops end
131 print("transform2 result", vec
)
136 def transform_inner_radix2_idct(vec
, ctable
):
141 print ("transform2", n
)
142 levels
= n
.bit_length() - 1
144 # pretend we LDed data in half-swapped order
145 vec
= halfrev2(vec
, False)
159 SVSHAPE0
.lims
= [xdim
, 0b000001, 1]
161 SVSHAPE0
.submode2
= 0b11
163 SVSHAPE0
.offset
= 0 # experiment with different offset, here
164 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
165 # j+halfstep schedule
167 SVSHAPE1
.lims
= [xdim
, 0b000001, 1]
169 SVSHAPE1
.submode2
= 0b11
171 SVSHAPE1
.offset
= 0 # experiment with different offset, here
172 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
174 # enumerate over the iterator function, getting new indices
175 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
176 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
177 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
178 t1
, t2
= vec
[jl
], vec
[jh
]
180 vec
[jl
] = t1
+ t2
/coeff
181 vec
[jh
] = t1
- t2
/coeff
182 print ("coeff", "ci", k
,
184 "i/n", (k
+0.5), 1.0/coeff
,
185 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
186 "end", bin(jle
), bin(jhe
))
187 if jle
== 0b111: # all loops end
193 def transform_outer_radix2_idct(vec
):
198 print ("transform2-inv", n
)
199 levels
= n
.bit_length() - 1
206 # reference (read/write) the in-place data in *reverse-bit-order*
208 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
210 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
211 # TODO: merge these two
212 vec
= [vec
[ri
[i
]] for i
in range(n
)]
213 vec
= halfrev2(vec
, True)
219 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
220 SVSHAPE0
.submode2
= 0b011
223 SVSHAPE0
.offset
= 0 # experiment with different offset, here
224 SVSHAPE0
.invxyz
= [1,0,1] # inversion if desired
225 # j+halfstep schedule
227 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
229 SVSHAPE1
.submode2
= 0b011
231 SVSHAPE1
.offset
= 0 # experiment with different offset, here
232 SVSHAPE1
.invxyz
= [1,0,1] # inversion if desired
234 # enumerate over the iterator function, getting new indices
235 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
236 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
237 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
238 print ("itersum jr", jl
, jh
,
239 "end", bin(jle
), bin(jhe
))
241 if jle
== 0b111: # all loops end
244 print("transform2-inv result", vec
)
249 class DCTTestCase(FHDLTestCase
):
251 def _check_regs(self
, sim
, expected
):
253 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
255 def test_sv_ffadds_dct(self
):
256 """>>> lst = ["sv.fdmadds *0, *0, *0, *8"
258 four in-place vector adds, four in-place vector mul-subs
260 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
261 FRS to perform the two multiplies. one add, one subtract.
263 sv.fdadds FRT, FRA, FRC, FRB actually does:
265 fsubs FRT+vl, FRA, FRB+vl
267 lst
= SVP64Asm(["sv.fdmadds *0, *0, *0, *8"
271 # cheat here with these values, they're selected so that
272 # rounding errors do not occur. sigh.
274 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
275 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
276 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
278 # work out the results with the twin add-sub
279 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
280 fprs
[i
+0] = fp64toselectable(a
)
281 fprs
[i
+4] = fp64toselectable(b
)
282 fprs
[i
+8] = fp64toselectable(c
)
283 # this isn't quite a perfect replication of the
284 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
285 # and FPSUB32 directly to be honest.
288 diff
= fph
.DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
291 tc
= fph
.DOUBLE2SINGLE(fp64toselectable(t
)) # cvt to Power single
292 uc
= fph
.DOUBLE2SINGLE(fp64toselectable(u
)) # from double
294 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
296 # SVSTATE (in this case, VL=2)
297 svstate
= SVP64State()
299 svstate
.maxvl
= 4 # MAXVL
300 print ("SVSTATE", bin(svstate
.asint()))
302 with
Program(lst
, bigendian
=False) as program
:
303 sim
= self
.run_tst_program(program
, svstate
=svstate
,
305 # confirm that the results are as expected
306 for i
, (t
, u
) in enumerate(res
):
307 a
= float(sim
.fpr(i
+0))
308 b
= float(sim
.fpr(i
+4))
311 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
312 for i
, (t
, u
) in enumerate(res
):
313 self
.assertEqual(sim
.fpr(i
+0), t
)
314 self
.assertEqual(sim
.fpr(i
+4), u
)
def test_sv_remap_fpmadds_dct_inner_4_stride_2(self):
    """Run the 4-long inner DCT butterfly check with ``stride=2``.

    The name carries the stride that is actually passed.  Under its
    previous name (``..._stride_1``) this method was rebound by the
    identically named method that follows it in the class body, so
    the stride=2 case was never collected or run.
    """
    self.sv_remap_fpmadds_dct_inner_4(stride=2)
def test_sv_remap_fpmadds_dct_inner_4_stride_1(self):
    """Run the 4-long inner DCT butterfly check with ``stride=1``."""
    # delegate to the shared, stride-parameterised test body
    run_case = self.sv_remap_fpmadds_dct_inner_4
    run_case(stride=1)
322 def sv_remap_fpmadds_dct_inner_4(self
, stride
=2):
323 """>>> lst = ["svshape 4, 1, 1, 2, 0",
324 "svremap 27, 1, 0, 2, 0, 1, 0",
325 "sv.fdmadds *0, *0, *0, *32"
327 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
330 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
331 (3 inputs, 2 outputs)
333 Note that the coefficient (FRC) is not on a "schedule", it
334 is straight Vectorised (0123...) because DCT coefficients
335 cannot be shared between butterfly layers (due to +0.5)
337 lst
= SVP64Asm( ["svshape 4, 1, %d, 2, 0" % stride
,
338 "svremap 27, 1, 0, 2, 0, 1, 0",
339 "sv.fdmadds *0, *0, *0, *16"
343 # array and coefficients to test
345 av
= [7.0, -9.8, 3.0, -32.3]
346 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
348 levels
= n
.bit_length() - 1
350 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
351 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
352 av
= halfrev2(avi
, False)
353 av
= [av
[ri
[i
]] for i
in range(n
)]
357 for i
, c
in enumerate(coe
):
358 fprs
[i
+16] = fp64toselectable(1.0 / c
) # invert
359 for i
, a
in enumerate(av
):
360 fprs
[i
*stride
+0] = fp64toselectable(a
)
362 with
Program(lst
, bigendian
=False) as program
:
363 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
364 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
365 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
366 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
367 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
368 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
369 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
370 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
372 # work out the results with the twin mul/add-sub
373 res
= transform_inner_radix2_dct(avi
, coe
)
375 for i
, expected
in enumerate(res
):
376 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
377 "expected", expected
)
378 for i
, expected
in enumerate(res
):
379 # convert to Power single
380 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
381 expected
= float(expected
)
382 actual
= float(sim
.fpr(i
*stride
))
383 # approximate error calculation, good enough test
384 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
385 # and the rounding is different
386 err
= abs((actual
- expected
) / expected
)
387 print ("err", i
, err
)
388 self
.assertTrue(err
< 1e-6)
def test_sv_remap_fpmadds_idct_inner_4_stride_2(self):
    """Run the 4-long inner inverse-DCT butterfly check with ``stride=2``.

    The name carries the stride that is actually passed.  Under its
    previous name (``..._stride_1``) this method was rebound by the
    identically named method that follows it in the class body, so
    the stride=2 case was never collected or run.
    """
    self.sv_remap_fpmadds_idct_inner_4(stride=2)
def test_sv_remap_fpmadds_idct_inner_4_stride_1(self):
    """Run the 4-long inner inverse-DCT butterfly check with ``stride=1``."""
    # delegate to the shared, stride-parameterised test body
    run_case = self.sv_remap_fpmadds_idct_inner_4
    run_case(stride=1)
396 def sv_remap_fpmadds_idct_inner_4(self
, stride
=2):
397 """>>> lst = ["svshape 4, 1, 1, 10, 0",
398 "svremap 27, 0, 1, 2, 1, 0, 0",
399 "sv.ffmadds *0, *0, *0, *8"
401 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
404 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
405 (3 inputs, 2 outputs)
407 Note that the coefficient (FRC) is not on a "schedule", it
408 is straight Vectorised (0123...) because DCT coefficients
409 cannot be shared between butterfly layers (due to +0.5)
411 lst
= SVP64Asm( ["svshape 4, 1, %d, 10, 0" % stride
,
412 "svremap 27, 0, 1, 2, 1, 0, 0",
413 "sv.ffmadds *0, *0, *0, *16"
417 # array and coefficients to test
419 levels
= n
.bit_length() - 1
420 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
421 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
422 av
= halfrev2(avi
, False)
426 for i
, c
in enumerate(coe
):
427 fprs
[i
+16] = fp64toselectable(1.0 / c
) # invert
428 for i
, a
in enumerate(av
):
429 fprs
[i
*stride
+0] = fp64toselectable(a
)
431 with
Program(lst
, bigendian
=False) as program
:
432 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
433 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
434 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
435 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
436 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
437 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
438 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
439 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
441 # work out the results with the twin mul/add-sub
442 res
= transform_inner_radix2_idct(avi
, coe
)
444 for i
, expected
in enumerate(res
):
445 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
446 "expected", expected
)
447 for i
, expected
in enumerate(res
):
448 # convert to Power single
449 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
450 expected
= float(expected
)
451 actual
= float(sim
.fpr(i
*stride
))
452 # approximate error calculation, good enough test
453 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
454 # and the rounding is different
455 err
= abs((actual
- expected
) / expected
)
456 print ("err", i
, err
)
457 self
.assertTrue(err
< 1e-6)
459 def test_sv_remap_fpmadds_idct_outer_8(self
, stride
=2):
460 """>>> lst = ["svshape 8, 1, 1, 11, 0",
461 "svremap 27, 0, 1, 2, 1, 0, 0",
462 "sv.fadds *0, *0, *0"
464 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
465 for inverse-DCT, does the iterative overlapped ADDs
467 SVP64 "REMAP" in Butterfly Mode.
469 lst
= SVP64Asm( ["svshape 8, 1, %d, 11, 0" % stride
, # outer butterfly
470 "svremap 27, 0, 1, 2, 1, 0, 0",
471 "sv.fadds *0, *0, *0"
475 # array and coefficients to test
476 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
479 levels
= n
.bit_length() - 1
481 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
482 av
= [avi
[ri
[i
]] for i
in range(n
)]
483 av
= halfrev2(av
, True)
487 for i
, a
in enumerate(av
):
488 fprs
[i
*stride
+0] = fp64toselectable(a
)
490 with
Program(lst
, bigendian
=False) as program
:
491 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
492 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
493 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
494 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
495 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
496 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
497 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
498 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
500 # outer iterative sum
501 res
= transform_outer_radix2_idct(avi
)
503 for i
, expected
in enumerate(res
):
504 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
505 "expected", expected
)
506 for i
, expected
in enumerate(res
):
507 # convert to Power single
508 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
509 expected
= float(expected
)
510 actual
= float(sim
.fpr(i
*stride
))
511 # approximate error calculation, good enough test
512 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
513 # and the rounding is different
514 err
= abs((actual
- expected
) / expected
)
515 print ("err", i
, err
)
516 self
.assertTrue(err
< 1e-6)
518 def test_sv_remap_fpmadds_dct_outer_8(self
, stride
=2):
519 """>>> lst = ["svshape 8, 1, 1, 3, 0",
520 "svremap 27, 1, 0, 2, 0, 1, 0",
521 "sv.fadds *0, *0, *0"
523 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
524 for DCT, does the iterative overlapped ADDs
526 SVP64 "REMAP" in Butterfly Mode.
528 lst
= SVP64Asm( ["svshape 8, 1, %d, 3, 0" % stride
,
529 "svremap 27, 1, 0, 2, 0, 1, 0",
530 "sv.fadds *0, *0, *0"
534 # array and coefficients to test
535 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
539 for i
, a
in enumerate(av
):
540 fprs
[i
*stride
+0] = fp64toselectable(a
)
542 with
Program(lst
, bigendian
=False) as program
:
543 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
544 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
545 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
546 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
547 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
548 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
549 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
550 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
552 # outer iterative sum
553 res
= transform_outer_radix2_dct(av
)
555 for i
, expected
in enumerate(res
):
556 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
557 "expected", expected
)
558 for i
, expected
in enumerate(res
):
559 # convert to Power single
560 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
561 expected
= float(expected
)
562 actual
= float(sim
.fpr(i
*stride
))
563 # approximate error calculation, good enough test
564 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
565 # and the rounding is different
566 err
= abs((actual
- expected
) / expected
)
567 print ("err", i
, err
)
568 self
.assertTrue(err
< 1e-6)
570 def test_sv_remap_fpmadds_idct_8(self
, stride
=2):
571 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
572 "svshape 8, 1, 1, 11, 0",
573 "sv.fadds *0, *0, *0",
574 "svshape 8, 1, 1, 10, 0",
575 "sv.ffmadds *0, *0, *0, *16"
577 runs a full in-place 8-long O(N log2 N) inverse-DCT, both
578 inner and outer butterfly "REMAP" schedules.
580 lst
= SVP64Asm( ["svremap 27, 0, 1, 2, 1, 0, 1",
581 "svshape 8, 1, %d, 11, 0" % stride
,
582 "sv.fadds *0, *0, *0",
583 "svshape 8, 1, %d, 10, 0" % stride
,
584 "sv.ffmadds *0, *0, *0, *16"
588 # array and coefficients to test
589 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
591 levels
= n
.bit_length() - 1
593 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
594 av
= [avi
[ri
[i
]] for i
in range(n
)]
595 av
= halfrev2(av
, True)
597 # divide first value by 2.0, manually. rev and halfrev should
602 print ("input data pre idct", av
)
608 for i
in range(n
//size
):
609 for ci
in range(halfsize
):
610 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
615 for i
, a
in enumerate(av
):
616 fprs
[i
*stride
+0] = fp64toselectable(a
)
617 for i
, c
in enumerate(ctable
):
618 fprs
[i
+16] = fp64toselectable(1.0 / c
) # invert
620 with
Program(lst
, bigendian
=False) as program
:
621 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
622 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
623 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
624 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
625 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
626 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
627 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
628 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
631 expected
= [-15.793373940443367, 27.46969091937703,
632 -24.712331606496313, 27.03601462756265]
634 #res = inverse_transform_iter(avi)
635 res
= inverse_transform2(avi
)
636 #res = transform_outer_radix2_idct(avi)
638 for i
, expected
in enumerate(res
):
639 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
640 "expected", expected
)
641 for i
, expected
in enumerate(res
):
642 # convert to Power single
643 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
644 expected
= float(expected
)
645 actual
= float(sim
.fpr(i
*stride
))
646 # approximate error calculation, good enough test
647 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
648 # and the rounding is different
649 err
= abs((actual
- expected
) / expected
)
650 print ("err", i
*stride
, err
)
651 self
.assertTrue(err
< 1e-5)
653 def test_sv_remap_fpmadds_dct_8(self
, stride
=2):
654 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
655 "svshape 8, 1, 1, 2, 0",
656 "sv.fdmadds *0, *0, *0, *8"
657 "svshape 8, 1, 1, 3, 0",
658 "sv.fadds *0, *0, *0"
660 runs a full in-place 8-long O(N log2 N) DCT, both
661 inner and outer butterfly "REMAP" schedules.
663 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
664 "svshape 8, 1, %d, 2, 0" % stride
,
665 "sv.fdmadds *0, *0, *0, *16",
666 "svshape 8, 1, %d, 3, 0" % stride
,
667 "sv.fadds *0, *0, *0"
671 # array and coefficients to test
672 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
674 levels
= n
.bit_length() - 1
676 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
677 av
= halfrev2(avi
, False)
678 av
= [av
[ri
[i
]] for i
in range(n
)]
683 for i
in range(n
//size
):
684 for ci
in range(halfsize
):
685 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
690 for i
, a
in enumerate(av
):
691 fprs
[i
*stride
+0] = fp64toselectable(a
)
692 for i
, c
in enumerate(ctable
):
693 fprs
[i
+16] = fp64toselectable(1.0 / c
) # invert
695 with
Program(lst
, bigendian
=False) as program
:
696 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
697 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
698 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
699 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
700 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
701 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
702 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
703 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
705 # outer iterative sum
706 res
= transform2(avi
)
708 for i
, expected
in enumerate(res
):
709 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
710 "expected", expected
)
711 for i
, expected
in enumerate(res
):
712 # convert to Power single
713 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
714 expected
= float(expected
)
715 actual
= float(sim
.fpr(i
*stride
))
716 # approximate error calculation, good enough test
717 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
718 # and the rounding is different
719 err
= abs((actual
- expected
) / expected
)
720 print ("err", i
, err
)
721 self
.assertTrue(err
< 1e-5)
723 def test_sv_remap_dct_cos_precompute_8(self
):
724 """pre-computes a DCT COS table, deliberately using a lot of
725 registers so as to be able to see what is going on (dumping all
728 the simpler (scalar) version is in test_caller_transcendentals.py
729 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
730 need the new version of fcfids which doesn't spam memory with
733 lst
= SVP64Asm(["svshape 8, 1, 1, 2, 0",
734 "svremap 0, 0, 0, 2, 0, 1, 1",
735 "sv.svstep *4, 4, 1", # svstep get vector of ci
736 "sv.svstep *16, 3, 1", # svstep get vector of step
738 "setvl 0, 0, 12, 0, 1, 1",
741 "sv.fcfids *48, *64",
745 "sv.fcfids *24, *12",
746 "sv.fadds *0, *24, 43", # plus 0.5
747 "sv.fmuls *0, *0, 41", # times PI
748 "sv.fdivs *0, *0, *48", # div size
750 "sv.fdivs *80, 43, *80", # div 0.5 / x
757 fprs
[43] = fp64toselectable(0.5) # 0.5
758 fprs
[41] = fp64toselectable(math
.pi
) # pi
759 fprs
[44] = fp64toselectable(2.0) # 2.0
767 for i
in range(n
//size
):
768 for ci
in range(halfsize
):
769 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
772 with
Program(lst
, bigendian
=False) as program
:
773 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
777 for i
in range(len(ctable
)):
778 actual
= float(sim
.fpr(i
+24))
779 print ("i", i
, actual
)
781 for i
in range(len(ctable
)):
782 actual
= float(sim
.fpr(i
+48))
783 print ("i", i
, actual
)
785 for i
in range(len(ctable
)):
786 actual
= float(sim
.fpr(i
))
787 print ("i", i
, actual
)
788 for i
in range(len(ctable
)):
789 expected
= 1.0/ctable
[i
]
790 actual
= float(sim
.fpr(i
+80))
791 err
= abs((actual
- expected
) / expected
)
792 print ("i", i
, actual
, "1/expect", 1/expected
,
793 "expected", expected
,
795 self
.assertTrue(err
< 1e-6)
797 def test_sv_remap_dct_cos_precompute_inner_8(self
):
798 """pre-computes a DCT COS table, using the shorter costable
799 indices schedule. turns out, some COS values are repeated
800 in each layer of the DCT butterfly.
802 the simpler (scalar) version is in test_caller_transcendentals.py
803 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
804 need the new version of fcfids which doesn't spam memory with
807 lst
= SVP64Asm(["svshape 8, 1, 1, 5, 0",
808 "svremap 0, 0, 0, 2, 0, 1, 1",
809 "sv.svstep *4, 3, 1", # svstep get vector of ci
810 "sv.svstep *16, 2, 1", # svstep get vector of step
812 "setvl 0, 0, 7, 0, 1, 1",
815 "sv.fcfids *48, *64",
819 "sv.fcfids *24, *12",
820 "sv.fadds *0, *24, 43", # plus 0.5
821 "sv.fmuls *0, *0, 41", # times PI
822 "sv.fdivs *0, *0, *48", # div size
824 "sv.fdivs *80, 43, *80", # div 0.5 / x
831 fprs
[43] = fp64toselectable(0.5) # 0.5
832 fprs
[41] = fp64toselectable(math
.pi
) # pi
833 fprs
[44] = fp64toselectable(2.0) # 2.0
841 for ci
in range(halfsize
):
842 coeff
= math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0
844 print ("coeff", "ci", ci
, "size", size
,
845 "i/n", (ci
+0.5), 1.0/coeff
)
848 with
Program(lst
, bigendian
=False) as program
:
849 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
853 for i
in range(len(ctable
)):
854 actual
= float(sim
.fpr(i
+24))
855 print ("i", i
, actual
)
857 for i
in range(len(ctable
)):
858 actual
= float(sim
.fpr(i
+48))
859 print ("i", i
, actual
)
861 for i
in range(len(ctable
)):
862 actual
= float(sim
.fpr(i
))
863 print ("i", i
, actual
)
864 for i
in range(len(ctable
)):
865 expected
= 1.0/ctable
[i
]
866 actual
= float(sim
.fpr(i
+80))
867 err
= abs((actual
- expected
) / expected
)
868 print ("i", i
, actual
, "1/expect", 1/expected
,
869 "expected", expected
,
871 self
.assertTrue(err
< 1e-6)
873 def test_sv_remap_fpmadds_dct_8_mode_4(self
, stride
=2):
874 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
875 "svshape 8, 1, 1, 4, 0",
876 "sv.fdmadds *0, *0, *0, *8"
877 "svshape 8, 1, 1, 3, 0",
878 "sv.fadds *0, *0, *0"
880 runs a full in-place 8-long O(N log2 N) DCT, both
881 inner and outer butterfly "REMAP" schedules.
882 uses shorter tables: FRC also needs to be on a Schedule
884 lst
= SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
885 "svshape 8, 1, %d, 4, 0" % stride
,
886 "sv.fdmadds *0, *0, *0, *16",
887 "svshape 8, 1, %d, 3, 0" % stride
,
888 "sv.fadds *0, *0, *0"
892 # array and coefficients to test
893 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
895 levels
= n
.bit_length() - 1
897 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
898 av
= halfrev2(avi
, False)
899 av
= [av
[ri
[i
]] for i
in range(n
)]
904 for ci
in range(halfsize
):
905 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
910 for i
, a
in enumerate(av
):
911 fprs
[i
*stride
+0] = fp64toselectable(a
)
912 for i
, c
in enumerate(ctable
):
913 fprs
[i
+16] = fp64toselectable(1.0 / c
) # invert
915 with
Program(lst
, bigendian
=False) as program
:
916 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
917 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
918 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
919 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
920 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
921 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
922 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
923 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
925 # outer iterative sum
926 res
= transform2(avi
)
928 for i
, expected
in enumerate(res
):
929 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
930 "expected", expected
)
931 for i
, expected
in enumerate(res
):
932 # convert to Power single
933 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
934 expected
= float(expected
)
935 actual
= float(sim
.fpr(i
*stride
))
936 # approximate error calculation, good enough test
937 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
938 # and the rounding is different
939 err
= abs((actual
- expected
) / expected
)
940 print ("err", i
, err
)
941 self
.assertTrue(err
< 1e-5)
943 def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self
, stride
=1):
944 """>>> lst = [# LOAD bit-reversed with half-swap
945 "svshape 8, 1, 1, 6, 0",
946 "svremap 1, 0, 0, 0, 0, 0, 0",
947 "sv.lfs/els *0, 4(1)",
948 # Inner butterfly, twin +/- MUL-ADD-SUB
949 "svremap 31, 1, 0, 2, 0, 1, 1",
950 "svshape 8, 1, 1, 4, 0",
951 "sv.fdmadds *0, *0, *0, *8"
952 # Outer butterfly, iterative sum
953 "svshape 8, 1, 1, 3, 0",
954 "sv.fadds *0, *0, *0"
956 runs a full in-place 8-long O(N log2 N) DCT, both
957 inner and outer butterfly "REMAP" schedules, and using
958 bit-reversed half-swapped LDs.
959 uses shorter pre-loaded COS tables: FRC also needs to be on a
962 lst
= SVP64Asm( ["addi 1, 0, 0x000",
963 "svshape 8, 1, %d, 6, 0" % stride
,
964 "svremap 1, 0, 0, 0, 0, 0, 0",
965 "sv.lfs/els *0, 4(1)",
966 "svremap 31, 1, 0, 2, 0, 1, 1",
967 "svshape 8, 1, %d, 4, 0" % stride
,
968 "sv.fdmadds *0, *0, *0, *32",
969 "svshape 8, 1, %d, 3, 0" % stride
,
970 "sv.fadds *0, *0, *0"
974 # array and coefficients to test
975 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
977 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
978 # LD will bring them in, in the correct order.
981 for i
, a
in enumerate(avi
):
982 a
= SINGLE(fp64toselectable(a
)).value
985 val
= a
# accumulate for next iteration
987 mem
[(i
//2)*8] = val |
(a
<< 32) # even and odd 4-byte in same 8
989 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
995 for ci
in range(halfsize
):
996 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
1001 for i
, c
in enumerate(ctable
):
1002 fprs
[i
+32] = fp64toselectable(1.0 / c
) # invert
1004 with
Program(lst
, bigendian
=False) as program
:
1005 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
1007 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
1008 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
1009 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
1010 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
1011 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
1012 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
1013 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
1015 # outer iterative sum
1016 res
= transform2(avi
)
1018 for i
, expected
in enumerate(res
):
1019 print ("i", i
*stride
, float(sim
.fpr(i
*stride
)),
1020 "expected", expected
)
1022 for i
, expected
in enumerate(res
):
1023 # convert to Power single
1024 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
1025 expected
= float(expected
)
1026 actual
= float(sim
.fpr(i
*stride
))
1027 # approximate error calculation, good enough test
1028 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
1029 # and the rounding is different
1030 err
= abs((actual
- expected
) / expected
)
1031 print ("err", i
, err
)
1032 self
.assertTrue(err
< 1e-5)
1034 def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self
):
1035 """>>> lst = [# LOAD bit-reversed with half-swap
1036 "svshape 8, 1, 1, 14, 0",
1037 "svremap 1, 0, 0, 0, 0, 0, 0",
1038 "sv.lfs/els *0, 4(1)",
1039 # Outer butterfly, iterative sum
1040 "svremap 31, 0, 1, 2, 1, 0, 1",
1041 "svshape 8, 1, 1, 11, 0",
1042 "sv.fadds *0, *0, *0",
1043 # Inner butterfly, twin +/- MUL-ADD-SUB
1044 "svshape 8, 1, 1, 10, 0",
1045 "sv.ffmadds *0, *0, *0, *8"
1047 runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
1048 inner and outer butterfly "REMAP" schedules, and using
1049 bit-reversed half-swapped LDs.
1050 uses shorter pre-loaded COS tables: FRC also needs to be on a
1051 Schedule in the sv.ffmadds instruction
1053 lst
= SVP64Asm( ["addi 1, 0, 0x000",
1054 "svshape 8, 1, 1, 14, 0",
1055 "svremap 1, 0, 0, 0, 0, 0, 0",
1056 "sv.lfs/els *0, 4(1)",
1057 "svremap 31, 0, 1, 2, 1, 0, 1",
1058 "svshape 8, 1, 1, 11, 0",
1059 "sv.fadds *0, *0, *0",
1060 "svshape 8, 1, 1, 12, 0",
1061 "sv.ffmadds *0, *0, *0, *8"
1065 # array and coefficients to test
1066 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
1068 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
1069 # LD will bring them in, in the correct order.
1072 for i
, a
in enumerate(avi
):
1073 if i
== 0: # first element, divide by 2
1075 a
= SINGLE(fp64toselectable(a
)).value
1076 shift
= (i
% 2) == 1
1078 val
= a
# accumulate for next iteration
1080 mem
[(i
//2)*8] = val |
(a
<< 32) # even and odd 4-byte in same 8
1082 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
1087 halfsize
= size
// 2
1088 for ci
in range(halfsize
):
1089 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
1094 for i
, c
in enumerate(ctable
):
1095 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
1097 with
Program(lst
, bigendian
=False) as program
:
1098 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
1100 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
1101 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
1102 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
1103 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
1104 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
1105 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
1106 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
1108 # outer iterative sum
1109 res
= inverse_transform2(avi
)
1111 for i
, expected
in enumerate(res
):
1112 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
1114 for i
, expected
in enumerate(res
):
1115 # convert to Power single
1116 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
1117 expected
= float(expected
)
1118 actual
= float(sim
.fpr(i
))
1119 # approximate error calculation, good enough test
1120 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
1121 # and the rounding is different
1122 err
= abs((actual
- expected
) / expected
)
1123 print ("err", i
, err
)
1124 self
.assertTrue(err
< 1e-5)
1126 def run_tst_program(self
, prog
, initial_regs
=None,
1130 if initial_regs
is None:
1131 initial_regs
= [0] * 32
1132 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
1133 initial_fprs
=initial_fprs
,
1137 simulator
.gpr
.dump()
1139 simulator
.fpr
.dump()
1144 if __name__
== "__main__":