1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import ISACallerFnHelper
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
16 transform2
, inverse_transform2
)
17 from openpower
.decoder
.isa
.fastdctlee
import inverse_transform_iter
21 # really bad hack. need to access the DOUBLE2SINGLE function auto-generated
23 fph
= ISACallerFnHelper(XLEN
=64)
26 def transform_inner_radix2_dct(vec
, ctable
):
31 print ("transform2", n
)
32 levels
= n
.bit_length() - 1
34 # reference (read/write) the in-place data in *reverse-bit-order*
36 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
38 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
39 # TODO: merge these two
40 vec
= halfrev2(vec
, False)
41 vec
= [vec
[ri
[i
]] for i
in range(n
)]
55 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
57 SVSHAPE0
.submode2
= 0b01
59 SVSHAPE0
.offset
= 0 # experiment with different offset, here
60 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
63 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
65 SVSHAPE1
.submode2
= 0b01
67 SVSHAPE1
.offset
= 0 # experiment with different offset, here
68 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
70 # enumerate over the iterator function, getting new indices
71 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
72 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
73 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
74 t1
, t2
= vec
[jl
], vec
[jh
]
77 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
78 print ("coeff", "ci", k
,
80 "i/n", (k
+0.5), 1.0/coeff
,
81 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
82 "end", bin(jle
), bin(jhe
))
83 if jle
== 0b111: # all loops end
89 def transform_outer_radix2_dct(vec
):
94 print ("transform2", n
)
95 levels
= n
.bit_length() - 1
106 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
107 SVSHAPE0
.submode2
= 0b100
110 SVSHAPE0
.offset
= 0 # experiment with different offset, here
111 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
112 # j+halfstep schedule
114 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
116 SVSHAPE1
.submode2
= 0b100
118 SVSHAPE1
.offset
= 0 # experiment with different offset, here
119 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
121 # enumerate over the iterator function, getting new indices
122 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
123 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
124 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
125 print ("itersum jr", jl
, jh
,
126 "end", bin(jle
), bin(jhe
))
128 if jle
== 0b111: # all loops end
131 print("transform2 result", vec
)
136 def transform_inner_radix2_idct(vec
, ctable
):
141 print ("transform2", n
)
142 levels
= n
.bit_length() - 1
144 # pretend we LDed data in half-swapped order
145 vec
= halfrev2(vec
, False)
159 SVSHAPE0
.lims
= [xdim
, 0b000001, 0]
161 SVSHAPE0
.submode2
= 0b11
163 SVSHAPE0
.offset
= 0 # experiment with different offset, here
164 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
165 # j+halfstep schedule
167 SVSHAPE1
.lims
= [xdim
, 0b000001, 0]
169 SVSHAPE1
.submode2
= 0b11
171 SVSHAPE1
.offset
= 0 # experiment with different offset, here
172 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
174 # enumerate over the iterator function, getting new indices
175 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
176 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
177 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
178 t1
, t2
= vec
[jl
], vec
[jh
]
180 vec
[jl
] = t1
+ t2
/coeff
181 vec
[jh
] = t1
- t2
/coeff
182 print ("coeff", "ci", k
,
184 "i/n", (k
+0.5), 1.0/coeff
,
185 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
186 "end", bin(jle
), bin(jhe
))
187 if jle
== 0b111: # all loops end
193 def transform_outer_radix2_idct(vec
):
198 print ("transform2-inv", n
)
199 levels
= n
.bit_length() - 1
206 # reference (read/write) the in-place data in *reverse-bit-order*
208 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
210 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
211 # TODO: merge these two
212 vec
= [vec
[ri
[i
]] for i
in range(n
)]
213 vec
= halfrev2(vec
, True)
219 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
220 SVSHAPE0
.submode2
= 0b011
223 SVSHAPE0
.offset
= 0 # experiment with different offset, here
224 SVSHAPE0
.invxyz
= [1,0,1] # inversion if desired
225 # j+halfstep schedule
227 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
229 SVSHAPE1
.submode2
= 0b011
231 SVSHAPE1
.offset
= 0 # experiment with different offset, here
232 SVSHAPE1
.invxyz
= [1,0,1] # inversion if desired
234 # enumerate over the iterator function, getting new indices
235 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
236 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
237 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
238 print ("itersum jr", jl
, jh
,
239 "end", bin(jle
), bin(jhe
))
241 if jle
== 0b111: # all loops end
244 print("transform2-inv result", vec
)
249 class DCTTestCase(FHDLTestCase
):
251 def _check_regs(self
, sim
, expected
):
253 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
255 def test_sv_ffadds_dct(self
):
256 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
258 four in-place vector adds, four in-place vector mul-subs
260 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
261 FRS to perform the two multiplies. one add, one subtract.
263 sv.fdadds FRT, FRA, FRC, FRB actually does:
265 fsubs FRT+vl, FRA, FRB+vl
267 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
271 # cheat here with these values, they're selected so that
272 # rounding errors do not occur. sigh.
274 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
275 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
276 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
278 # work out the results with the twin add-sub
279 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
280 fprs
[i
+0] = fp64toselectable(a
)
281 fprs
[i
+4] = fp64toselectable(b
)
282 fprs
[i
+8] = fp64toselectable(c
)
283 # this isn't quite a perfect replication of the
284 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
285 # and FPSUB32 directly to be honest.
288 diff
= fph
.DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
291 tc
= fph
.DOUBLE2SINGLE(fp64toselectable(t
)) # cvt to Power single
292 uc
= fph
.DOUBLE2SINGLE(fp64toselectable(u
)) # from double
294 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
296 # SVSTATE (in this case, VL=2)
297 svstate
= SVP64State()
299 svstate
.maxvl
= 4 # MAXVL
300 print ("SVSTATE", bin(svstate
.asint()))
302 with
Program(lst
, bigendian
=False) as program
:
303 sim
= self
.run_tst_program(program
, svstate
=svstate
,
305 # confirm that the results are as expected
306 for i
, (t
, u
) in enumerate(res
):
307 a
= float(sim
.fpr(i
+0))
308 b
= float(sim
.fpr(i
+4))
311 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
312 for i
, (t
, u
) in enumerate(res
):
313 self
.assertEqual(sim
.fpr(i
+0), t
)
314 self
.assertEqual(sim
.fpr(i
+4), u
)
316 def test_sv_remap_fpmadds_dct_inner_4(self
):
317 """>>> lst = ["svshape 4, 1, 1, 2, 0",
318 "svremap 27, 1, 0, 2, 0, 1, 0",
319 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
321 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
324 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
325 (3 inputs, 2 outputs)
327 Note that the coefficient (FRC) is not on a "schedule", it
328 is straight Vectorised (0123...) because DCT coefficients
329 cannot be shared between butterfly layers (due to +0.5)
331 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
332 "svremap 27, 1, 0, 2, 0, 1, 0",
333 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
337 # array and coefficients to test
339 av
= [7.0, -9.8, 3.0, -32.3]
340 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
342 levels
= n
.bit_length() - 1
344 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
345 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
346 av
= halfrev2(avi
, False)
347 av
= [av
[ri
[i
]] for i
in range(n
)]
351 for i
, c
in enumerate(coe
):
352 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
353 for i
, a
in enumerate(av
):
354 fprs
[i
+0] = fp64toselectable(a
)
356 with
Program(lst
, bigendian
=False) as program
:
357 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
358 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
359 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
360 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
361 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
362 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
363 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
364 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
366 # work out the results with the twin mul/add-sub
367 res
= transform_inner_radix2_dct(avi
, coe
)
369 for i
, expected
in enumerate(res
):
370 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
371 for i
, expected
in enumerate(res
):
372 # convert to Power single
373 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
374 expected
= float(expected
)
375 actual
= float(sim
.fpr(i
))
376 # approximate error calculation, good enough test
377 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
378 # and the rounding is different
379 err
= abs((actual
- expected
) / expected
)
380 print ("err", i
, err
)
381 self
.assertTrue(err
< 1e-6)
383 def test_sv_remap_fpmadds_idct_inner_4(self
):
384 """>>> lst = ["svshape 4, 1, 1, 10, 0",
385 "svremap 27, 0, 1, 2, 1, 0, 0",
386 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
388 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
391 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
392 (3 inputs, 2 outputs)
394 Note that the coefficient (FRC) is not on a "schedule", it
395 is straight Vectorised (0123...) because DCT coefficients
396 cannot be shared between butterfly layers (due to +0.5)
398 lst
= SVP64Asm( ["svshape 4, 1, 1, 10, 0",
399 "svremap 27, 0, 1, 2, 1, 0, 0",
400 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
404 # array and coefficients to test
406 levels
= n
.bit_length() - 1
407 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
408 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
409 av
= halfrev2(avi
, False)
413 for i
, c
in enumerate(coe
):
414 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
415 for i
, a
in enumerate(av
):
416 fprs
[i
+0] = fp64toselectable(a
)
418 with
Program(lst
, bigendian
=False) as program
:
419 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
420 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
421 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
422 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
423 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
424 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
425 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
426 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
428 # work out the results with the twin mul/add-sub
429 res
= transform_inner_radix2_idct(avi
, coe
)
431 for i
, expected
in enumerate(res
):
432 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
433 for i
, expected
in enumerate(res
):
434 # convert to Power single
435 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
436 expected
= float(expected
)
437 actual
= float(sim
.fpr(i
))
438 # approximate error calculation, good enough test
439 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
440 # and the rounding is different
441 err
= abs((actual
- expected
) / expected
)
442 print ("err", i
, err
)
443 self
.assertTrue(err
< 1e-6)
445 def test_sv_remap_fpmadds_idct_outer_8(self
):
446 """>>> lst = ["svshape 8, 1, 1, 11, 0",
447 "svremap 27, 0, 1, 2, 1, 0, 0",
448 "sv.fadds 0.v, 0.v, 0.v"
450 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
451 for inverse-DCT, does the iterative overlapped ADDs
453 SVP64 "REMAP" in Butterfly Mode.
455 lst
= SVP64Asm( ["svshape 8, 1, 1, 11, 0", # outer butterfly
456 "svremap 27, 0, 1, 2, 1, 0, 0",
457 "sv.fadds 0.v, 0.v, 0.v"
461 # array and coefficients to test
462 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
465 levels
= n
.bit_length() - 1
467 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
468 av
= [avi
[ri
[i
]] for i
in range(n
)]
469 av
= halfrev2(av
, True)
473 for i
, a
in enumerate(av
):
474 fprs
[i
+0] = fp64toselectable(a
)
476 with
Program(lst
, bigendian
=False) as program
:
477 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
478 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
479 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
480 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
481 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
482 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
483 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
484 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
486 # outer iterative sum
487 res
= transform_outer_radix2_idct(avi
)
489 for i
, expected
in enumerate(res
):
490 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
491 for i
, expected
in enumerate(res
):
492 # convert to Power single
493 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
494 expected
= float(expected
)
495 actual
= float(sim
.fpr(i
))
496 # approximate error calculation, good enough test
497 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
498 # and the rounding is different
499 err
= abs((actual
- expected
) / expected
)
500 print ("err", i
, err
)
501 self
.assertTrue(err
< 1e-6)
503 def test_sv_remap_fpmadds_dct_outer_8(self
):
504 """>>> lst = ["svshape 8, 1, 1, 3, 0",
505 "svremap 27, 1, 0, 2, 0, 1, 0",
506 "sv.fadds 0.v, 0.v, 0.v"
508 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
509 for DCT, does the iterative overlapped ADDs
511 SVP64 "REMAP" in Butterfly Mode.
513 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
514 "svremap 27, 1, 0, 2, 0, 1, 0",
515 "sv.fadds 0.v, 0.v, 0.v"
519 # array and coefficients to test
520 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
524 for i
, a
in enumerate(av
):
525 fprs
[i
+0] = fp64toselectable(a
)
527 with
Program(lst
, bigendian
=False) as program
:
528 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
529 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
530 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
531 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
532 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
533 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
534 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
535 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
537 # outer iterative sum
538 res
= transform_outer_radix2_dct(av
)
540 for i
, expected
in enumerate(res
):
541 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
542 for i
, expected
in enumerate(res
):
543 # convert to Power single
544 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
545 expected
= float(expected
)
546 actual
= float(sim
.fpr(i
))
547 # approximate error calculation, good enough test
548 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
549 # and the rounding is different
550 err
= abs((actual
- expected
) / expected
)
551 print ("err", i
, err
)
552 self
.assertTrue(err
< 1e-6)
554 def test_sv_remap_fpmadds_idct_8(self
):
555 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
556 "svshape 8, 1, 1, 11, 0",
557 "sv.fadds 0.v, 0.v, 0.v",
558 "svshape 8, 1, 1, 10, 0",
559 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
561 runs a full in-place 8-long O(N log2 N) inverse-DCT, both
562 inner and outer butterfly "REMAP" schedules.
564 lst
= SVP64Asm( ["svremap 27, 0, 1, 2, 1, 0, 1",
565 "svshape 8, 1, 1, 11, 0",
566 "sv.fadds 0.v, 0.v, 0.v",
567 "svshape 8, 1, 1, 10, 0",
568 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
572 # array and coefficients to test
573 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
575 levels
= n
.bit_length() - 1
577 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
578 av
= [avi
[ri
[i
]] for i
in range(n
)]
579 av
= halfrev2(av
, True)
581 # divide first value by 2.0, manually. rev and halfrev should
586 print ("input data pre idct", av
)
592 for i
in range(n
//size
):
593 for ci
in range(halfsize
):
594 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
599 for i
, a
in enumerate(av
):
600 fprs
[i
+0] = fp64toselectable(a
)
601 for i
, c
in enumerate(ctable
):
602 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
604 with
Program(lst
, bigendian
=False) as program
:
605 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
606 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
607 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
608 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
609 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
610 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
611 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
612 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
615 expected
= [-15.793373940443367, 27.46969091937703,
616 -24.712331606496313, 27.03601462756265]
618 #res = inverse_transform_iter(avi)
619 res
= inverse_transform2(avi
)
620 #res = transform_outer_radix2_idct(avi)
622 for i
, expected
in enumerate(res
):
623 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
624 for i
, expected
in enumerate(res
):
625 # convert to Power single
626 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
627 expected
= float(expected
)
628 actual
= float(sim
.fpr(i
))
629 # approximate error calculation, good enough test
630 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
631 # and the rounding is different
632 err
= abs((actual
- expected
) / expected
)
633 print ("err", i
, err
)
634 self
.assertTrue(err
< 1e-5)
636 def test_sv_remap_fpmadds_dct_8(self
):
637 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
638 "svshape 8, 1, 1, 2, 0",
639 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
640 "svshape 8, 1, 1, 3, 0",
641 "sv.fadds 0.v, 0.v, 0.v"
643 runs a full in-place 8-long O(N log2 N) DCT, both
644 inner and outer butterfly "REMAP" schedules.
646 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
647 "svshape 8, 1, 1, 2, 0",
648 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
649 "svshape 8, 1, 1, 3, 0",
650 "sv.fadds 0.v, 0.v, 0.v"
654 # array and coefficients to test
655 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
657 levels
= n
.bit_length() - 1
659 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
660 av
= halfrev2(avi
, False)
661 av
= [av
[ri
[i
]] for i
in range(n
)]
666 for i
in range(n
//size
):
667 for ci
in range(halfsize
):
668 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
673 for i
, a
in enumerate(av
):
674 fprs
[i
+0] = fp64toselectable(a
)
675 for i
, c
in enumerate(ctable
):
676 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
678 with
Program(lst
, bigendian
=False) as program
:
679 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
680 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
681 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
682 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
683 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
684 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
685 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
686 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
688 # outer iterative sum
689 res
= transform2(avi
)
691 for i
, expected
in enumerate(res
):
692 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
693 for i
, expected
in enumerate(res
):
694 # convert to Power single
695 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
696 expected
= float(expected
)
697 actual
= float(sim
.fpr(i
))
698 # approximate error calculation, good enough test
699 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
700 # and the rounding is different
701 err
= abs((actual
- expected
) / expected
)
702 print ("err", i
, err
)
703 self
.assertTrue(err
< 1e-5)
705 def test_sv_remap_dct_cos_precompute_8(self
):
706 """pre-computes a DCT COS table, deliberately using a lot of
707 registers so as to be able to see what is going on (dumping all
710 the simpler (scalar) version is in test_caller_transcendentals.py
711 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
712 need the new version of fcfids which doesn't spam memory with
715 lst
= SVP64Asm(["svshape 8, 1, 1, 2, 0",
716 "svremap 0, 0, 0, 2, 0, 1, 1",
717 "sv.svstep 4.v, 4, 1", # svstep get vector of ci
718 "sv.svstep 16.v, 3, 1", # svstep get vector of step
720 "setvl 0, 0, 12, 0, 1, 1",
723 "sv.fcfids 48.v, 64.v",
727 "sv.fcfids 24.v, 12.v",
728 "sv.fadds 0.v, 24.v, 43", # plus 0.5
729 "sv.fmuls 0.v, 0.v, 41", # times PI
730 "sv.fdivs 0.v, 0.v, 48.v", # div size
731 "sv.fcoss 80.v, 0.v",
732 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
739 fprs
[43] = fp64toselectable(0.5) # 0.5
740 fprs
[41] = fp64toselectable(math
.pi
) # pi
741 fprs
[44] = fp64toselectable(2.0) # 2.0
749 for i
in range(n
//size
):
750 for ci
in range(halfsize
):
751 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
754 with
Program(lst
, bigendian
=False) as program
:
755 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
759 for i
in range(len(ctable
)):
760 actual
= float(sim
.fpr(i
+24))
761 print ("i", i
, actual
)
763 for i
in range(len(ctable
)):
764 actual
= float(sim
.fpr(i
+48))
765 print ("i", i
, actual
)
767 for i
in range(len(ctable
)):
768 actual
= float(sim
.fpr(i
))
769 print ("i", i
, actual
)
770 for i
in range(len(ctable
)):
771 expected
= 1.0/ctable
[i
]
772 actual
= float(sim
.fpr(i
+80))
773 err
= abs((actual
- expected
) / expected
)
774 print ("i", i
, actual
, "1/expect", 1/expected
,
775 "expected", expected
,
777 self
.assertTrue(err
< 1e-6)
779 def test_sv_remap_dct_cos_precompute_inner_8(self
):
780 """pre-computes a DCT COS table, using the shorter costable
781 indices schedule. turns out, some COS values are repeated
782 in each layer of the DCT butterfly.
784 the simpler (scalar) version is in test_caller_transcendentals.py
785 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
786 need the new version of fcfids which doesn't spam memory with
789 lst
= SVP64Asm(["svshape 8, 1, 1, 5, 0",
790 "svremap 0, 0, 0, 2, 0, 1, 1",
791 "sv.svstep 4.v, 3, 1", # svstep get vector of ci
792 "sv.svstep 16.v, 2, 1", # svstep get vector of step
794 "setvl 0, 0, 7, 0, 1, 1",
797 "sv.fcfids 48.v, 64.v",
801 "sv.fcfids 24.v, 12.v",
802 "sv.fadds 0.v, 24.v, 43", # plus 0.5
803 "sv.fmuls 0.v, 0.v, 41", # times PI
804 "sv.fdivs 0.v, 0.v, 48.v", # div size
805 "sv.fcoss 80.v, 0.v",
806 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
813 fprs
[43] = fp64toselectable(0.5) # 0.5
814 fprs
[41] = fp64toselectable(math
.pi
) # pi
815 fprs
[44] = fp64toselectable(2.0) # 2.0
823 for ci
in range(halfsize
):
824 coeff
= math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0
826 print ("coeff", "ci", ci
, "size", size
,
827 "i/n", (ci
+0.5), 1.0/coeff
)
830 with
Program(lst
, bigendian
=False) as program
:
831 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
835 for i
in range(len(ctable
)):
836 actual
= float(sim
.fpr(i
+24))
837 print ("i", i
, actual
)
839 for i
in range(len(ctable
)):
840 actual
= float(sim
.fpr(i
+48))
841 print ("i", i
, actual
)
843 for i
in range(len(ctable
)):
844 actual
= float(sim
.fpr(i
))
845 print ("i", i
, actual
)
846 for i
in range(len(ctable
)):
847 expected
= 1.0/ctable
[i
]
848 actual
= float(sim
.fpr(i
+80))
849 err
= abs((actual
- expected
) / expected
)
850 print ("i", i
, actual
, "1/expect", 1/expected
,
851 "expected", expected
,
853 self
.assertTrue(err
< 1e-6)
855 def test_sv_remap_fpmadds_dct_8_mode_4(self
):
856 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
857 "svshape 8, 1, 1, 4, 0",
858 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
859 "svshape 8, 1, 1, 3, 0",
860 "sv.fadds 0.v, 0.v, 0.v"
862 runs a full in-place 8-long O(N log2 N) DCT, both
863 inner and outer butterfly "REMAP" schedules.
864 uses shorter tables: FRC also needs to be on a Schedule
866 lst
= SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
867 "svshape 8, 1, 1, 4, 0",
868 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
869 "svshape 8, 1, 1, 3, 0",
870 "sv.fadds 0.v, 0.v, 0.v"
874 # array and coefficients to test
875 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
877 levels
= n
.bit_length() - 1
879 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
880 av
= halfrev2(avi
, False)
881 av
= [av
[ri
[i
]] for i
in range(n
)]
886 for ci
in range(halfsize
):
887 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
892 for i
, a
in enumerate(av
):
893 fprs
[i
+0] = fp64toselectable(a
)
894 for i
, c
in enumerate(ctable
):
895 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
897 with
Program(lst
, bigendian
=False) as program
:
898 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
899 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
900 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
901 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
902 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
903 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
904 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
905 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
907 # outer iterative sum
908 res
= transform2(avi
)
910 for i
, expected
in enumerate(res
):
911 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
912 for i
, expected
in enumerate(res
):
913 # convert to Power single
914 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
915 expected
= float(expected
)
916 actual
= float(sim
.fpr(i
))
917 # approximate error calculation, good enough test
918 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
919 # and the rounding is different
920 err
= abs((actual
- expected
) / expected
)
921 print ("err", i
, err
)
922 self
.assertTrue(err
< 1e-5)
924 def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self
):
925 """>>> lst = [# LOAD bit-reversed with half-swap
926 "svshape 8, 1, 1, 6, 0",
927 "svremap 1, 0, 0, 0, 0, 0, 0",
928 "sv.lfssh 0.v, 4(1), 2",
929 # Inner butterfly, twin +/- MUL-ADD-SUB
930 "svremap 31, 1, 0, 2, 0, 1, 1",
931 "svshape 8, 1, 1, 4, 0",
932 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
933 # Outer butterfly, iterative sum
934 "svshape 8, 1, 1, 3, 0",
935 "sv.fadds 0.v, 0.v, 0.v"
937 runs a full in-place 8-long O(N log2 N) DCT, both
938 inner and outer butterfly "REMAP" schedules, and using
939 bit-reversed half-swapped LDs.
940 uses shorter pre-loaded COS tables: FRC also needs to be on a
943 lst
= SVP64Asm( ["addi 1, 0, 0x000",
944 "svshape 8, 1, 1, 6, 0",
945 "svremap 1, 0, 0, 0, 0, 0, 0",
946 "sv.lfssh 0.v, 4(1), 2",
947 "svremap 31, 1, 0, 2, 0, 1, 1",
948 "svshape 8, 1, 1, 4, 0",
949 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
950 "svshape 8, 1, 1, 3, 0",
951 "sv.fadds 0.v, 0.v, 0.v"
955 # array and coefficients to test
956 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
958 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
959 # LD will bring them in, in the correct order.
962 for i
, a
in enumerate(avi
):
963 a
= SINGLE(fp64toselectable(a
)).value
966 val
= a
# accumulate for next iteration
968 mem
[(i
//2)*8] = val |
(a
<< 32) # even and odd 4-byte in same 8
970 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
976 for ci
in range(halfsize
):
977 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
982 for i
, c
in enumerate(ctable
):
983 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
985 with
Program(lst
, bigendian
=False) as program
:
986 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
988 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
989 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
990 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
991 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
992 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
993 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
994 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
996 # outer iterative sum
997 res
= transform2(avi
)
999 for i
, expected
in enumerate(res
):
1000 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
1002 for i
, expected
in enumerate(res
):
1003 # convert to Power single
1004 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
1005 expected
= float(expected
)
1006 actual
= float(sim
.fpr(i
))
1007 # approximate error calculation, good enough test
1008 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
1009 # and the rounding is different
1010 err
= abs((actual
- expected
) / expected
)
1011 print ("err", i
, err
)
1012 self
.assertTrue(err
< 1e-5)
1014 def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self
):
1015 """>>> lst = [# LOAD bit-reversed with half-swap
1016 "svshape 8, 1, 1, 14, 0",
1017 "svremap 1, 0, 0, 0, 0, 0, 0",
1018 "sv.lfssh 0.v, 4(1), 2",
1019 # Outer butterfly, iterative sum
1020 "svremap 31, 0, 1, 2, 1, 0, 1",
1021 "svshape 8, 1, 1, 11, 0",
1022 "sv.fadds 0.v, 0.v, 0.v",
1023 # Inner butterfly, twin +/- MUL-ADD-SUB
1024 "svshape 8, 1, 1, 10, 0",
1025 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
1027 runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
1028 inner and outer butterfly "REMAP" schedules, and using
1029 bit-reversed half-swapped LDs.
1030 uses shorter pre-loaded COS tables: FRC also needs to be on a
1031 Schedule in the sv.ffmadds instruction
1033 lst
= SVP64Asm( ["addi 1, 0, 0x000",
1034 "svshape 8, 1, 1, 14, 0",
1035 "svremap 1, 0, 0, 0, 0, 0, 0",
1036 "sv.lfssh 0.v, 4(1), 2",
1037 "svremap 31, 0, 1, 2, 1, 0, 1",
1038 "svshape 8, 1, 1, 11, 0",
1039 "sv.fadds 0.v, 0.v, 0.v",
1040 "svshape 8, 1, 1, 12, 0",
1041 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
1045 # array and coefficients to test
1046 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
1048 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
1049 # LD will bring them in, in the correct order.
1052 for i
, a
in enumerate(avi
):
1053 if i
== 0: # first element, divide by 2
1055 a
= SINGLE(fp64toselectable(a
)).value
1056 shift
= (i
% 2) == 1
1058 val
= a
# accumulate for next iteration
1060 mem
[(i
//2)*8] = val |
(a
<< 32) # even and odd 4-byte in same 8
1062 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
1067 halfsize
= size
// 2
1068 for ci
in range(halfsize
):
1069 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
1074 for i
, c
in enumerate(ctable
):
1075 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
1077 with
Program(lst
, bigendian
=False) as program
:
1078 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
1080 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
1081 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
1082 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
1083 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
1084 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
1085 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
1086 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
1088 # outer iterative sum
1089 res
= inverse_transform2(avi
)
1091 for i
, expected
in enumerate(res
):
1092 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
1094 for i
, expected
in enumerate(res
):
1095 # convert to Power single
1096 expected
= fph
.DOUBLE2SINGLE(fp64toselectable(expected
))
1097 expected
= float(expected
)
1098 actual
= float(sim
.fpr(i
))
1099 # approximate error calculation, good enough test
1100 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
1101 # and the rounding is different
1102 err
= abs((actual
- expected
) / expected
)
1103 print ("err", i
, err
)
1104 self
.assertTrue(err
< 1e-5)
1106 def run_tst_program(self
, prog
, initial_regs
=None,
1110 if initial_regs
is None:
1111 initial_regs
= [0] * 32
1112 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
1113 initial_fprs
=initial_fprs
,
1117 simulator
.gpr
.dump()
1119 simulator
.fpr
.dump()
1124 if __name__
== "__main__":