1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
21 def transform_inner_radix2_dct(vec
, ctable
):
26 print ("transform2", n
)
27 levels
= n
.bit_length() - 1
29 # reference (read/write) the in-place data in *reverse-bit-order*
31 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
33 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
34 # TODO: merge these two
35 vec
= halfrev2(vec
, False)
36 vec
= [vec
[ri
[i
]] for i
in range(n
)]
50 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
52 SVSHAPE0
.submode2
= 0b01
54 SVSHAPE0
.offset
= 0 # experiment with different offset, here
55 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
58 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
60 SVSHAPE1
.submode2
= 0b01
62 SVSHAPE1
.offset
= 0 # experiment with different offset, here
63 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
65 # enumerate over the iterator function, getting new indices
66 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
67 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
68 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
69 t1
, t2
= vec
[jl
], vec
[jh
]
72 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
73 print ("coeff", "ci", k
,
75 "i/n", (k
+0.5), 1.0/coeff
,
76 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
77 "end", bin(jle
), bin(jhe
))
78 if jle
== 0b111: # all loops end
84 def transform_outer_radix2_dct(vec
):
89 print ("transform2", n
)
90 levels
= n
.bit_length() - 1
101 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
102 SVSHAPE0
.submode2
= 0b100
105 SVSHAPE0
.offset
= 0 # experiment with different offset, here
106 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
107 # j+halfstep schedule
109 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
111 SVSHAPE1
.submode2
= 0b100
113 SVSHAPE1
.offset
= 0 # experiment with different offset, here
114 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
116 # enumerate over the iterator function, getting new indices
117 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
118 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
119 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
120 print ("itersum jr", jl
, jh
,
121 "end", bin(jle
), bin(jhe
))
123 if jle
== 0b111: # all loops end
126 print("transform2 result", vec
)
131 def transform_inner_radix2_idct(vec
, ctable
):
136 print ("transform2", n
)
137 levels
= n
.bit_length() - 1
139 # pretend we LDed data in half-swapped order
140 vec
= halfrev2(vec
, False)
154 SVSHAPE0
.lims
= [xdim
, 0b000001, 0]
156 SVSHAPE0
.submode2
= 0b11
158 SVSHAPE0
.offset
= 0 # experiment with different offset, here
159 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
160 # j+halfstep schedule
162 SVSHAPE1
.lims
= [xdim
, 0b000001, 0]
164 SVSHAPE1
.submode2
= 0b11
166 SVSHAPE1
.offset
= 0 # experiment with different offset, here
167 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
169 # enumerate over the iterator function, getting new indices
170 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
171 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
172 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
173 t1
, t2
= vec
[jl
], vec
[jh
]
176 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
177 print ("coeff", "ci", k
,
179 "i/n", (k
+0.5), 1.0/coeff
,
180 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
181 "end", bin(jle
), bin(jhe
))
182 if jle
== 0b111: # all loops end
188 def transform_outer_radix2_idct(vec
):
193 print ("transform2-inv", n
)
194 levels
= n
.bit_length() - 1
201 # reference (read/write) the in-place data in *reverse-bit-order*
203 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
205 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
206 # TODO: merge these two
207 vec
= [vec
[ri
[i
]] for i
in range(n
)]
208 vec
= halfrev2(vec
, True)
214 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
215 SVSHAPE0
.submode2
= 0b011
218 SVSHAPE0
.offset
= 0 # experiment with different offset, here
219 SVSHAPE0
.invxyz
= [1,0,1] # inversion if desired
220 # j+halfstep schedule
222 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
224 SVSHAPE1
.submode2
= 0b011
226 SVSHAPE1
.offset
= 0 # experiment with different offset, here
227 SVSHAPE1
.invxyz
= [1,0,1] # inversion if desired
229 # enumerate over the iterator function, getting new indices
230 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
231 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
232 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
233 print ("itersum jr", jl
, jh
,
234 "end", bin(jle
), bin(jhe
))
236 if jle
== 0b111: # all loops end
239 print("transform2-inv result", vec
)
244 class DCTTestCase(FHDLTestCase
):
246 def _check_regs(self
, sim
, expected
):
248 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
250 def test_sv_ffadds_dct(self
):
251 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
253 four in-place vector adds, four in-place vector mul-subs
255 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
256 FRS to perform the two multiplies. one add, one subtract.
258 sv.fdadds FRT, FRA, FRC, FRB actually does:
260 fsubs FRT+vl, FRA, FRB+vl
262 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
266 # cheat here with these values, they're selected so that
267 # rounding errors do not occur. sigh.
269 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
270 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
271 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
273 # work out the results with the twin add-sub
274 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
275 fprs
[i
+0] = fp64toselectable(a
)
276 fprs
[i
+4] = fp64toselectable(b
)
277 fprs
[i
+8] = fp64toselectable(c
)
278 # this isn't quite a perfect replication of the
279 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
280 # and FPSUB32 directly to be honest.
283 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
286 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
287 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
289 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
291 # SVSTATE (in this case, VL=2)
292 svstate
= SVP64State()
294 svstate
.maxvl
= 4 # MAXVL
295 print ("SVSTATE", bin(svstate
.asint()))
297 with
Program(lst
, bigendian
=False) as program
:
298 sim
= self
.run_tst_program(program
, svstate
=svstate
,
300 # confirm that the results are as expected
301 for i
, (t
, u
) in enumerate(res
):
302 a
= float(sim
.fpr(i
+0))
303 b
= float(sim
.fpr(i
+4))
306 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
307 for i
, (t
, u
) in enumerate(res
):
308 self
.assertEqual(sim
.fpr(i
+0), t
)
309 self
.assertEqual(sim
.fpr(i
+4), u
)
311 def test_sv_remap_fpmadds_dct_inner_4(self
):
312 """>>> lst = ["svshape 4, 1, 1, 2, 0",
313 "svremap 27, 1, 0, 2, 0, 1, 0",
314 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
316 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
319 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
320 (3 inputs, 2 outputs)
322 Note that the coefficient (FRC) is not on a "schedule", it
323 is straight Vectorised (0123...) because DCT coefficients
324 cannot be shared between butterfly layers (due to +0.5)
326 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
327 "svremap 27, 1, 0, 2, 0, 1, 0",
328 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
332 # array and coefficients to test
334 av
= [7.0, -9.8, 3.0, -32.3]
335 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
337 levels
= n
.bit_length() - 1
339 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
340 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
341 av
= halfrev2(avi
, False)
342 av
= [av
[ri
[i
]] for i
in range(n
)]
346 for i
, c
in enumerate(coe
):
347 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
348 for i
, a
in enumerate(av
):
349 fprs
[i
+0] = fp64toselectable(a
)
351 with
Program(lst
, bigendian
=False) as program
:
352 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
353 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
354 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
355 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
356 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
357 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
358 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
359 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
361 # work out the results with the twin mul/add-sub
362 res
= transform_inner_radix2_dct(avi
, coe
)
364 for i
, expected
in enumerate(res
):
365 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
366 for i
, expected
in enumerate(res
):
367 # convert to Power single
368 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
369 expected
= float(expected
)
370 actual
= float(sim
.fpr(i
))
371 # approximate error calculation, good enough test
372 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
373 # and the rounding is different
374 err
= abs((actual
- expected
) / expected
)
375 print ("err", i
, err
)
376 self
.assertTrue(err
< 1e-6)
378 def test_sv_remap_fpmadds_dct_inner_4(self
):
379 """>>> lst = ["svshape 4, 1, 1, 10, 0",
380 "svremap 27, 1, 0, 2, 0, 1, 0",
381 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
383 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
386 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
387 (3 inputs, 2 outputs)
389 Note that the coefficient (FRC) is not on a "schedule", it
390 is straight Vectorised (0123...) because DCT coefficients
391 cannot be shared between butterfly layers (due to +0.5)
393 lst
= SVP64Asm( ["svshape 4, 1, 1, 10, 0",
394 "svremap 27, 1, 0, 2, 0, 1, 0",
395 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
399 # array and coefficients to test
401 levels
= n
.bit_length() - 1
402 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
403 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
404 av
= halfrev2(avi
, False)
408 for i
, c
in enumerate(coe
):
409 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
410 for i
, a
in enumerate(av
):
411 fprs
[i
+0] = fp64toselectable(a
)
413 with
Program(lst
, bigendian
=False) as program
:
414 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
415 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
416 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
417 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
418 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
419 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
420 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
421 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
423 # work out the results with the twin mul/add-sub
424 res
= transform_inner_radix2_idct(avi
, coe
)
426 for i
, expected
in enumerate(res
):
427 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
428 for i
, expected
in enumerate(res
):
429 # convert to Power single
430 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
431 expected
= float(expected
)
432 actual
= float(sim
.fpr(i
))
433 # approximate error calculation, good enough test
434 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
435 # and the rounding is different
436 err
= abs((actual
- expected
) / expected
)
437 print ("err", i
, err
)
438 self
.assertTrue(err
< 1e-6)
440 def test_sv_remap_fpmadds_idct_outer_8(self
):
441 """>>> lst = ["svshape 8, 1, 1, 11, 0",
442 "svremap 27, 1, 0, 2, 0, 1, 0",
443 "sv.fadds 0.v, 0.v, 0.v"
445 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
446 for inverse-DCT, does the iterative overlapped ADDs
448 SVP64 "REMAP" in Butterfly Mode.
450 lst
= SVP64Asm( ["svshape 8, 1, 1, 11, 0",
451 "svremap 27, 1, 0, 2, 0, 1, 0",
452 "sv.fadds 0.v, 0.v, 0.v"
456 # array and coefficients to test
457 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
460 levels
= n
.bit_length() - 1
462 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
463 av
= [avi
[ri
[i
]] for i
in range(n
)]
464 av
= halfrev2(av
, True)
468 for i
, a
in enumerate(av
):
469 fprs
[i
+0] = fp64toselectable(a
)
471 with
Program(lst
, bigendian
=False) as program
:
472 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
473 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
474 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
475 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
476 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
477 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
478 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
479 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
481 # outer iterative sum
482 res
= transform_outer_radix2_idct(avi
)
484 for i
, expected
in enumerate(res
):
485 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
486 for i
, expected
in enumerate(res
):
487 # convert to Power single
488 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
489 expected
= float(expected
)
490 actual
= float(sim
.fpr(i
))
491 # approximate error calculation, good enough test
492 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
493 # and the rounding is different
494 err
= abs((actual
- expected
) / expected
)
495 print ("err", i
, err
)
496 self
.assertTrue(err
< 1e-6)
498 def test_sv_remap_fpmadds_dct_outer_8(self
):
499 """>>> lst = ["svshape 8, 1, 1, 3, 0",
500 "svremap 27, 1, 0, 2, 0, 1, 0",
501 "sv.fadds 0.v, 0.v, 0.v"
503 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
504 for DCT, does the iterative overlapped ADDs
506 SVP64 "REMAP" in Butterfly Mode.
508 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
509 "svremap 27, 1, 0, 2, 0, 1, 0",
510 "sv.fadds 0.v, 0.v, 0.v"
514 # array and coefficients to test
515 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
519 for i
, a
in enumerate(av
):
520 fprs
[i
+0] = fp64toselectable(a
)
522 with
Program(lst
, bigendian
=False) as program
:
523 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
524 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
525 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
526 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
527 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
528 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
529 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
530 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
532 # outer iterative sum
533 res
= transform_outer_radix2_dct(av
)
535 for i
, expected
in enumerate(res
):
536 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
537 for i
, expected
in enumerate(res
):
538 # convert to Power single
539 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
540 expected
= float(expected
)
541 actual
= float(sim
.fpr(i
))
542 # approximate error calculation, good enough test
543 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
544 # and the rounding is different
545 err
= abs((actual
- expected
) / expected
)
546 print ("err", i
, err
)
547 self
.assertTrue(err
< 1e-6)
549 def test_sv_remap_fpmadds_dct_8(self
):
550 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
551 "svshape 8, 1, 1, 2, 0",
552 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
553 "svshape 8, 1, 1, 3, 0",
554 "sv.fadds 0.v, 0.v, 0.v"
556 runs a full in-place 8-long O(N log2 N) DCT, both
557 inner and outer butterfly "REMAP" schedules.
559 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
560 "svshape 8, 1, 1, 2, 0",
561 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
562 "svshape 8, 1, 1, 3, 0",
563 "sv.fadds 0.v, 0.v, 0.v"
567 # array and coefficients to test
568 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
570 levels
= n
.bit_length() - 1
572 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
573 av
= halfrev2(avi
, False)
574 av
= [av
[ri
[i
]] for i
in range(n
)]
579 for i
in range(n
//size
):
580 for ci
in range(halfsize
):
581 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
586 for i
, a
in enumerate(av
):
587 fprs
[i
+0] = fp64toselectable(a
)
588 for i
, c
in enumerate(ctable
):
589 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
591 with
Program(lst
, bigendian
=False) as program
:
592 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
593 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
594 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
595 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
596 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
597 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
598 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
599 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
601 # outer iterative sum
602 res
= transform2(avi
)
604 for i
, expected
in enumerate(res
):
605 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
606 for i
, expected
in enumerate(res
):
607 # convert to Power single
608 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
609 expected
= float(expected
)
610 actual
= float(sim
.fpr(i
))
611 # approximate error calculation, good enough test
612 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
613 # and the rounding is different
614 err
= abs((actual
- expected
) / expected
)
615 print ("err", i
, err
)
616 self
.assertTrue(err
< 1e-5)
618 def test_sv_remap_dct_cos_precompute_8(self
):
619 """pre-computes a DCT COS table, deliberately using a lot of
620 registers so as to be able to see what is going on (dumping all
623 the simpler (scalar) version is in test_caller_transcendentals.py
624 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
625 need the new version of fcfids which doesn't spam memory with
628 lst
= SVP64Asm(["svshape 8, 1, 1, 2, 0",
629 "svremap 0, 0, 0, 2, 0, 1, 1",
630 "sv.svstep 4.v, 4, 1", # svstep get vector of ci
631 "sv.svstep 16.v, 3, 1", # svstep get vector of step
633 "setvl 0, 0, 12, 0, 1, 1",
636 "sv.fcfids 48.v, 64.v",
640 "sv.fcfids 24.v, 12.v",
641 "sv.fadds 0.v, 24.v, 43", # plus 0.5
642 "sv.fmuls 0.v, 0.v, 41", # times PI
643 "sv.fdivs 0.v, 0.v, 48.v", # div size
644 "sv.fcoss 80.v, 0.v",
645 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
652 fprs
[43] = fp64toselectable(0.5) # 0.5
653 fprs
[41] = fp64toselectable(math
.pi
) # pi
654 fprs
[44] = fp64toselectable(2.0) # 2.0
662 for i
in range(n
//size
):
663 for ci
in range(halfsize
):
664 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
667 with
Program(lst
, bigendian
=False) as program
:
668 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
672 for i
in range(len(ctable
)):
673 actual
= float(sim
.fpr(i
+24))
674 print ("i", i
, actual
)
676 for i
in range(len(ctable
)):
677 actual
= float(sim
.fpr(i
+48))
678 print ("i", i
, actual
)
680 for i
in range(len(ctable
)):
681 actual
= float(sim
.fpr(i
))
682 print ("i", i
, actual
)
683 for i
in range(len(ctable
)):
684 expected
= 1.0/ctable
[i
]
685 actual
= float(sim
.fpr(i
+80))
686 err
= abs((actual
- expected
) / expected
)
687 print ("i", i
, actual
, "1/expect", 1/expected
,
688 "expected", expected
,
690 self
.assertTrue(err
< 1e-6)
692 def test_sv_remap_dct_cos_precompute_inner_8(self
):
693 """pre-computes a DCT COS table, using the shorter costable
694 indices schedule. turns out, some COS values are repeated
695 in each layer of the DCT butterfly.
697 the simpler (scalar) version is in test_caller_transcendentals.py
698 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
699 need the new version of fcfids which doesn't spam memory with
702 lst
= SVP64Asm(["svshape 8, 1, 1, 5, 0",
703 "svremap 0, 0, 0, 2, 0, 1, 1",
704 "sv.svstep 4.v, 3, 1", # svstep get vector of ci
705 "sv.svstep 16.v, 2, 1", # svstep get vector of step
707 "setvl 0, 0, 7, 0, 1, 1",
710 "sv.fcfids 48.v, 64.v",
714 "sv.fcfids 24.v, 12.v",
715 "sv.fadds 0.v, 24.v, 43", # plus 0.5
716 "sv.fmuls 0.v, 0.v, 41", # times PI
717 "sv.fdivs 0.v, 0.v, 48.v", # div size
718 "sv.fcoss 80.v, 0.v",
719 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
726 fprs
[43] = fp64toselectable(0.5) # 0.5
727 fprs
[41] = fp64toselectable(math
.pi
) # pi
728 fprs
[44] = fp64toselectable(2.0) # 2.0
736 for ci
in range(halfsize
):
737 coeff
= math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0
739 print ("coeff", "ci", ci
, "size", size
,
740 "i/n", (ci
+0.5), 1.0/coeff
)
743 with
Program(lst
, bigendian
=False) as program
:
744 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
748 for i
in range(len(ctable
)):
749 actual
= float(sim
.fpr(i
+24))
750 print ("i", i
, actual
)
752 for i
in range(len(ctable
)):
753 actual
= float(sim
.fpr(i
+48))
754 print ("i", i
, actual
)
756 for i
in range(len(ctable
)):
757 actual
= float(sim
.fpr(i
))
758 print ("i", i
, actual
)
759 for i
in range(len(ctable
)):
760 expected
= 1.0/ctable
[i
]
761 actual
= float(sim
.fpr(i
+80))
762 err
= abs((actual
- expected
) / expected
)
763 print ("i", i
, actual
, "1/expect", 1/expected
,
764 "expected", expected
,
766 self
.assertTrue(err
< 1e-6)
768 def test_sv_remap_fpmadds_dct_8_mode_4(self
):
769 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
770 "svshape 8, 1, 1, 4, 0",
771 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
772 "svshape 8, 1, 1, 3, 0",
773 "sv.fadds 0.v, 0.v, 0.v"
775 runs a full in-place 8-long O(N log2 N) DCT, both
776 inner and outer butterfly "REMAP" schedules.
777 uses shorter tables: FRC also needs to be on a Schedule
779 lst
= SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
780 "svshape 8, 1, 1, 4, 0",
781 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
782 "svshape 8, 1, 1, 3, 0",
783 "sv.fadds 0.v, 0.v, 0.v"
787 # array and coefficients to test
788 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
790 levels
= n
.bit_length() - 1
792 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
793 av
= halfrev2(avi
, False)
794 av
= [av
[ri
[i
]] for i
in range(n
)]
799 for ci
in range(halfsize
):
800 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
805 for i
, a
in enumerate(av
):
806 fprs
[i
+0] = fp64toselectable(a
)
807 for i
, c
in enumerate(ctable
):
808 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
810 with
Program(lst
, bigendian
=False) as program
:
811 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
812 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
813 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
814 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
815 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
816 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
817 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
818 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
820 # outer iterative sum
821 res
= transform2(avi
)
823 for i
, expected
in enumerate(res
):
824 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
825 for i
, expected
in enumerate(res
):
826 # convert to Power single
827 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
828 expected
= float(expected
)
829 actual
= float(sim
.fpr(i
))
830 # approximate error calculation, good enough test
831 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
832 # and the rounding is different
833 err
= abs((actual
- expected
) / expected
)
834 print ("err", i
, err
)
835 self
.assertTrue(err
< 1e-5)
837 def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self
):
838 """>>> lst = [# LOAD bit-reversed with half-swap
839 "svshape 8, 1, 1, 6, 0",
840 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
841 "sv.lfsbr 0.v, 4(1), 2",
842 # Inner butterfly, twin +/- MUL-ADD-SUB
843 "svremap 31, 1, 0, 2, 0, 1, 1",
844 "svshape 8, 1, 1, 4, 0",
845 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
846 # Outer butterfly, iterative sum
847 "svshape 8, 1, 1, 3, 0",
848 "sv.fadds 0.v, 0.v, 0.v"
850 runs a full in-place 8-long O(N log2 N) DCT, both
851 inner and outer butterfly "REMAP" schedules, and using
852 bit-reversed half-swapped LDs.
853 uses shorter pre-loaded COS tables: FRC also needs to be on a
856 lst
= SVP64Asm( ["addi 1, 0, 0x000",
857 "svshape 8, 1, 1, 6, 0",
858 "svremap 1, 0, 0, 0, 0, 0, 0, 1",
859 "sv.lfsbr 0.v, 4(1), 2",
860 "svremap 31, 1, 0, 2, 0, 1, 1",
861 "svshape 8, 1, 1, 4, 0",
862 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
863 "svshape 8, 1, 1, 3, 0",
864 "sv.fadds 0.v, 0.v, 0.v"
868 # array and coefficients to test
869 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
871 # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
872 # LD will bring them in, in the correct order.
875 for i
, a
in enumerate(avi
):
876 a
= SINGLE(fp64toselectable(a
)).value
879 val
= a
# accumulate for next iteration
881 mem
[(i
//2)*8] = val |
(a
<< 32) # even and odd 4-byte in same 8
883 # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
889 for ci
in range(halfsize
):
890 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
895 for i
, c
in enumerate(ctable
):
896 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
898 with
Program(lst
, bigendian
=False) as program
:
899 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
,
901 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
902 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
903 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
904 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
905 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
906 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
907 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
909 # outer iterative sum
910 res
= transform2(avi
)
912 for i
, expected
in enumerate(res
):
913 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
915 for i
, expected
in enumerate(res
):
916 # convert to Power single
917 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
918 expected
= float(expected
)
919 actual
= float(sim
.fpr(i
))
920 # approximate error calculation, good enough test
921 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
922 # and the rounding is different
923 err
= abs((actual
- expected
) / expected
)
924 print ("err", i
, err
)
925 self
.assertTrue(err
< 1e-5)
927 def run_tst_program(self
, prog
, initial_regs
=None,
931 if initial_regs
is None:
932 initial_regs
= [0] * 32
933 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
934 initial_fprs
=initial_fprs
,
945 if __name__
== "__main__":