1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
21 def transform_inner_radix2(vec
, ctable
):
26 print ("transform2", n
)
27 levels
= n
.bit_length() - 1
29 # reference (read/write) the in-place data in *reverse-bit-order*
31 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
33 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
34 # TODO: merge these two
35 vec
= halfrev2(vec
, False)
36 vec
= [vec
[ri
[i
]] for i
in range(n
)]
50 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
52 SVSHAPE0
.submode2
= 0b01
54 SVSHAPE0
.offset
= 0 # experiment with different offset, here
55 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
58 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
60 SVSHAPE1
.submode2
= 0b01
62 SVSHAPE1
.offset
= 0 # experiment with different offset, here
63 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
65 # enumerate over the iterator function, getting new indices
66 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
67 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
68 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
69 t1
, t2
= vec
[jl
], vec
[jh
]
72 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
73 print ("coeff", "ci", k
,
75 "i/n", (k
+0.5), 1.0/coeff
,
76 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
77 "end", bin(jle
), bin(jhe
))
78 if jle
== 0b111: # all loops end
83 def transform_outer_radix2(vec
):
88 print ("transform2", n
)
89 levels
= n
.bit_length() - 1
100 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
101 SVSHAPE0
.submode2
= 0b100
104 SVSHAPE0
.offset
= 0 # experiment with different offset, here
105 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
106 # j+halfstep schedule
108 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
110 SVSHAPE1
.submode2
= 0b100
112 SVSHAPE1
.offset
= 0 # experiment with different offset, here
113 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
115 # enumerate over the iterator function, getting new indices
116 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
117 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
118 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
119 print ("itersum jr", jl
, jh
,
120 "end", bin(jle
), bin(jhe
))
122 if jle
== 0b111: # all loops end
125 print("transform2 result", vec
)
130 class DCTTestCase(FHDLTestCase
):
132 def _check_regs(self
, sim
, expected
):
134 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
136 def test_sv_ffadds_dct(self
):
137 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
139 four in-place vector adds, four in-place vector mul-subs
141 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
142 FRS to perform the two multiplies. one add, one subtract.
144 sv.fdadds FRT, FRA, FRC, FRB actually does:
146 fsubs FRT+vl, FRA, FRB+vl
148 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
152 # cheat here with these values, they're selected so that
153 # rounding errors do not occur. sigh.
155 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
156 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
157 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
159 # work out the results with the twin add-sub
160 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
161 fprs
[i
+0] = fp64toselectable(a
)
162 fprs
[i
+4] = fp64toselectable(b
)
163 fprs
[i
+8] = fp64toselectable(c
)
164 # this isn't quite a perfect replication of the
165 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
166 # and FPSUB32 directly to be honest.
169 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
172 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
173 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
175 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
177 # SVSTATE (in this case, VL=2)
178 svstate
= SVP64State()
180 svstate
.maxvl
= 4 # MAXVL
181 print ("SVSTATE", bin(svstate
.asint()))
183 with
Program(lst
, bigendian
=False) as program
:
184 sim
= self
.run_tst_program(program
, svstate
=svstate
,
186 # confirm that the results are as expected
187 for i
, (t
, u
) in enumerate(res
):
188 a
= float(sim
.fpr(i
+0))
189 b
= float(sim
.fpr(i
+4))
192 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
193 for i
, (t
, u
) in enumerate(res
):
194 self
.assertEqual(sim
.fpr(i
+0), t
)
195 self
.assertEqual(sim
.fpr(i
+4), u
)
197 def test_sv_remap_fpmadds_dct_inner_4(self
):
198 """>>> lst = ["svshape 4, 1, 1, 2, 0",
199 "svremap 27, 1, 0, 2, 0, 1, 0",
200 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
202 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
205 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
206 (3 inputs, 2 outputs)
208 Note that the coefficient (FRC) is not on a "schedule", it
209 is straight Vectorised (0123...) because DCT coefficients
210 cannot be shared between butterfly layers (due to +0.5)
212 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
213 "svremap 27, 1, 0, 2, 0, 1, 0",
214 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
218 # array and coefficients to test
220 av
= [7.0, -9.8, 3.0, -32.3]
221 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
223 levels
= n
.bit_length() - 1
225 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
226 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
227 av
= halfrev2(avi
, False)
228 av
= [av
[ri
[i
]] for i
in range(n
)]
232 for i
, c
in enumerate(coe
):
233 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
234 for i
, a
in enumerate(av
):
235 fprs
[i
+0] = fp64toselectable(a
)
237 with
Program(lst
, bigendian
=False) as program
:
238 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
239 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
240 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
241 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
242 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
243 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
244 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
245 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
247 # work out the results with the twin mul/add-sub
248 res
= transform_inner_radix2(avi
, coe
)
250 for i
, expected
in enumerate(res
):
251 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
252 for i
, expected
in enumerate(res
):
253 # convert to Power single
254 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
255 expected
= float(expected
)
256 actual
= float(sim
.fpr(i
))
257 # approximate error calculation, good enough test
258 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
259 # and the rounding is different
260 err
= abs((actual
- expected
) / expected
)
261 print ("err", i
, err
)
262 self
.assertTrue(err
< 1e-6)
264 def test_sv_remap_fpmadds_dct_outer_8(self
):
265 """>>> lst = ["svshape 8, 1, 1, 3, 0",
266 "svremap 27, 1, 0, 2, 0, 1, 0",
267 "sv.fadds 0.v, 0.v, 0.v"
269 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
270 for DCT, does the iterative overlapped ADDs
272 SVP64 "REMAP" in Butterfly Mode.
274 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
275 "svremap 27, 1, 0, 2, 0, 1, 0",
276 "sv.fadds 0.v, 0.v, 0.v"
280 # array and coefficients to test
281 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
285 for i
, a
in enumerate(av
):
286 fprs
[i
+0] = fp64toselectable(a
)
288 with
Program(lst
, bigendian
=False) as program
:
289 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
290 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
291 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
292 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
293 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
294 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
295 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
296 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
298 # outer iterative sum
299 res
= transform_outer_radix2(av
)
301 for i
, expected
in enumerate(res
):
302 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
303 for i
, expected
in enumerate(res
):
304 # convert to Power single
305 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
306 expected
= float(expected
)
307 actual
= float(sim
.fpr(i
))
308 # approximate error calculation, good enough test
309 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
310 # and the rounding is different
311 err
= abs((actual
- expected
) / expected
)
312 print ("err", i
, err
)
313 self
.assertTrue(err
< 1e-6)
315 def test_sv_remap_fpmadds_dct_8(self
):
316 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
317 "svshape 8, 1, 1, 2, 0",
318 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
319 "svshape 8, 1, 1, 3, 0",
320 "sv.fadds 0.v, 0.v, 0.v"
322 runs a full in-place 8-long O(N log2 N) DCT, both
323 inner and outer butterfly "REMAP" schedules.
325 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
326 "svshape 8, 1, 1, 2, 0",
327 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
328 "svshape 8, 1, 1, 3, 0",
329 "sv.fadds 0.v, 0.v, 0.v"
333 # array and coefficients to test
334 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
336 levels
= n
.bit_length() - 1
338 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
339 av
= halfrev2(avi
, False)
340 av
= [av
[ri
[i
]] for i
in range(n
)]
345 for i
in range(n
//size
):
346 for ci
in range(halfsize
):
347 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
352 for i
, a
in enumerate(av
):
353 fprs
[i
+0] = fp64toselectable(a
)
354 for i
, c
in enumerate(ctable
):
355 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
357 with
Program(lst
, bigendian
=False) as program
:
358 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
359 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
360 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
361 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
362 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
363 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
364 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
365 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
367 # outer iterative sum
368 res
= transform2(avi
)
370 for i
, expected
in enumerate(res
):
371 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
372 for i
, expected
in enumerate(res
):
373 # convert to Power single
374 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
375 expected
= float(expected
)
376 actual
= float(sim
.fpr(i
))
377 # approximate error calculation, good enough test
378 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
379 # and the rounding is different
380 err
= abs((actual
- expected
) / expected
)
381 print ("err", i
, err
)
382 self
.assertTrue(err
< 1e-5)
384 def test_sv_remap_dct_cos_precompute_8(self
):
385 """pre-computes a DCT COS table, deliberately using a lot of
386 registers so as to be able to see what is going on (dumping all
389 the simpler (scalar) version is in test_caller_transcendentals.py
390 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
391 need the new version of fcfids which doesn't spam memory with
394 lst
= SVP64Asm(["svshape 8, 1, 1, 2, 0",
395 "svremap 0, 0, 0, 2, 0, 1, 1",
396 "sv.svstep 4.v, 4, 1", # svstep get vector of ci
397 "sv.svstep 16.v, 3, 1", # svstep get vector of step
399 "setvl 0, 0, 12, 0, 1, 1",
402 "sv.fcfids 48.v, 64.v",
406 "sv.fcfids 24.v, 12.v",
407 "sv.fadds 0.v, 24.v, 43", # plus 0.5
408 "sv.fmuls 0.v, 0.v, 41", # times PI
409 "sv.fdivs 0.v, 0.v, 48.v", # div size
410 "sv.fcoss 80.v, 0.v",
411 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
418 fprs
[43] = fp64toselectable(0.5) # 0.5
419 fprs
[41] = fp64toselectable(math
.pi
) # pi
420 fprs
[44] = fp64toselectable(2.0) # 2.0
428 for i
in range(n
//size
):
429 for ci
in range(halfsize
):
430 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
433 with
Program(lst
, bigendian
=False) as program
:
434 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
438 for i
in range(len(ctable
)):
439 actual
= float(sim
.fpr(i
+24))
440 print ("i", i
, actual
)
442 for i
in range(len(ctable
)):
443 actual
= float(sim
.fpr(i
+48))
444 print ("i", i
, actual
)
446 for i
in range(len(ctable
)):
447 actual
= float(sim
.fpr(i
))
448 print ("i", i
, actual
)
449 for i
in range(len(ctable
)):
450 expected
= 1.0/ctable
[i
]
451 actual
= float(sim
.fpr(i
+80))
452 err
= abs((actual
- expected
) / expected
)
453 print ("i", i
, actual
, "1/expect", 1/expected
,
454 "expected", expected
,
456 self
.assertTrue(err
< 1e-6)
458 def test_sv_remap_dct_cos_precompute_inner_8(self
):
459 """pre-computes a DCT COS table, using the shorter costable
460 indices schedule. turns out, some COS values are repeated
461 in each layer of the DCT butterfly.
463 the simpler (scalar) version is in test_caller_transcendentals.py
464 (test_fp_coss_cvt), this is the SVP64 variant. TODO: really
465 need the new version of fcfids which doesn't spam memory with
468 lst
= SVP64Asm(["svshape 8, 1, 1, 5, 0",
469 "svremap 0, 0, 0, 2, 0, 1, 1",
470 "sv.svstep 4.v, 3, 1", # svstep get vector of ci
471 "sv.svstep 16.v, 2, 1", # svstep get vector of step
473 "setvl 0, 0, 7, 0, 1, 1",
476 "sv.fcfids 48.v, 64.v",
480 "sv.fcfids 24.v, 12.v",
481 "sv.fadds 0.v, 24.v, 43", # plus 0.5
482 "sv.fmuls 0.v, 0.v, 41", # times PI
483 "sv.fdivs 0.v, 0.v, 48.v", # div size
484 "sv.fcoss 80.v, 0.v",
485 "sv.fdivs 80.v, 43, 80.v", # div 0.5 / x
492 fprs
[43] = fp64toselectable(0.5) # 0.5
493 fprs
[41] = fp64toselectable(math
.pi
) # pi
494 fprs
[44] = fp64toselectable(2.0) # 2.0
502 for ci
in range(halfsize
):
503 coeff
= math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0
505 print ("coeff", "ci", ci
, "size", size
,
506 "i/n", (ci
+0.5), 1.0/coeff
)
509 with
Program(lst
, bigendian
=False) as program
:
510 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
514 for i
in range(len(ctable
)):
515 actual
= float(sim
.fpr(i
+24))
516 print ("i", i
, actual
)
518 for i
in range(len(ctable
)):
519 actual
= float(sim
.fpr(i
+48))
520 print ("i", i
, actual
)
522 for i
in range(len(ctable
)):
523 actual
= float(sim
.fpr(i
))
524 print ("i", i
, actual
)
525 for i
in range(len(ctable
)):
526 expected
= 1.0/ctable
[i
]
527 actual
= float(sim
.fpr(i
+80))
528 err
= abs((actual
- expected
) / expected
)
529 print ("i", i
, actual
, "1/expect", 1/expected
,
530 "expected", expected
,
532 self
.assertTrue(err
< 1e-6)
534 def test_sv_remap_fpmadds_dct_8_mode_4(self
):
535 """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
536 "svshape 8, 1, 1, 4, 0",
537 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
538 "svshape 8, 1, 1, 3, 0",
539 "sv.fadds 0.v, 0.v, 0.v"
541 runs a full in-place 8-long O(N log2 N) DCT, both
542 inner and outer butterfly "REMAP" schedules.
543 uses shorter tables: FRC also needs to be on a Schedule
545 lst
= SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1",
546 "svshape 8, 1, 1, 4, 0",
547 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
548 "svshape 8, 1, 1, 3, 0",
549 "sv.fadds 0.v, 0.v, 0.v"
553 # array and coefficients to test
554 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
556 levels
= n
.bit_length() - 1
558 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
559 av
= halfrev2(avi
, False)
560 av
= [av
[ri
[i
]] for i
in range(n
)]
565 for ci
in range(halfsize
):
566 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
571 for i
, a
in enumerate(av
):
572 fprs
[i
+0] = fp64toselectable(a
)
573 for i
, c
in enumerate(ctable
):
574 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
576 with
Program(lst
, bigendian
=False) as program
:
577 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
578 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
579 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
580 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
581 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
582 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
583 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
584 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
586 # outer iterative sum
587 res
= transform2(avi
)
589 for i
, expected
in enumerate(res
):
590 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
591 for i
, expected
in enumerate(res
):
592 # convert to Power single
593 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
594 expected
= float(expected
)
595 actual
= float(sim
.fpr(i
))
596 # approximate error calculation, good enough test
597 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
598 # and the rounding is different
599 err
= abs((actual
- expected
) / expected
)
600 print ("err", i
, err
)
601 self
.assertTrue(err
< 1e-5)
603 def run_tst_program(self
, prog
, initial_regs
=None,
607 if initial_regs
is None:
608 initial_regs
= [0] * 32
609 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
610 initial_fprs
=initial_fprs
,
621 if __name__
== "__main__":