1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
21 def transform_inner_radix2(vec
, ctable
):
26 print ("transform2", n
)
27 levels
= n
.bit_length() - 1
29 # reference (read/write) the in-place data in *reverse-bit-order*
31 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
33 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
34 # TODO: merge these two
35 vec
= halfrev2(vec
, False)
36 vec
= [vec
[ri
[i
]] for i
in range(n
)]
50 SVSHAPE0
.lims
= [xdim
, 2, zdim
]
52 SVSHAPE0
.submode2
= 0b01
54 SVSHAPE0
.offset
= 0 # experiment with different offset, here
55 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
58 SVSHAPE1
.lims
= [xdim
, 2, zdim
]
60 SVSHAPE1
.submode2
= 0b01
62 SVSHAPE1
.offset
= 0 # experiment with different offset, here
63 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
65 # enumerate over the iterator function, getting new indices
66 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
67 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
68 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
69 t1
, t2
= vec
[jl
], vec
[jh
]
72 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
73 print ("coeff", "ci", k
,
75 "i/n", (k
+0.5), 1.0/coeff
,
76 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
77 "end", bin(jle
), bin(jhe
))
78 if jle
== 0b111: # all loops end
83 def transform_outer_radix2(vec
):
88 print ("transform2", n
)
89 levels
= n
.bit_length() - 1
100 SVSHAPE0
.lims
= [xdim
, 3, zdim
]
101 SVSHAPE0
.submode2
= 0b100
104 SVSHAPE0
.offset
= 0 # experiment with different offset, here
105 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
106 # j+halfstep schedule
108 SVSHAPE1
.lims
= [xdim
, 3, zdim
]
110 SVSHAPE1
.submode2
= 0b100
112 SVSHAPE1
.offset
= 0 # experiment with different offset, here
113 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
115 # enumerate over the iterator function, getting new indices
116 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
117 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
118 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
119 print ("itersum jr", jl
, jh
,
120 "end", bin(jle
), bin(jhe
))
122 if jle
== 0b111: # all loops end
125 print("transform2 result", vec
)
130 class DCTTestCase(FHDLTestCase
):
132 def _check_regs(self
, sim
, expected
):
134 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
136 def test_sv_ffadds_dct(self
):
137 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
139 four in-place vector adds, four in-place vector mul-subs
141 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
142 FRS to perform the two multiplies. one add, one subtract.
144 sv.fdadds FRT, FRA, FRC, FRB actually does:
146 fsubs FRT+vl, FRA, FRB+vl
148 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
152 # cheat here with these values, they're selected so that
153 # rounding errors do not occur. sigh.
155 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
156 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
157 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
159 # work out the results with the twin add-sub
160 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
161 fprs
[i
+0] = fp64toselectable(a
)
162 fprs
[i
+4] = fp64toselectable(b
)
163 fprs
[i
+8] = fp64toselectable(c
)
164 # this isn't quite a perfect replication of the
165 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
166 # and FPSUB32 directly to be honest.
169 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
172 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
173 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
175 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
177 # SVSTATE (in this case, VL=2)
178 svstate
= SVP64State()
180 svstate
.maxvl
= 4 # MAXVL
181 print ("SVSTATE", bin(svstate
.asint()))
183 with
Program(lst
, bigendian
=False) as program
:
184 sim
= self
.run_tst_program(program
, svstate
=svstate
,
186 # confirm that the results are as expected
187 for i
, (t
, u
) in enumerate(res
):
188 a
= float(sim
.fpr(i
+0))
189 b
= float(sim
.fpr(i
+4))
192 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
193 for i
, (t
, u
) in enumerate(res
):
194 self
.assertEqual(sim
.fpr(i
+0), t
)
195 self
.assertEqual(sim
.fpr(i
+4), u
)
197 def test_sv_remap_fpmadds_dct_inner_4(self
):
198 """>>> lst = ["svshape 4, 1, 1, 2, 0",
199 "svremap 27, 1, 0, 2, 0, 1, 0",
200 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
202 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
205 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
206 (3 inputs, 2 outputs)
208 Note that the coefficient (FRC) is not on a "schedule", it
209 is straight Vectorised (0123...) because DCT coefficients
210 cannot be shared between butterfly layers (due to +0.5)
212 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
213 "svremap 27, 1, 0, 2, 0, 1, 0",
214 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
218 # array and coefficients to test
220 av
= [7.0, -9.8, 3.0, -32.3]
221 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
223 levels
= n
.bit_length() - 1
225 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
226 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
227 av
= halfrev2(avi
, False)
228 av
= [av
[ri
[i
]] for i
in range(n
)]
232 for i
, c
in enumerate(coe
):
233 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
234 for i
, a
in enumerate(av
):
235 fprs
[i
+0] = fp64toselectable(a
)
237 with
Program(lst
, bigendian
=False) as program
:
238 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
239 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
240 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
241 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
242 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
243 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
244 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
245 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
247 # work out the results with the twin mul/add-sub
248 res
= transform_inner_radix2(avi
, coe
)
250 for i
, expected
in enumerate(res
):
251 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
252 for i
, expected
in enumerate(res
):
253 # convert to Power single
254 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
255 expected
= float(expected
)
256 actual
= float(sim
.fpr(i
))
257 # approximate error calculation, good enough test
258 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
259 # and the rounding is different
260 err
= abs((actual
- expected
) / expected
)
261 print ("err", i
, err
)
262 self
.assertTrue(err
< 1e-6)
264 def test_sv_remap_fpmadds_dct_outer_8(self
):
265 """>>> lst = ["svshape 8, 1, 1, 3, 0",
266 "svremap 27, 1, 0, 2, 0, 1, 0",
267 "sv.fadds 0.v, 0.v, 0.v"
269 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
270 for DCT, does the iterative overlapped ADDs
272 SVP64 "REMAP" in Butterfly Mode.
274 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
275 "svremap 27, 1, 0, 2, 0, 1, 0",
276 "sv.fadds 0.v, 0.v, 0.v"
280 # array and coefficients to test
281 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
285 for i
, a
in enumerate(av
):
286 fprs
[i
+0] = fp64toselectable(a
)
288 with
Program(lst
, bigendian
=False) as program
:
289 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
290 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
291 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
292 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
293 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
294 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
295 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
296 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
298 # outer iterative sum
299 res
= transform_outer_radix2(av
)
301 for i
, expected
in enumerate(res
):
302 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
303 for i
, expected
in enumerate(res
):
304 # convert to Power single
305 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
306 expected
= float(expected
)
307 actual
= float(sim
.fpr(i
))
308 # approximate error calculation, good enough test
309 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
310 # and the rounding is different
311 err
= abs((actual
- expected
) / expected
)
312 print ("err", i
, err
)
313 self
.assertTrue(err
< 1e-6)
315 def test_sv_remap_fpmadds_dct_8(self
):
316 """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
317 "svshape 8, 1, 1, 2, 0",
318 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
319 "svshape 8, 1, 1, 3, 0",
320 "sv.fadds 0.v, 0.v, 0.v"
322 runs a full in-place 8-long O(N log2 N) DCT, both
323 inner and outer butterfly "REMAP" schedules.
325 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
326 "svshape 8, 1, 1, 2, 0",
327 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
328 "svshape 8, 1, 1, 3, 0",
329 "sv.fadds 0.v, 0.v, 0.v"
333 # array and coefficients to test
334 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
336 levels
= n
.bit_length() - 1
338 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
339 av
= halfrev2(avi
, False)
340 av
= [av
[ri
[i
]] for i
in range(n
)]
345 for i
in range(n
//size
):
346 for ci
in range(halfsize
):
347 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
352 for i
, a
in enumerate(av
):
353 fprs
[i
+0] = fp64toselectable(a
)
354 for i
, c
in enumerate(ctable
):
355 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
357 with
Program(lst
, bigendian
=False) as program
:
358 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
359 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
360 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
361 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
362 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
363 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
364 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
365 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
367 # outer iterative sum
368 res
= transform2(avi
)
370 for i
, expected
in enumerate(res
):
371 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
372 for i
, expected
in enumerate(res
):
373 # convert to Power single
374 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
375 expected
= float(expected
)
376 actual
= float(sim
.fpr(i
))
377 # approximate error calculation, good enough test
378 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
379 # and the rounding is different
380 err
= abs((actual
- expected
) / expected
)
381 print ("err", i
, err
)
382 self
.assertTrue(err
< 1e-5)
384 def test_sv_remap_dct_cos_8(self
):
385 lst
= SVP64Asm(["svshape 8, 1, 1, 2, 0",
386 "svremap 0, 0, 0, 2, 0, 1, 1",
387 "sv.svstep 4.v, 4, 1", # svstep get vector of ci
388 "sv.svstep 16.v, 3, 1", # svstep get vector of step
390 "setvl 0, 0, 12, 0, 1, 1",
395 "sv.fcfids 0.v, 0.v",
396 "sv.fadds 0.v, 0.v, 43", # plus 0.5
397 "sv.fmuls 0.v, 0.v, 41", # times PI
398 "sv.fdivs 0.v, 12.v, 0.v", # div size
399 "sv.fcoss 12.v, 0.v",
400 "sv.fdivs 12.v, 44, 12.v", # div 2.0 / x
407 fprs
[43] = fp64toselectable(0.5) # 0.5
408 fprs
[41] = fp64toselectable(math
.pi
) # pi
409 fprs
[42] = fp64toselectable(8.0) # 8.0
410 fprs
[44] = fp64toselectable(2.0) # 2.0
418 for i
in range(n
//size
):
419 for ci
in range(halfsize
):
420 ctable
.append(math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0)
423 with
Program(lst
, bigendian
=False) as program
:
424 sim
= self
.run_tst_program(program
, gprs
, initial_fprs
=fprs
)
428 def run_tst_program(self
, prog
, initial_regs
=None,
432 if initial_regs
is None:
433 initial_regs
= [0] * 32
434 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
435 initial_fprs
=initial_fprs
,
446 if __name__
== "__main__":