1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
4 from openpower
.decoder
.power_decoder
import (create_pdecode
)
5 from openpower
.simulator
.program
import Program
6 from openpower
.decoder
.isa
.caller
import SVP64State
7 from openpower
.decoder
.selectable_int
import SelectableInt
8 from openpower
.decoder
.isa
.test_caller
import run_tst
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
10 from copy
import deepcopy
11 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
12 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
13 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
14 iterate_dct_inner_butterfly_indices
,
15 iterate_dct_outer_butterfly_indices
,
21 def transform_inner_radix2(vec
, ctable
):
26 print ("transform2", n
)
27 levels
= n
.bit_length() - 1
29 # reference (read/write) the in-place data in *reverse-bit-order*
31 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
33 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
34 # TODO: merge these two
35 vec
= halfrev2(vec
, False)
36 vec
= [vec
[ri
[i
]] for i
in range(n
)]
50 SVSHAPE0
.lims
= [xdim
, ydim
, zdim
]
51 SVSHAPE0
.order
= [0,1,2] # experiment with different permutations, here
53 SVSHAPE0
.submode2
= 0b01
55 SVSHAPE0
.offset
= 0 # experiment with different offset, here
56 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
59 SVSHAPE1
.lims
= [xdim
, ydim
, zdim
]
60 SVSHAPE1
.order
= [0,1,2] # experiment with different permutations, here
62 SVSHAPE1
.submode2
= 0b01
64 SVSHAPE1
.offset
= 0 # experiment with different offset, here
65 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
67 # enumerate over the iterator function, getting new indices
68 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
69 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
70 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
71 t1
, t2
= vec
[jl
], vec
[jh
]
74 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
75 print ("coeff", "ci", k
,
77 "i/n", (k
+0.5), 1.0/coeff
,
78 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
79 "end", bin(jle
), bin(jhe
))
80 if jle
== 0b111: # all loops end
85 def transform_outer_radix2(vec
):
90 print ("transform2", n
)
91 levels
= n
.bit_length() - 1
102 SVSHAPE0
.lims
= [xdim
, ydim
, zdim
]
103 SVSHAPE0
.submode2
= 0b100
106 SVSHAPE0
.offset
= 0 # experiment with different offset, here
107 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
108 # j+halfstep schedule
110 SVSHAPE1
.lims
= [xdim
, ydim
, zdim
]
112 SVSHAPE1
.submode2
= 0b100
114 SVSHAPE1
.offset
= 0 # experiment with different offset, here
115 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
117 # enumerate over the iterator function, getting new indices
118 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
119 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
120 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
121 print ("itersum jr", jl
, jh
,
122 "end", bin(jle
), bin(jhe
))
124 if jle
== 0b111: # all loops end
127 print("transform2 result", vec
)
132 class DCTTestCase(FHDLTestCase
):
134 def _check_regs(self
, sim
, expected
):
136 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
138 def test_sv_ffadds_dct(self
):
139 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
141 four in-place vector adds, four in-place vector mul-subs
143 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
144 FRS to perform the two multiplies. one add, one subtract.
146 sv.fdadds FRT, FRA, FRC, FRB actually does:
148 fsubs FRT+vl, FRA, FRB+vl
150 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
154 # cheat here with these values, they're selected so that
155 # rounding errors do not occur. sigh.
157 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
158 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
159 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
161 # work out the results with the twin add-sub
162 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
163 fprs
[i
+0] = fp64toselectable(a
)
164 fprs
[i
+4] = fp64toselectable(b
)
165 fprs
[i
+8] = fp64toselectable(c
)
166 # this isn't quite a perfect replication of the
167 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
168 # and FPSUB32 directly to be honest.
171 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
174 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
175 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
177 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
179 # SVSTATE (in this case, VL=2)
180 svstate
= SVP64State()
182 svstate
.maxvl
= 4 # MAXVL
183 print ("SVSTATE", bin(svstate
.asint()))
185 with
Program(lst
, bigendian
=False) as program
:
186 sim
= self
.run_tst_program(program
, svstate
=svstate
,
188 # confirm that the results are as expected
189 for i
, (t
, u
) in enumerate(res
):
190 a
= float(sim
.fpr(i
+0))
191 b
= float(sim
.fpr(i
+4))
194 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
195 for i
, (t
, u
) in enumerate(res
):
196 self
.assertEqual(sim
.fpr(i
+0), t
)
197 self
.assertEqual(sim
.fpr(i
+4), u
)
199 def test_sv_remap_fpmadds_dct_inner_4(self
):
200 """>>> lst = ["svshape 4, 1, 1, 2, 0",
201 "svremap 27, 1, 0, 2, 0, 1, 0",
202 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
204 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
207 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
208 (3 inputs, 2 outputs)
210 Note that the coefficient (FRC) is not on a "schedule", it
211 is straight Vectorised (0123...) because DCT coefficients
212 cannot be shared between butterfly layers (due to +0.5)
214 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
215 "svremap 27, 1, 0, 2, 0, 1, 0",
216 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
220 # array and coefficients to test
222 av
= [7.0, -9.8, 3.0, -32.3]
223 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
225 levels
= n
.bit_length() - 1
227 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
228 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
229 av
= halfrev2(avi
, False)
230 av
= [av
[ri
[i
]] for i
in range(n
)]
234 for i
, c
in enumerate(coe
):
235 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
236 for i
, a
in enumerate(av
):
237 fprs
[i
+0] = fp64toselectable(a
)
239 with
Program(lst
, bigendian
=False) as program
:
240 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
241 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
242 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
243 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
244 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
245 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
246 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
247 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
249 # work out the results with the twin mul/add-sub
250 res
= transform_inner_radix2(avi
, coe
)
252 for i
, expected
in enumerate(res
):
253 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
254 for i
, expected
in enumerate(res
):
255 # convert to Power single
256 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
257 expected
= float(expected
)
258 actual
= float(sim
.fpr(i
))
259 # approximate error calculation, good enough test
260 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
261 # and the rounding is different
262 err
= abs((actual
- expected
) / expected
)
263 print ("err", i
, err
)
264 self
.assertTrue(err
< 1e-6)
266 def test_sv_remap_fpmadds_dct_outer_8(self
):
267 """>>> lst = ["svshape 8, 1, 1, 3, 0",
268 "svremap 27, 1, 0, 2, 0, 1, 0",
269 "sv.fadds 0.v, 0.v, 0.v"
271 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
272 for DCT, does the iterative overlapped ADDs
274 SVP64 "REMAP" in Butterfly Mode.
276 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
277 "svremap 27, 1, 0, 2, 0, 1, 0",
278 "sv.fadds 0.v, 0.v, 0.v"
282 # array and coefficients to test
283 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
287 for i
, a
in enumerate(av
):
288 fprs
[i
+0] = fp64toselectable(a
)
290 with
Program(lst
, bigendian
=False) as program
:
291 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
292 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
293 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
294 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
295 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
296 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
297 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
298 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
300 # outer iterative sum
301 res
= transform_outer_radix2(av
)
303 for i
, expected
in enumerate(res
):
304 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
305 for i
, expected
in enumerate(res
):
306 # convert to Power single
307 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
308 expected
= float(expected
)
309 actual
= float(sim
.fpr(i
))
310 # approximate error calculation, good enough test
311 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
312 # and the rounding is different
313 err
= abs((actual
- expected
) / expected
)
314 print ("err", i
, err
)
315 self
.assertTrue(err
< 1e-6)
317 def test_sv_remap_fpmadds_dct_8(self
):
318 """>>> lst = ["svshape 8, 1, 1, 3, 0",
319 "svremap 27, 1, 0, 2, 0, 1, 0",
320 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
321 "sv.fadds 0.v, 0.v, 0.v"
323 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
324 for DCT, does the iterative overlapped ADDs
326 SVP64 "REMAP" in Butterfly Mode.
328 lst
= SVP64Asm( ["svremap 27, 1, 0, 2, 0, 1, 1",
329 "svshape 8, 1, 1, 2, 0",
330 "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
331 "svshape 8, 1, 1, 3, 0",
332 "sv.fadds 0.v, 0.v, 0.v"
336 # array and coefficients to test
337 avi
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
339 levels
= n
.bit_length() - 1
341 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
342 av
= halfrev2(avi
, False)
343 av
= [av
[ri
[i
]] for i
in range(n
)]
348 for i
in range(n
//size
):
349 for ci
in range(halfsize
):
350 ctable
.append((math
.cos((ci
+ 0.5) * math
.pi
/ size
) * 2.0))
355 for i
, a
in enumerate(av
):
356 fprs
[i
+0] = fp64toselectable(a
)
357 for i
, c
in enumerate(ctable
):
358 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
360 with
Program(lst
, bigendian
=False) as program
:
361 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
362 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
363 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
364 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
365 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
366 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
367 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
368 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
370 # outer iterative sum
371 res
= transform2(avi
)
373 for i
, expected
in enumerate(res
):
374 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
375 for i
, expected
in enumerate(res
):
376 # convert to Power single
377 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
378 expected
= float(expected
)
379 actual
= float(sim
.fpr(i
))
380 # approximate error calculation, good enough test
381 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
382 # and the rounding is different
383 err
= abs((actual
- expected
) / expected
)
384 print ("err", i
, err
)
385 self
.assertTrue(err
< 1e-5)
387 def run_tst_program(self
, prog
, initial_regs
=None,
391 if initial_regs
is None:
392 initial_regs
= [0] * 32
393 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
394 initial_fprs
=initial_fprs
,
405 if __name__
== "__main__":