1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
5 from openpower
.decoder
.power_decoder
import (create_pdecode
)
6 from openpower
.simulator
.program
import Program
7 from openpower
.decoder
.isa
.caller
import SVP64State
8 from openpower
.decoder
.selectable_int
import SelectableInt
9 from openpower
.decoder
.isa
.test_caller
import run_tst
10 from openpower
.sv
.trans
.svp64
import SVP64Asm
11 from copy
import deepcopy
12 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
13 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
14 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
15 iterate_dct_inner_butterfly_indices
,
16 iterate_dct_outer_butterfly_indices
)
19 def transform_inner_radix2(vec
, ctable
):
24 print ("transform2", n
)
25 levels
= n
.bit_length() - 1
27 # reference (read/write) the in-place data in *reverse-bit-order*
29 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
31 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
32 # TODO: merge these two
33 vec
= halfrev2(vec
, False)
34 vec
= [vec
[ri
[i
]] for i
in range(n
)]
48 SVSHAPE0
.lims
= [xdim
, ydim
, zdim
]
49 SVSHAPE0
.order
= [0,1,2] # experiment with different permutations, here
51 SVSHAPE0
.submode2
= 0b01
53 SVSHAPE0
.offset
= 0 # experiment with different offset, here
54 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
57 SVSHAPE1
.lims
= [xdim
, ydim
, zdim
]
58 SVSHAPE1
.order
= [0,1,2] # experiment with different permutations, here
60 SVSHAPE1
.submode2
= 0b01
62 SVSHAPE1
.offset
= 0 # experiment with different offset, here
63 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
65 # enumerate over the iterator function, getting new indices
66 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
67 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
68 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
69 t1
, t2
= vec
[jl
], vec
[jh
]
72 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
73 print ("coeff", "ci", k
,
75 "i/n", (k
+0.5), 1.0/coeff
,
76 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
77 "end", bin(jle
), bin(jhe
))
78 if jle
== 0b111: # all loops end
83 def transform_outer_radix2(vec
):
88 print ("transform2", n
)
89 levels
= n
.bit_length() - 1
100 SVSHAPE0
.lims
= [xdim
, ydim
, zdim
]
101 SVSHAPE0
.submode2
= 0b100
104 SVSHAPE0
.offset
= 0 # experiment with different offset, here
105 SVSHAPE0
.invxyz
= [0,0,0] # inversion if desired
106 # j+halfstep schedule
108 SVSHAPE1
.lims
= [xdim
, ydim
, zdim
]
110 SVSHAPE1
.submode2
= 0b100
112 SVSHAPE1
.offset
= 0 # experiment with different offset, here
113 SVSHAPE1
.invxyz
= [0,0,0] # inversion if desired
115 # enumerate over the iterator function, getting new indices
116 i0
= iterate_dct_outer_butterfly_indices(SVSHAPE0
)
117 i1
= iterate_dct_outer_butterfly_indices(SVSHAPE1
)
118 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
119 print ("itersum jr", jl
, jh
,
120 "end", bin(jle
), bin(jhe
))
122 if jle
== 0b111: # all loops end
125 print("transform2 result", vec
)
130 class DCTTestCase(FHDLTestCase
):
132 def _check_regs(self
, sim
, expected
):
134 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
136 def test_sv_ffadds_dct(self
):
137 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
139 four in-place vector adds, four in-place vector mul-subs
141 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
142 FRS to perform the two multiplies. one add, one subtract.
144 sv.fdmadds FRT, FRA, FRC, FRB actually does:
146 fsubs FRT+vl, FRA, FRB+vl
148 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
152 # cheat here with these values, they're selected so that
153 # rounding errors do not occur. sigh.
155 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
156 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
157 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
159 # work out the results with the twin add-sub
160 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
161 fprs
[i
+0] = fp64toselectable(a
)
162 fprs
[i
+4] = fp64toselectable(b
)
163 fprs
[i
+8] = fp64toselectable(c
)
164 # this isn't quite a perfect replication of the
165 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
166 # and FPSUB32 directly to be honest.
169 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
172 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
173 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
175 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
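177 # SVSTATE (MAXVL is set to 4 below, matching the 4-element vectors; NOTE(review): the earlier "VL=2" remark looks stale - confirm VL)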
178 svstate
= SVP64State()
180 svstate
.maxvl
= 4 # MAXVL
181 print ("SVSTATE", bin(svstate
.asint()))
183 with
Program(lst
, bigendian
=False) as program
:
184 sim
= self
.run_tst_program(program
, svstate
=svstate
,
186 # confirm that the results are as expected
187 for i
, (t
, u
) in enumerate(res
):
188 a
= float(sim
.fpr(i
+0))
189 b
= float(sim
.fpr(i
+4))
192 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
193 for i
, (t
, u
) in enumerate(res
):
194 self
.assertEqual(sim
.fpr(i
+0), t
)
195 self
.assertEqual(sim
.fpr(i
+4), u
)
197 def test_sv_remap_fpmadds_dct_inner_4(self
):
198 """>>> lst = ["svshape 4, 1, 1, 2, 0",
199 "svremap 27, 1, 0, 2, 0, 1, 0",
200 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
202 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
205 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
206 (3 inputs, 2 outputs)
208 Note that the coefficient (FRC) is not on a "schedule", it
209 is straight Vectorised (0123...) because DCT coefficients
210 cannot be shared between butterfly layers (due to +0.5)
212 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
213 "svremap 27, 1, 0, 2, 0, 1, 0",
214 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
218 # array and coefficients to test
220 av
= [7.0, -9.8, 3.0, -32.3]
221 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
223 levels
= n
.bit_length() - 1
225 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
226 avi
= [7.0, -0.8, 2.0, -2.3] # full 4-long input array
227 av
= halfrev2(avi
, False)
228 av
= [av
[ri
[i
]] for i
in range(n
)]
232 for i
, c
in enumerate(coe
):
233 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
234 for i
, a
in enumerate(av
):
235 fprs
[i
+0] = fp64toselectable(a
)
237 with
Program(lst
, bigendian
=False) as program
:
238 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
239 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
240 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
241 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
242 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
243 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
244 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
245 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
247 # work out the results with the twin mul/add-sub
248 res
= transform_inner_radix2(avi
, coe
)
250 for i
, expected
in enumerate(res
):
251 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
252 for i
, expected
in enumerate(res
):
253 # convert to Power single
254 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
255 expected
= float(expected
)
256 actual
= float(sim
.fpr(i
))
257 # approximate error calculation, good enough test
258 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
259 # and the rounding is different
260 err
= abs((actual
- expected
) / expected
)
261 print ("err", i
, err
)
262 self
.assertTrue(err
< 1e-6)
264 def test_sv_remap_fpmadds_dct_outer_8(self
):
265 """>>> lst = ["svshape 8, 1, 1, 3, 0",
266 "svremap 27, 1, 0, 2, 0, 1, 0",
267 "sv.fadds 0.v, 0.v, 0.v"
269 runs a full in-place 8-long O(N log2 N) outer butterfly schedule
270 for DCT, does the iterative overlapped ADDs
272 SVP64 "REMAP" in Butterfly Mode.
274 lst
= SVP64Asm( ["svshape 8, 1, 1, 3, 0",
275 "svremap 27, 1, 0, 2, 0, 1, 0",
276 "sv.fadds 0.v, 0.v, 0.v"
280 # array and coefficients to test
281 av
= [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
285 for i
, a
in enumerate(av
):
286 fprs
[i
+0] = fp64toselectable(a
)
288 with
Program(lst
, bigendian
=False) as program
:
289 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
290 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
291 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
292 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
293 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
294 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
295 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
296 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
298 # outer iterative sum
299 res
= transform_outer_radix2(av
)
301 for i
, expected
in enumerate(res
):
302 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
303 for i
, expected
in enumerate(res
):
304 # convert to Power single
305 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
306 expected
= float(expected
)
307 actual
= float(sim
.fpr(i
))
308 # approximate error calculation, good enough test
309 # reason: this test only issues sv.fadds (no FMAC); the reference is computed
310 # on Python floats and rounded to single afterwards, so the rounding can differ
311 err
= abs((actual
- expected
) / expected
)
312 print ("err", i
, err
)
313 self
.assertTrue(err
< 1e-6)
315 def run_tst_program(self
, prog
, initial_regs
=None,
319 if initial_regs
is None:
320 initial_regs
= [0] * 32
321 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
322 initial_fprs
=initial_fprs
,
333 if __name__
== "__main__":