1 from nmigen
import Module
, Signal
2 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
3 from nmutil
.formaltest
import FHDLTestCase
5 from openpower
.decoder
.power_decoder
import (create_pdecode
)
6 from openpower
.simulator
.program
import Program
7 from openpower
.decoder
.isa
.caller
import SVP64State
8 from openpower
.decoder
.selectable_int
import SelectableInt
9 from openpower
.decoder
.isa
.test_caller
import run_tst
10 from openpower
.sv
.trans
.svp64
import SVP64Asm
11 from copy
import deepcopy
12 from openpower
.decoder
.helpers
import fp64toselectable
, SINGLE
13 from openpower
.decoder
.isafunctions
.double2single
import DOUBLE2SINGLE
14 from openpower
.decoder
.isa
.remap_dct_yield
import (halfrev2
, reverse_bits
,
15 iterate_dct_inner_butterfly_indices
)
18 def transform_inner_radix2(vec
, ctable
):
23 print ("transform2", n
)
24 levels
= n
.bit_length() - 1
26 # reference (read/write) the in-place data in *reverse-bit-order*
28 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
30 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
31 # TODO: merge these two
32 vec
= halfrev2(vec
, False)
33 vec
= [vec
[ri
[i
]] for i
in range(n
)]
47 SVSHAPE0
.lims
= [xdim
, ydim
, zdim
]
48 SVSHAPE0
.order
= [0,1,2] # experiment with different permutations, here
50 SVSHAPE0
.submode2
= 0b01
52 SVSHAPE0
.offset
= 0 # experiment with different offset, here
53 SVSHAPE0
.invxyz
= [1,0,0] # inversion if desired
56 SVSHAPE1
.lims
= [xdim
, ydim
, zdim
]
57 SVSHAPE1
.order
= [0,1,2] # experiment with different permutations, here
59 SVSHAPE1
.submode2
= 0b01
61 SVSHAPE1
.offset
= 0 # experiment with different offset, here
62 SVSHAPE1
.invxyz
= [1,0,0] # inversion if desired
64 # enumerate over the iterator function, getting new indices
65 i0
= iterate_dct_inner_butterfly_indices(SVSHAPE0
)
66 i1
= iterate_dct_inner_butterfly_indices(SVSHAPE1
)
67 for k
, ((jl
, jle
), (jh
, jhe
)) in enumerate(zip(i0
, i1
)):
68 t1
, t2
= vec
[jl
], vec
[jh
]
71 vec
[jh
] = (t1
- t2
) * (1.0/coeff
)
72 print ("coeff", "ci", k
,
74 "i/n", (k
+0.5), 1.0/coeff
,
75 "t1, t2", t1
, t2
, "res", vec
[jl
], vec
[jh
],
76 "end", bin(jle
), bin(jhe
))
77 if jle
== 0b111: # all loops end
83 class DCTTestCase(FHDLTestCase
):
85 def _check_regs(self
, sim
, expected
):
87 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64))
89 def test_sv_ffadds_dct(self
):
90 """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
92 four in-place vector adds, four in-place vector mul-subs
94 SVP64 "DCT" mode will *automatically* offset FRB and an implicit
95 FRS to perform the two multiplies. one add, one subtract.
97 sv.fdadds FRT, FRA, FRC, FRB actually does:
99 fsubs FRT+vl, FRA, FRB+vl
101 lst
= SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
105 # cheat here with these values, they're selected so that
106 # rounding errors do not occur. sigh.
108 av
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
109 bv
= [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
110 cv
= [-1.0, 0.5, 2.5, -0.25] # coefficients
112 # work out the results with the twin add-sub
113 for i
, (a
, b
, c
) in enumerate(zip(av
, bv
, cv
)):
114 fprs
[i
+0] = fp64toselectable(a
)
115 fprs
[i
+4] = fp64toselectable(b
)
116 fprs
[i
+8] = fp64toselectable(c
)
117 # this isn't quite a perfect replication of the
118 # FP32 mul-add-sub. better really to use FPMUL32, FPADD32
119 # and FPSUB32 directly to be honest.
122 diff
= DOUBLE2SINGLE(fp64toselectable(diff
)) # FP32 round
125 tc
= DOUBLE2SINGLE(fp64toselectable(t
)) # convert to Power single
126 uc
= DOUBLE2SINGLE(fp64toselectable(u
)) # from double
128 print ("DCT", i
, "in", a
, b
, "c", c
, "res", t
, u
)
130 # SVSTATE (in this case, VL=2)
131 svstate
= SVP64State()
133 svstate
.maxvl
= 4 # MAXVL
134 print ("SVSTATE", bin(svstate
.asint()))
136 with
Program(lst
, bigendian
=False) as program
:
137 sim
= self
.run_tst_program(program
, svstate
=svstate
,
139 # confirm that the results are as expected
140 for i
, (t
, u
) in enumerate(res
):
141 a
= float(sim
.fpr(i
+0))
142 b
= float(sim
.fpr(i
+4))
145 print ("DCT", i
, "in", a
, b
, "res", t
, u
)
146 for i
, (t
, u
) in enumerate(res
):
147 self
.assertEqual(sim
.fpr(i
+0), t
)
148 self
.assertEqual(sim
.fpr(i
+4), u
)
150 def test_sv_remap_fpmadds_dct_4(self
):
151 """>>> lst = ["svshape 4, 1, 1, 2, 0",
152 "svremap 27, 1, 0, 2, 0, 1, 0",
153 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
155 runs a full in-place 4-long O(N log2 N) inner butterfly schedule
158 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
159 (3 inputs, 2 outputs)
161 Note that the coefficient (FRC) is not on a "schedule", it
162 is straight Vectorised (0123...) because DCT coefficients
163 cannot be shared between butterfly layers (due to +0.5)
165 lst
= SVP64Asm( ["svshape 4, 1, 1, 2, 0",
166 "svremap 27, 1, 0, 2, 0, 1, 0",
167 "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
171 # array and coefficients to test
173 av
= [7.0, -9.8, 3.0, -32.3]
174 coe
= [-0.25, 0.5, 3.1, 6.2] # 4 coefficients
176 levels
= n
.bit_length() - 1
178 ri
= [ri
[reverse_bits(i
, levels
)] for i
in range(n
)]
179 avi
= [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
180 av
= halfrev2(avi
, False)
181 av
= [av
[ri
[i
]] for i
in range(n
)]
185 for i
, c
in enumerate(coe
):
186 fprs
[i
+8] = fp64toselectable(1.0 / c
) # invert
187 for i
, a
in enumerate(av
):
188 fprs
[i
+0] = fp64toselectable(a
)
190 with
Program(lst
, bigendian
=False) as program
:
191 sim
= self
.run_tst_program(program
, initial_fprs
=fprs
)
192 print ("spr svshape0", sim
.spr
['SVSHAPE0'])
193 print (" xdimsz", sim
.spr
['SVSHAPE0'].xdimsz
)
194 print (" ydimsz", sim
.spr
['SVSHAPE0'].ydimsz
)
195 print (" zdimsz", sim
.spr
['SVSHAPE0'].zdimsz
)
196 print ("spr svshape1", sim
.spr
['SVSHAPE1'])
197 print ("spr svshape2", sim
.spr
['SVSHAPE2'])
198 print ("spr svshape3", sim
.spr
['SVSHAPE3'])
200 # work out the results with the twin mul/add-sub
201 res
= transform_inner_radix2(avi
, coe
)
203 for i
, expected
in enumerate(res
):
204 print ("i", i
, float(sim
.fpr(i
)), "expected", expected
)
205 for i
, expected
in enumerate(res
):
206 # convert to Power single
207 expected
= DOUBLE2SINGLE(fp64toselectable(expected
))
208 expected
= float(expected
)
209 actual
= float(sim
.fpr(i
))
210 # approximate error calculation, good enough test
211 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
212 # and the rounding is different
213 err
= abs((actual
- expected
) / expected
)
214 print ("err", i
, err
)
215 self
.assertTrue(err
< 1e-6)
217 def run_tst_program(self
, prog
, initial_regs
=None,
221 if initial_regs
is None:
222 initial_regs
= [0] * 32
223 simulator
= run_tst(prog
, initial_regs
, mem
=initial_mem
,
224 initial_fprs
=initial_fprs
,
235 if __name__
== "__main__":