add DCT inner butterfly results test
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.power_decoder import (create_pdecode)
6 from openpower.simulator.program import Program
7 from openpower.decoder.isa.caller import SVP64State
8 from openpower.decoder.selectable_int import SelectableInt
9 from openpower.decoder.isa.test_caller import run_tst
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from copy import deepcopy
12 from openpower.decoder.helpers import fp64toselectable, SINGLE
13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
14 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
15 iterate_dct_inner_butterfly_indices)
16
17
def transform_inner_radix2(vec, ctable):
    """Software reference for the DCT *inner* butterfly stage.

    The input is first re-ordered with ``halfrev2(vec, False)`` and then
    with a bit-reversed index permutation.  Two index schedules are then
    drawn from ``iterate_dct_inner_butterfly_indices`` (one per SVSHAPE,
    differing only in ``skip``) and walked in lockstep.  For the k-th pair
    of indices ``(jl, jh)`` the add / scaled-subtract butterfly

        vec[jl] = t1 + t2
        vec[jh] = (t1 - t2) * (1/ctable[k])

    is applied in place on the re-ordered copy.

    vec    -- sequence of input samples (length assumed to be a power of
              two -- TODO confirm; the bit-reversal uses
              ``len(vec).bit_length() - 1`` bits)
    ctable -- coefficients, indexed by butterfly step number

    Returns the re-ordered, transformed list.  Progress is printed to
    stdout for debugging.
    """
    # Initialization
    size = len(vec)
    print ()
    print ("transform2", size)
    nbits = size.bit_length() - 1

    # the in-place data is referenced (read/write) in *reverse-bit-order*
    identity = list(range(size))
    bitrev = [identity[reverse_bits(pos, nbits)] for pos in range(size)]

    # pretend the data was LDed in half-swapped *and* bit-reversed order
    # TODO: merge these two
    half_swapped = halfrev2(vec, False)
    vec = [half_swapped[bitrev[pos]] for pos in range(size)]

    ################
    # INNER butterfly
    ################
    xdim, ydim, zdim = size, 0, 0

    # minimal stand-in for an SVSHAPE: a bare attribute holder
    class SVSHAPE:
        pass

    def build_shape(skip):
        # the two schedules share every setting except "skip"
        shape = SVSHAPE()
        shape.lims = [xdim, ydim, zdim]
        shape.order = [0,1,2] # experiment with different permutations, here
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0       # experiment with different offset, here
        shape.invxyz = [1,0,0] # inversion if desired
        return shape

    SVSHAPE0 = build_shape(0b00)    # j schedule
    SVSHAPE1 = build_shape(0b01)    # j+halfstep schedule

    # walk both index generators together, one butterfly per step
    lower = iterate_dct_inner_butterfly_indices(SVSHAPE0)
    upper = iterate_dct_inner_butterfly_indices(SVSHAPE1)
    for k, (lo_step, hi_step) in enumerate(zip(lower, upper)):
        jl, jle = lo_step
        jh, jhe = hi_step
        t1 = vec[jl]
        t2 = vec[jh]
        coeff = ctable[k]
        vec[jl] = t1 + t2
        vec[jh] = (t1 - t2) * (1/coeff)
        print ("coeff", "ci", k,
                "jl", jl, "jh", jh,
               "i/n", (k+0.5), coeff, vec[jl], vec[jh],
                "end", bin(jle), bin(jhe))
        # 0b111 in the end-flags of the first schedule: all loops end
        if jle == 0b111:
            break

    return vec
80
81
class DCTTestCase(FHDLTestCase):
    """Simulator tests for the SVP64 ``sv.fdmadds`` DCT butterfly.

    Each test assembles a short SVP64 program, preloads the FP register
    file, runs the program through ``run_tst`` and compares the resulting
    FPRs against a reference computed in Python.
    """

    def _check_regs(self, sim, expected):
        """Assert that all 32 GPRs of *sim* equal *expected* (64-bit)."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    def tst_sv_ffadds_dct(self):
        """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ]
            four in-place vector adds, four in-place vector mul-subs

            SVP64 "DCT" mode will *automatically* offset FRB and an implicit
            FRS to perform the two multiplies.  one add, one subtract.

            sv.fdadds FRT, FRA, FRC, FRB actually does:
                fadds  FRT   , FRB, FRA
                fsubs  FRT+vl, FRA, FRB+vl

            NOTE: the method name starts with "tst_", not "test", so the
            default unittest loader does not collect it.
        """
        lst = SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # cheat here with these values, they're selected so that
        # rounding errors do not occur. sigh.
        fprs = [0] * 32
        av = [7.0, -0.8, 2.0, -2.3] # first half of array 0..3
        bv = [-2.0, 2.0, -0.8, 1.4] # second half of array 4..7
        cv = [-1.0, 0.5, 2.5, -0.25] # coefficients
        res = []
        # work out the results with the twin add-sub
        for i, (a, b, c) in enumerate(zip(av, bv, cv)):
            fprs[i+0] = fp64toselectable(a)
            fprs[i+4] = fp64toselectable(b)
            fprs[i+8] = fp64toselectable(c)
            # this isn't quite a perfect replication of the
            # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
            # and FPSUB32 directly to be honest.
            t = b + a
            diff = (b - a)
            diff = DOUBLE2SINGLE(fp64toselectable(diff)) # FP32 round
            diff = float(diff)
            u = diff * c
            tc = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
            uc = DOUBLE2SINGLE(fp64toselectable(u)) # from double
            res.append((tc, uc))
            print ("DCT", i, "in", a, b, "c", c, "res", t, u)

        # SVSTATE (in this case, VL=4)
        svstate = SVP64State()
        svstate.vl = 4 # VL
        svstate.maxvl = 4 # MAXVL
        print ("SVSTATE", bin(svstate.asint()))

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            # show simulator register contents next to the expected values
            for i, (t, u) in enumerate(res):
                a = float(sim.fpr(i+0))
                b = float(sim.fpr(i+4))
                t = float(t)
                u = float(u)
                print ("DCT", i, "in", a, b, "res", t, u)
            # confirm that the results are as expected
            for i, (t, u) in enumerate(res):
                self.assertEqual(sim.fpr(i+0), t)
                self.assertEqual(sim.fpr(i+4), u)

    def test_sv_remap_fpmadds_dct(self):
        """>>> lst = ["svshape 4, 1, 1, 2, 0",
                     "svremap 31, 1, 0, 2, 0, 1, 0",
                      "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                     ]
            runs a full in-place O(N log2 N) butterfly schedule for
            DCT

            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
            (3 inputs, 2 outputs)
        """
        lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0",
                         "svremap 31, 1, 0, 2, 0, 1, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        n = 4
        avi = [7.0, -0.8, 2.0, -2.3]  # input vector, natural order
        coe = [-0.25, 0.5, 3.1, 6.2] # 4 coefficients

        # apply the same half-swap and bit-reversal re-ordering that
        # transform_inner_radix2 performs on its input, so the register
        # file starts from the same data as the reference
        levels = n.bit_length() - 1
        ri = list(range(n))
        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
        av = halfrev2(avi, False)
        av = [av[ri[i]] for i in range(n)]

        # store in regfile: data in FPRs 0..3, coefficients in FPRs 8..11
        fprs = [0] * 32
        for i, c in enumerate(coe):
            fprs[i+8] = fp64toselectable(c)
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            print ("spr svshape0", sim.spr['SVSHAPE0'])
            print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
            print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
            print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
            print ("spr svshape1", sim.spr['SVSHAPE1'])
            print ("spr svshape2", sim.spr['SVSHAPE2'])
            print ("spr svshape3", sim.spr['SVSHAPE3'])

            # work out the results with the twin mul/add-sub
            res = transform_inner_radix2(avi, coe)

            for i, expected in enumerate(res):
                print ("i", i, float(sim.fpr(i)), "expected", expected)
            for i, expected in enumerate(res):
                # convert to Power single
                expected = DOUBLE2SINGLE(fp64toselectable(expected))
                expected = float(expected)
                actual = float(sim.fpr(i))
                # approximate error calculation, good enough test
                # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
                # and the rounding is different
                if expected == 0.0:
                    # relative error is undefined for a zero reference:
                    # fall back to the absolute error instead of raising
                    # ZeroDivisionError
                    err = abs(actual)
                else:
                    err = abs((actual - expected) / expected)
                print ("err", i, err)
                self.assertLess(err, 1e-7,
                                "FPR %d: actual %r expected %r" %
                                (i, actual, expected))

    def run_tst_program(self, prog, initial_regs=None,
                              svstate=None,
                              initial_mem=None,
                              initial_fprs=None):
        """Run *prog* through ``run_tst`` and return the simulator.

        initial_regs defaults to 32 zeroed GPRs; svstate, initial_mem and
        initial_fprs are passed through to ``run_tst`` unchanged.  The
        GPR and FPR files are dumped after the run, as a debugging aid.
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        simulator = run_tst(prog, initial_regs, mem=initial_mem,
                                                initial_fprs=initial_fprs,
                                                svstate=svstate)

        print ("GPRs")
        simulator.gpr.dump()
        print ("FPRs")
        simulator.fpr.dump()

        return simulator
228
229
# run the test cases in this module when it is executed as a script
if __name__ == "__main__":
    unittest.main()