d7f9ae9a84ce7ece0fd0c63ad704a7fb4dd89e2b
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_fft.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.power_decoder import (create_pdecode)
6 from openpower.simulator.program import Program
7 from openpower.decoder.isa.caller import SVP64State
8 from openpower.decoder.selectable_int import SelectableInt
9 from openpower.decoder.isa.test_caller import run_tst
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from copy import deepcopy
12 from openpower.decoder.helpers import fp64toselectable
13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
14
15
16 def transform_radix2(vec, exptable):
17 """
18 # FFT and convolution test (Python), based on Project Nayuki
19 #
20 # Copyright (c) 2020 Project Nayuki. (MIT License)
21 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
22
23 """
24 # bits of the integer 'val'.
25 def reverse_bits(val, width):
26 result = 0
27 for _ in range(width):
28 result = (result << 1) | (val & 1)
29 val >>= 1
30 return result
31
32 # Initialization
33 n = len(vec)
34 levels = n.bit_length() - 1
35
36 # Copy with bit-reversed permutation
37 #vec = [vec[reverse_bits(i, levels)] for i in range(n)]
38
39 size = 2
40 while size <= n:
41 halfsize = size // 2
42 tablestep = n // size
43 for i in range(0, n, size):
44 k = 0
45 for j in range(i, i + halfsize):
46 # exact same actual computation, just embedded in
47 # triple-nested for-loops
48 jl, jh = j, j+halfsize
49 vjh = vec[jh]
50 temp1 = vec[jh] * exptable[k]
51 temp2 = vec[jl]
52 vec[jh] = temp2 - temp1
53 vec[jl] = temp2 + temp1
54 print ("xform jl jh k", jl, jh, k,
55 "vj vjh ek", temp2, vjh, exptable[k],
56 "t1, t2", temp1, temp2,
57 "v[jh] v[jl]", vec[jh], vec[jl])
58 k += tablestep
59 size *= 2
60
61 return vec
62
63
64 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i):
65 """
66 # FFT and convolution test (Python), based on Project Nayuki
67 #
68 # Copyright (c) 2020 Project Nayuki. (MIT License)
69 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
70
71 """
72 # bits of the integer 'val'.
73 def reverse_bits(val, width):
74 result = 0
75 for _ in range(width):
76 result = (result << 1) | (val & 1)
77 val >>= 1
78 return result
79
80 # Initialization
81 n = len(vec_r)
82 levels = n.bit_length() - 1
83
84 # Copy with bit-reversed permutation
85 #vec = [vec[reverse_bits(i, levels)] for i in range(n)]
86
87 size = 2
88 while size <= n:
89 halfsize = size // 2
90 tablestep = n // size
91 for i in range(0, n, size):
92 k = 0
93 for j in range(i, i + halfsize):
94 # exact same actual computation, just embedded in
95 # triple-nested for-loops
96 jl, jh = j, j+halfsize
97
98 print ("xform jl jh k", jl, jh, k,
99 "vr h l", vec_r[jh], vec_r[jl],
100 "vi h l", vec_i[jh], vec_i[jl])
101 print (" cr k", cos_r[k], "si k", sin_i[k])
102 tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
103 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
104 vec_r[jh] = vec_r[jl] - tpre
105 vec_i[jh] = vec_i[jl] - tpim
106 vec_r[jl] += tpre
107 vec_i[jl] += tpim
108
109 print (" xform jl jh k", jl, jh, k,
110 "\n vr h l", vec_r[jh], vec_r[jl],
111 "\n vi h l", vec_i[jh], vec_i[jl])
112 k += tablestep
113 size *= 2
114
115 return vec_r, vec_i
116
117
118 class FFTTestCase(FHDLTestCase):
119
120 def _check_regs(self, sim, expected):
121 for i in range(32):
122 self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
123
124 def test_sv_remap_fpmadds_fft(self):
125 """>>> lst = ["svshape 8, 1, 1, 1, 0",
126 "svremap 31, 1, 0, 2, 0, 1",
127 "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
128 ]
129 runs a full in-place O(N log2 N) butterfly schedule for
130 Discrete Fourier Transform.
131
132 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
133 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
134
135 there is the *option* to target a different location (non-in-place)
136 just in case.
137
138 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
139 (3 inputs, 2 outputs)
140 """
141 lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
142 "svremap 31, 1, 0, 2, 0, 1",
143 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
144 ])
145 lst = list(lst)
146
147 # array and coefficients to test
148 av = [7.0, -9.8, 3.0, -32.3,
149 -2.0, 5.0, -9.8, 31.3] # array 0..7
150 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
151
152 # store in regfile
153 fprs = [0] * 32
154 for i, c in enumerate(coe):
155 fprs[i+8] = fp64toselectable(c)
156 for i, a in enumerate(av):
157 fprs[i+0] = fp64toselectable(a)
158
159 with Program(lst, bigendian=False) as program:
160 sim = self.run_tst_program(program, initial_fprs=fprs)
161 print ("spr svshape0", sim.spr['SVSHAPE0'])
162 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
163 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
164 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
165 print ("spr svshape1", sim.spr['SVSHAPE1'])
166 print ("spr svshape2", sim.spr['SVSHAPE2'])
167 print ("spr svshape3", sim.spr['SVSHAPE3'])
168
169 # work out the results with the twin mul/add-sub
170 res = transform_radix2(av, coe)
171
172 for i, expected in enumerate(res):
173 print ("i", i, float(sim.fpr(i)), "expected", expected)
174 for i, expected in enumerate(res):
175 # convert to Power single
176 expected = DOUBLE2SINGLE(fp64toselectable(expected))
177 expected = float(expected)
178 actual = float(sim.fpr(i))
179 # approximate error calculation, good enough test
180 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
181 # and the rounding is different
182 err = abs(actual - expected) / expected
183 self.assertTrue(err < 1e-7)
184
185 def test_sv_remap_fpmadds_fft_svstep(self):
186 """>>> lst = SVP64Asm( [
187 "svshape 8, 1, 1, 1, 1",
188 "svremap 31, 1, 0, 2, 0, 1",
189 "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
190 "setvl. 0, 0, 0, 1, 0, 0",
191 "bc 4, 2, -16"
192 ])
193 runs a full in-place O(N log2 N) butterfly schedule for
194 Discrete Fourier Transform. this version however uses
195 SVP64 "Vertical-First" Mode and so needs an explicit
196 branch, testing CR0.
197
198 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
199 (3 inputs, 2 outputs)
200 """
201 lst = SVP64Asm( [
202 "svshape 8, 1, 1, 1, 1",
203 "svremap 31, 1, 0, 2, 0, 1",
204 "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
205 "setvl. 0, 0, 0, 1, 0, 0",
206 "bc 4, 2, -16"
207 ])
208 lst = list(lst)
209
210 # array and coefficients to test
211 av = [7.0, -9.8, 3.0, -32.3,
212 -2.0, 5.0, -9.8, 31.3] # array 0..7
213 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
214
215 # store in regfile
216 fprs = [0] * 32
217 for i, c in enumerate(coe):
218 fprs[i+8] = fp64toselectable(c)
219 for i, a in enumerate(av):
220 fprs[i+0] = fp64toselectable(a)
221
222 # set total. err don't know how to calculate how many there are...
223 # do it manually for now
224 VL = 0
225 size = 2
226 n = len(av)
227 while size <= n:
228 halfsize = size // 2
229 tablestep = n // size
230 for i in range(0, n, size):
231 for j in range(i, i + halfsize):
232 VL += 1
233 size *= 2
234
235 # SVSTATE (calculated VL)
236 svstate = SVP64State()
237 svstate.vl[0:7] = VL # VL
238 svstate.maxvl[0:7] = VL # MAXVL
239 print ("SVSTATE", bin(svstate.spr.asint()))
240
241 with Program(lst, bigendian=False) as program:
242 sim = self.run_tst_program(program, svstate=svstate,
243 initial_fprs=fprs)
244 print ("spr svshape0", sim.spr['SVSHAPE0'])
245 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
246 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
247 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
248 print ("spr svshape1", sim.spr['SVSHAPE1'])
249 print ("spr svshape2", sim.spr['SVSHAPE2'])
250 print ("spr svshape3", sim.spr['SVSHAPE3'])
251
252 # work out the results with the twin mul/add-sub
253 res = transform_radix2(av, coe)
254
255 for i, expected in enumerate(res):
256 print ("i", i, float(sim.fpr(i)), "expected", expected)
257 for i, expected in enumerate(res):
258 # convert to Power single
259 expected = DOUBLE2SINGLE(fp64toselectable(expected))
260 expected = float(expected)
261 actual = float(sim.fpr(i))
262 # approximate error calculation, good enough test
263 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
264 # and the rounding is different
265 err = abs(actual - expected) / expected
266 self.assertTrue(err < 1e-7)
267
268 def test_sv_fpmadds_fft(self):
269 """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
270 ]
271 four in-place vector mul-adds, four in-place vector mul-subs
272
273 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
274 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
275
276 there is the *option* to target a different location (non-in-place)
277 just in case.
278
279 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
280 FRS to perform the two multiplies. one add, one subtract.
281
282 sv.ffmadds FRT, FRA, FRC, FRB actually does:
283 fmadds FRT , FRA, FRC, FRA
284 fnmsubs FRT+vl, FRA, FRC, FRB+vl
285 """
286 lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
287 ])
288 lst = list(lst)
289
290 fprs = [0] * 32
291 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
292 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
293 coe = [-1.0, 4.0, 3.1, 6.2] # coefficients
294 res = []
295 # work out the results with the twin mul/add-sub
296 for i, (a, b, c) in enumerate(zip(av, bv, coe)):
297 fprs[i+2] = fp64toselectable(a)
298 fprs[i+6] = fp64toselectable(b)
299 fprs[i+10] = fp64toselectable(c)
300 mul = a * c
301 t = b + mul
302 u = b - mul
303 t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
304 u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
305 res.append((t, u))
306 print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
307
308 # SVSTATE (in this case, VL=2)
309 svstate = SVP64State()
310 svstate.vl[0:7] = 4 # VL
311 svstate.maxvl[0:7] = 4 # MAXVL
312 print ("SVSTATE", bin(svstate.spr.asint()))
313
314 with Program(lst, bigendian=False) as program:
315 sim = self.run_tst_program(program, svstate=svstate,
316 initial_fprs=fprs)
317 # confirm that the results are as expected
318 for i, (t, u) in enumerate(res):
319 self.assertEqual(sim.fpr(i+2), t)
320 self.assertEqual(sim.fpr(i+6), u)
321
322 def test_sv_ffadds_fft(self):
323 """>>> lst = ["sv.ffadds 2.v, 2.v, 2.v"
324 ]
325 four in-place vector adds, four in-place vector subs
326
327 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
328 FRS to perform the two multiplies. one add, one subtract.
329
330 sv.ffadds FRT, FRA, FRB actually does:
331 fadds FRT , FRB, FRA
332 fsubs FRT+vl, FRA, FRB+vl
333 """
334 lst = SVP64Asm(["sv.ffadds 2.v, 2.v, 2.v"
335 ])
336 lst = list(lst)
337
338 fprs = [0] * 32
339 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
340 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
341 res = []
342 # work out the results with the twin add-sub
343 for i, (a, b) in enumerate(zip(av, bv)):
344 fprs[i+2] = fp64toselectable(a)
345 fprs[i+6] = fp64toselectable(b)
346 t = b + a
347 u = b - a
348 t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
349 u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
350 res.append((t, u))
351 print ("FFT", i, "in", a, b, "res", t, u)
352
353 # SVSTATE (in this case, VL=2)
354 svstate = SVP64State()
355 svstate.vl[0:7] = 4 # VL
356 svstate.maxvl[0:7] = 4 # MAXVL
357 print ("SVSTATE", bin(svstate.spr.asint()))
358
359 with Program(lst, bigendian=False) as program:
360 sim = self.run_tst_program(program, svstate=svstate,
361 initial_fprs=fprs)
362 # confirm that the results are as expected
363 for i, (t, u) in enumerate(res):
364 a = float(sim.fpr(i+2))
365 b = float(sim.fpr(i+6))
366 t = float(t)
367 u = float(u)
368 print ("FFT", i, "in", a, b, "res", t, u)
369 for i, (t, u) in enumerate(res):
370 self.assertEqual(sim.fpr(i+2), t)
371 self.assertEqual(sim.fpr(i+6), u)
372
373 def tst_sv_remap_fpmadds_fft_svstep_complex(self):
374 """
375 runs a full in-place O(N log2 N) butterfly schedule for
376 Discrete Fourier Transform. this version however uses
377 SVP64 "Vertical-First" Mode and so needs an explicit
378 branch, testing CR0.
379
380 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
381 (3 inputs, 2 outputs)
382
383 complex calculation:
384
385 tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
386 vec_r[jh] = vec_r[jl] - tpre
387 vec_r[jl] += tpre
388
389 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
390 vec_i[jh] = vec_i[jl] - tpim
391 vec_i[jl] += tpim
392
393 temp1 = vec[jh] * exptable[k]
394 temp2 = vec[jl]
395 vec[jh] = temp2 - temp1
396 vec[jl] = temp2 + temp1
397 """
398 lst = SVP64Asm( ["setvl 0, 0, 11, 1, 1, 1",
399 "svshape 8, 1, 1, 1, 1",
400 # tpre
401 "svremap 31, 1, 0, 2, 0, 1",
402 "sv.fmuls 24, 0.v, 16.v",
403 "svremap 31, 1, 0, 2, 0, 1",
404 "sv.fmuls 25, 8.v, 20.v",
405 "fadds 24, 24, 25",
406 # tpim
407 "svremap 31, 1, 0, 2, 0, 1",
408 "sv.fmuls 26, 0.v, 20.v",
409 "svremap 31, 1, 0, 2, 0, 1",
410 "sv.fmuls 26, 8.v, 16.v",
411 "fsubs 26, 26, 27",
412 # vec_r jh/jl
413 "svremap 31, 1, 0, 2, 0, 1",
414 "sv.ffadds 0.v, 24, 25",
415 # vec_i jh/jl
416 "svremap 31, 1, 0, 2, 0, 1",
417 "sv.ffadds 8.v, 26, 27",
418
419 # svstep loop
420 "setvl. 0, 0, 0, 1, 0, 0",
421 "bc 4, 2, -16"
422 ])
423 lst = list(lst)
424
425 # array and coefficients to test
426 ar = [7.0, -9.8, 3.0, -32.3,
427 -2.0, 5.0, -9.8, 31.3] # array 0..7 real
428 ai = [1.0, -1.8, 3.0, 19.3,
429 4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
430 coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
431 coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
432
433 # store in regfile
434 fprs = [0] * 64
435 for i, a in enumerate(ar):
436 fprs[i+0] = fp64toselectable(a)
437 for i, a in enumerate(ai):
438 fprs[i+8] = fp64toselectable(a)
439 for i, c in enumerate(coer):
440 fprs[i+16] = fp64toselectable(c)
441 for i, c in enumerate(coei):
442 fprs[i+20] = fp64toselectable(c)
443
444 # set total. err don't know how to calculate how many there are...
445 # do it manually for now
446 VL = 0
447 size = 2
448 n = len(ar)
449 while size <= n:
450 halfsize = size // 2
451 tablestep = n // size
452 for i in range(0, n, size):
453 for j in range(i, i + halfsize):
454 VL += 1
455 size *= 2
456
457 # SVSTATE (calculated VL)
458 svstate = SVP64State()
459 svstate.vl[0:7] = VL # VL
460 svstate.maxvl[0:7] = VL # MAXVL
461 print ("SVSTATE", bin(svstate.spr.asint()))
462
463 with Program(lst, bigendian=False) as program:
464 sim = self.run_tst_program(program, svstate=svstate,
465 initial_fprs=fprs)
466 print ("spr svshape0", sim.spr['SVSHAPE0'])
467 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
468 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
469 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
470 print ("spr svshape1", sim.spr['SVSHAPE1'])
471 print ("spr svshape2", sim.spr['SVSHAPE2'])
472 print ("spr svshape3", sim.spr['SVSHAPE3'])
473
474 # work out the results with the twin mul/add-sub, explicit
475 # complex numbers
476 res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
477
478 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
479 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
480 "expected_r", expected_r,
481 "expected_i", expected_i)
482 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
483 # convert to Power single
484 expected_r = DOUBLE2SINGLE(fp64toselectable(expected_r ))
485 expected_r = float(expected_r)
486 actual_r = float(sim.fpr(i))
487 # approximate error calculation, good enough test
488 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
489 # and the rounding is different
490 err = abs(actual_r - expected_r ) / expected_r
491 self.assertTrue(err < 1e-7)
492 # convert to Power single
493 expected_i = DOUBLE2SINGLE(fp64toselectable(expected_i ))
494 expected_i = float(expected_i)
495 actual_i = float(sim.fpr(i+8))
496 # approximate error calculation, good enough test
497 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
498 # and the rounding is different
499 err = abs(actual_i - expected_i ) / expected_i
500 self.assertTrue(err < 1e-7)
501
502 def test_sv_ffadds_fft_scalar(self):
503 """>>> lst = ["sv.ffadds 2.v, 12, 13"
504 ]
505 four in-place vector adds and subs, but done with a scalar
506 pair (fp12, fp13)
507 """
508 lst = SVP64Asm(["sv.ffadds 2.v, 12, 13"
509 ])
510 lst = list(lst)
511
512 fprs = [0] * 32
513 scalar_a = 1.3
514 scalar_b = -2.0
515 fprs[12] = fp64toselectable(scalar_a)
516 fprs[13] = fp64toselectable(scalar_b)
517 res = []
518 # work out the results with the twin add-sub
519 for i in range(4):
520 t = scalar_b + scalar_a
521 u = scalar_b - scalar_a
522 t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
523 u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
524 res.append((t, u))
525 print ("FFT", i, "res", t, u)
526
527 # SVSTATE (in this case, VL=2)
528 svstate = SVP64State()
529 svstate.vl[0:7] = 4 # VL
530 svstate.maxvl[0:7] = 4 # MAXVL
531 print ("SVSTATE", bin(svstate.spr.asint()))
532
533 with Program(lst, bigendian=False) as program:
534 sim = self.run_tst_program(program, svstate=svstate,
535 initial_fprs=fprs)
536 # confirm that the results are as expected
537 for i, (t, u) in enumerate(res):
538 a = float(sim.fpr(i+2))
539 b = float(sim.fpr(i+6))
540 t = float(t)
541 u = float(u)
542 print ("FFT", i, "in", a, b, "res", t, u)
543 for i, (t, u) in enumerate(res):
544 self.assertEqual(sim.fpr(i+2), t)
545 self.assertEqual(sim.fpr(i+6), u)
546
547 def run_tst_program(self, prog, initial_regs=None,
548 svstate=None,
549 initial_mem=None,
550 initial_fprs=None):
551 if initial_regs is None:
552 initial_regs = [0] * 32
553 simulator = run_tst(prog, initial_regs, mem=initial_mem,
554 initial_fprs=initial_fprs,
555 svstate=svstate)
556
557 print ("GPRs")
558 simulator.gpr.dump()
559 print ("FPRs")
560 simulator.fpr.dump()
561
562 return simulator
563
564
565 if __name__ == "__main__":
566 unittest.main()