working RADIX-2 FFT with bit-reversed LD/ST
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_fft.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.power_decoder import (create_pdecode)
6 from openpower.simulator.program import Program
7 from openpower.decoder.isa.caller import SVP64State
8 from openpower.decoder.selectable_int import SelectableInt
9 from openpower.decoder.isa.test_caller import run_tst
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from copy import deepcopy
12 from openpower.decoder.helpers import fp64toselectable, SINGLE
13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
14
15
16 def transform_radix2(vec, exptable, reverse=False):
17 """
18 # FFT and convolution test (Python), based on Project Nayuki
19 #
20 # Copyright (c) 2020 Project Nayuki. (MIT License)
21 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
22
23 """
24 # bits of the integer 'val'.
25 def reverse_bits(val, width):
26 result = 0
27 for _ in range(width):
28 result = (result << 1) | (val & 1)
29 val >>= 1
30 return result
31
32 # Initialization
33 n = len(vec)
34 levels = n.bit_length() - 1
35
36 # Copy with bit-reversed permutation
37 if reverse:
38 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
39
40 size = 2
41 while size <= n:
42 halfsize = size // 2
43 tablestep = n // size
44 for i in range(0, n, size):
45 k = 0
46 for j in range(i, i + halfsize):
47 # exact same actual computation, just embedded in
48 # triple-nested for-loops
49 jl, jh = j, j+halfsize
50 vjh = vec[jh]
51 temp1 = vec[jh] * exptable[k]
52 temp2 = vec[jl]
53 vec[jh] = temp2 - temp1
54 vec[jl] = temp2 + temp1
55 print ("xform jl jh k", jl, jh, k,
56 "vj vjh ek", temp2, vjh, exptable[k],
57 "t1, t2", temp1, temp2,
58 "v[jh] v[jl]", vec[jh], vec[jl])
59 k += tablestep
60 size *= 2
61
62 return vec
63
64
65 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
66 """
67 # FFT and convolution test (Python), based on Project Nayuki
68 #
69 # Copyright (c) 2020 Project Nayuki. (MIT License)
70 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
71
72 """
73 # bits of the integer 'val'.
74 def reverse_bits(val, width):
75 result = 0
76 for _ in range(width):
77 result = (result << 1) | (val & 1)
78 val >>= 1
79 return result
80
81 # Initialization
82 n = len(vec_r)
83 levels = n.bit_length() - 1
84
85 # Copy with bit-reversed permutation
86 if reverse:
87 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
88
89 size = 2
90 while size <= n:
91 halfsize = size // 2
92 tablestep = n // size
93 for i in range(0, n, size):
94 k = 0
95 for j in range(i, i + halfsize):
96 # exact same actual computation, just embedded in
97 # triple-nested for-loops
98 jl, jh = j, j+halfsize
99
100 print ("xform jl jh k", jl, jh, k,
101 "vr h l", vec_r[jh], vec_r[jl],
102 "vi h l", vec_i[jh], vec_i[jl])
103 print (" cr k", cos_r[k], "si k", sin_i[k])
104 mul1_r = vec_r[jh] * cos_r[k]
105 mul2_r = vec_i[jh] * sin_i[k]
106 tpre = mul1_r + mul2_r
107 print (" vec_r[jh] * cos_r[k]", mul1_r)
108 print (" vec_i[jh] * sin_i[k]", mul2_r)
109 print (" tpre", tpre)
110 mul1_i = vec_r[jh] * sin_i[k]
111 mul2_i = vec_i[jh] * cos_r[k]
112 tpim = -mul1_i + mul2_i
113 print (" vec_r[jh] * sin_i[k]", mul1_i)
114 print (" vec_i[jh] * cos_r[k]", mul2_i)
115 print (" tpim", tpim)
116 vec_r[jh] = vec_r[jl] - tpre
117 vec_i[jh] = vec_i[jl] - tpim
118 vec_r[jl] += tpre
119 vec_i[jl] += tpim
120
121 print (" xform jl jh k", jl, jh, k,
122 "\n vr h l", vec_r[jh], vec_r[jl],
123 "\n vi h l", vec_i[jh], vec_i[jl])
124 k += tablestep
125 size *= 2
126
127 return vec_r, vec_i
128
129
130 class FFTTestCase(FHDLTestCase):
131
132 def _check_regs(self, sim, expected):
133 for i in range(32):
134 self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
135
136 def test_sv_remap_fpmadds_fft(self):
137 """>>> lst = ["svshape 8, 1, 1, 1, 0",
138 "svremap 31, 1, 0, 2, 0, 1, 0",
139 "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
140 ]
141 runs a full in-place O(N log2 N) butterfly schedule for
142 Discrete Fourier Transform.
143
144 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
145 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
146
147 there is the *option* to target a different location (non-in-place)
148 just in case.
149
150 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
151 (3 inputs, 2 outputs)
152 """
153 lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
154 "svremap 31, 1, 0, 2, 0, 1, 0",
155 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
156 ])
157 lst = list(lst)
158
159 # array and coefficients to test
160 av = [7.0, -9.8, 3.0, -32.3,
161 -2.0, 5.0, -9.8, 31.3] # array 0..7
162 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
163
164 # store in regfile
165 fprs = [0] * 32
166 for i, c in enumerate(coe):
167 fprs[i+8] = fp64toselectable(c)
168 for i, a in enumerate(av):
169 fprs[i+0] = fp64toselectable(a)
170
171 with Program(lst, bigendian=False) as program:
172 sim = self.run_tst_program(program, initial_fprs=fprs)
173 print ("spr svshape0", sim.spr['SVSHAPE0'])
174 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
175 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
176 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
177 print ("spr svshape1", sim.spr['SVSHAPE1'])
178 print ("spr svshape2", sim.spr['SVSHAPE2'])
179 print ("spr svshape3", sim.spr['SVSHAPE3'])
180
181 # work out the results with the twin mul/add-sub
182 res = transform_radix2(av, coe)
183
184 for i, expected in enumerate(res):
185 print ("i", i, float(sim.fpr(i)), "expected", expected)
186 for i, expected in enumerate(res):
187 # convert to Power single
188 expected = DOUBLE2SINGLE(fp64toselectable(expected))
189 expected = float(expected)
190 actual = float(sim.fpr(i))
191 # approximate error calculation, good enough test
192 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
193 # and the rounding is different
194 err = abs(actual - expected) / expected
195 self.assertTrue(err < 1e-7)
196
197 def test_sv_remap_fpmadds_fft_svstep(self):
198 """>>> lst = SVP64Asm( [
199 "svshape 8, 1, 1, 1, 1",
200 "svremap 31, 1, 0, 2, 0, 1, 0",
201 "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
202 "setvl. 0, 0, 1, 1, 0, 0",
203 "bc 4, 2, -16"
204 ])
205 runs a full in-place O(N log2 N) butterfly schedule for
206 Discrete Fourier Transform. this version however uses
207 SVP64 "Vertical-First" Mode and so needs an explicit
208 branch, testing CR0.
209
210 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
211 (3 inputs, 2 outputs)
212 """
213 lst = SVP64Asm( [
214 "svshape 8, 1, 1, 1, 1",
215 "svremap 31, 1, 0, 2, 0, 1, 0",
216 "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
217 "setvl. 0, 0, 1, 1, 0, 0",
218 "bc 4, 2, -16"
219 ])
220 lst = list(lst)
221
222 # array and coefficients to test
223 av = [7.0, -9.8, 3.0, -32.3,
224 -2.0, 5.0, -9.8, 31.3] # array 0..7
225 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
226
227 # store in regfile
228 fprs = [0] * 32
229 for i, c in enumerate(coe):
230 fprs[i+8] = fp64toselectable(c)
231 for i, a in enumerate(av):
232 fprs[i+0] = fp64toselectable(a)
233
234 # set total. err don't know how to calculate how many there are...
235 # do it manually for now
236 VL = 0
237 size = 2
238 n = len(av)
239 while size <= n:
240 halfsize = size // 2
241 tablestep = n // size
242 for i in range(0, n, size):
243 for j in range(i, i + halfsize):
244 VL += 1
245 size *= 2
246
247 # SVSTATE (calculated VL)
248 svstate = SVP64State()
249 svstate.vl = VL # VL
250 svstate.maxvl = VL # MAXVL
251 print ("SVSTATE", bin(svstate.asint()))
252
253 with Program(lst, bigendian=False) as program:
254 sim = self.run_tst_program(program, svstate=svstate,
255 initial_fprs=fprs)
256 print ("spr svshape0", sim.spr['SVSHAPE0'])
257 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
258 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
259 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
260 print ("spr svshape1", sim.spr['SVSHAPE1'])
261 print ("spr svshape2", sim.spr['SVSHAPE2'])
262 print ("spr svshape3", sim.spr['SVSHAPE3'])
263
264 # work out the results with the twin mul/add-sub
265 res = transform_radix2(av, coe)
266
267 for i, expected in enumerate(res):
268 print ("i", i, float(sim.fpr(i)), "expected", expected)
269 for i, expected in enumerate(res):
270 # convert to Power single
271 expected = DOUBLE2SINGLE(fp64toselectable(expected))
272 expected = float(expected)
273 actual = float(sim.fpr(i))
274 # approximate error calculation, good enough test
275 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
276 # and the rounding is different
277 err = abs(actual - expected) / expected
278 self.assertTrue(err < 1e-7)
279
280 def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
281 """>>> lst = SVP64Asm( [
282 "svshape 8, 1, 1, 1, 1",
283 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
284 "svremap 5, 1, 0, 2, 0, 0, 1",
285 "sv.fmuls 24, 0.v, 8.v",
286 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
287 "svremap 26, 0, 0, 0, 0, 1, 1",
288 "sv.ffadds 0.v, 24, 0.v",
289 "setvl. 0, 0, 1, 1, 0, 0",
290 "bc 4, 2, -28"
291 ])
292
293 runs a full in-place O(N log2 N) butterfly schedule for
294 Discrete Fourier Transform. also uses "Vertical First"
295 but also uses temporary scalars and ffadds rather than
296 sv.ffmadds.
297
298 this represents an incremental step towards complex FFT
299
300 SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
301
302 * single fmuls FRT, FRA, FRC
303 * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
304 (FRS is implicit / hidden in ff* operations)
305
306 multiply: # sv.fmuls FRT, FRA, FRC
307 temp1 = vec[jh] * exptable[k]
308 temp2 = vec[jl]
309 twin-add: # sv.ffadds FRT(/FRS), FRA, FRB
310 vec[jh] = temp2 - temp1
311 vec[jl] = temp2 + temp1
312
313 also see notes in complex fft test: here svremap is done in
314 "non-persistent" mode (as a demo) whereas in the complex fft
315 svremap is used in "persistent" mode, where by a complete
316 coincidence the REMAP arguments all happen to line up and
317 only one persistent svremap is needed. the exact same trick
318 *could* be applied here but for illustrative purposes it is not.
319 """
320 lst = SVP64Asm( [
321 "svshape 8, 1, 1, 1, 1",
322 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
323 "svremap 5, 1, 0, 2, 0, 0, 0",
324 "sv.fmuls 24, 0.v, 8.v",
325 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
326 "svremap 26, 0, 0, 0, 0, 1, 0",
327 "sv.ffadds 0.v, 24, 0.v",
328 "setvl. 0, 0, 1, 1, 0, 0",
329 "bc 4, 2, -28"
330 ])
331 lst = list(lst)
332
333 # array and coefficients to test
334 av = [7.0, -9.8, 3.0, -32.3,
335 -2.0, 5.0, -9.8, 31.3] # array 0..7
336 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
337
338 # store in regfile
339 fprs = [0] * 32
340 for i, c in enumerate(coe):
341 fprs[i+8] = fp64toselectable(c)
342 for i, a in enumerate(av):
343 fprs[i+0] = fp64toselectable(a)
344
345 # set total. err don't know how to calculate how many there are...
346 # do it manually for now
347 VL = 0
348 size = 2
349 n = len(av)
350 while size <= n:
351 halfsize = size // 2
352 tablestep = n // size
353 for i in range(0, n, size):
354 for j in range(i, i + halfsize):
355 VL += 1
356 size *= 2
357
358 # SVSTATE (calculated VL)
359 svstate = SVP64State()
360 svstate.vl = VL # VL
361 svstate.maxvl = VL # MAXVL
362 print ("SVSTATE", bin(svstate.asint()))
363
364 with Program(lst, bigendian=False) as program:
365 sim = self.run_tst_program(program, svstate=svstate,
366 initial_fprs=fprs)
367 print ("spr svshape0", sim.spr['SVSHAPE0'])
368 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
369 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
370 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
371 print ("spr svshape1", sim.spr['SVSHAPE1'])
372 print ("spr svshape2", sim.spr['SVSHAPE2'])
373 print ("spr svshape3", sim.spr['SVSHAPE3'])
374
375 # work out the results with the twin mul/add-sub
376 res = transform_radix2(av, coe)
377
378 for i, expected in enumerate(res):
379 print ("i", i, float(sim.fpr(i)), "expected", expected)
380 for i, expected in enumerate(res):
381 # convert to Power single
382 expected = DOUBLE2SINGLE(fp64toselectable(expected))
383 expected = float(expected)
384 actual = float(sim.fpr(i))
385 # approximate error calculation, good enough test
386 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
387 # and the rounding is different
388 err = abs(actual - expected) / expected
389 self.assertTrue(err < 1e-7)
390
391 def test_sv_fpmadds_fft(self):
392 """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
393 ]
394 four in-place vector mul-adds, four in-place vector mul-subs
395
396 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
397 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
398
399 there is the *option* to target a different location (non-in-place)
400 just in case.
401
402 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
403 FRS to perform the two multiplies. one add, one subtract.
404
405 sv.ffmadds FRT, FRA, FRC, FRB actually does:
406 fmadds FRT , FRA, FRC, FRA
407 fnmsubs FRT+vl, FRA, FRC, FRB+vl
408
409 """
410 lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
411 ])
412 lst = list(lst)
413
414 fprs = [0] * 32
415 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
416 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
417 coe = [-1.0, 4.0, 3.1, 6.2] # coefficients
418 res = []
419 # work out the results with the twin mul/add-sub
420 for i, (a, b, c) in enumerate(zip(av, bv, coe)):
421 fprs[i+2] = fp64toselectable(a)
422 fprs[i+6] = fp64toselectable(b)
423 fprs[i+10] = fp64toselectable(c)
424 mul = a * c
425 t = b + mul
426 u = b - mul
427 t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
428 u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
429 res.append((t, u))
430 print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
431
432 # SVSTATE (in this case, VL=2)
433 svstate = SVP64State()
434 svstate.vl = 4 # VL
435 svstate.maxvl = 4 # MAXVL
436 print ("SVSTATE", bin(svstate.asint()))
437
438 with Program(lst, bigendian=False) as program:
439 sim = self.run_tst_program(program, svstate=svstate,
440 initial_fprs=fprs)
441 # confirm that the results are as expected
442 for i, (t, u) in enumerate(res):
443 self.assertEqual(sim.fpr(i+2), t)
444 self.assertEqual(sim.fpr(i+6), u)
445
446 def test_sv_ffadds_fft(self):
447 """>>> lst = ["sv.ffadds 2.v, 2.v, 2.v"
448 ]
449 four in-place vector adds, four in-place vector subs
450
451 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
452 FRS to perform the two multiplies. one add, one subtract.
453
454 sv.ffadds FRT, FRA, FRB actually does:
455 fadds FRT , FRB, FRA
456 fsubs FRT+vl, FRA, FRB+vl
457 """
458 lst = SVP64Asm(["sv.ffadds 2.v, 2.v, 2.v"
459 ])
460 lst = list(lst)
461
462 fprs = [0] * 32
463 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
464 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
465 res = []
466 # work out the results with the twin add-sub
467 for i, (a, b) in enumerate(zip(av, bv)):
468 fprs[i+2] = fp64toselectable(a)
469 fprs[i+6] = fp64toselectable(b)
470 t = b + a
471 u = b - a
472 t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
473 u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
474 res.append((t, u))
475 print ("FFT", i, "in", a, b, "res", t, u)
476
477 # SVSTATE (in this case, VL=2)
478 svstate = SVP64State()
479 svstate.vl = 4 # VL
480 svstate.maxvl = 4 # MAXVL
481 print ("SVSTATE", bin(svstate.asint()))
482
483 with Program(lst, bigendian=False) as program:
484 sim = self.run_tst_program(program, svstate=svstate,
485 initial_fprs=fprs)
486 # confirm that the results are as expected
487 for i, (t, u) in enumerate(res):
488 a = float(sim.fpr(i+2))
489 b = float(sim.fpr(i+6))
490 t = float(t)
491 u = float(u)
492 print ("FFT", i, "in", a, b, "res", t, u)
493 for i, (t, u) in enumerate(res):
494 self.assertEqual(sim.fpr(i+2), t)
495 self.assertEqual(sim.fpr(i+6), u)
496
497 def test_sv_remap_fpmadds_fft_svstep_complex(self):
498 """
499 runs a full in-place O(N log2 N) butterfly schedule for
500 Discrete Fourier Transform. this version however uses
501 SVP64 "Vertical-First" Mode and so needs an explicit
502 branch, testing CR0.
503
504 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
505 (3 inputs, 2 outputs)
506
507 complex calculation (FFT):
508
509 tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
510 vec_r[jh] = vec_r[jl] - tpre
511 vec_r[jl] += tpre
512
513 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
514 vec_i[jh] = vec_i[jl] - tpim
515 vec_i[jl] += tpim
516
517 real-only calculation (DFT):
518
519 temp1 = vec[jh] * exptable[k]
520 temp2 = vec[jl]
521 vec[jh] = temp2 - temp1
522 vec[jl] = temp2 + temp1
523
524 note: a rather nice convenience / coincidence. the meaning of
525 these two instructions is:
526 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
527 "svremap 5, 1, 0, 2, 0, 0, 1",
528 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
529 "svremap 26, 0, 0, 0, 0, 1, 1",
530
531 however it turns out that they can be *merged*, and for
532 the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
533 *ignore* their REMAPs (by definition), and for the second
534 one (sv.ffads) exactly the right REMAPs are also ignored!
535
536 "svremap 31, 1, 0, 2, 0, 1, 1",
537 """
538 lst = SVP64Asm( [
539 # set triple butterfly mode with persistent "REMAP"
540 "svshape 8, 1, 1, 1, 1",
541 "svremap 31, 1, 0, 2, 0, 1, 1",
542 # tpre
543 "sv.fmuls 24, 0.v, 16.v", # mul1_r = r*cos_r
544 "sv.fmadds 24, 8.v, 20.v, 24", # mul2_r = i*sin_i
545 # tpre = mul1_r + mul2_r
546 # tpim
547 "sv.fmuls 26, 0.v, 20.v", # mul1_i = r*sin_i
548 "sv.fmsubs 26, 8.v, 16.v, 26", # mul2_i = i*cos_r
549 # tpim = mul2_i - mul1_i
550 # vec_r jh/jl
551 "sv.ffadds 0.v, 24, 0.v", # vh/vl +/- tpre
552 # vec_i jh/jl
553 "sv.ffadds 8.v, 26, 8.v", # vh/vl +- tpim
554
555 # svstep loop
556 "setvl. 0, 0, 1, 1, 0, 0",
557 "bc 4, 2, -56"
558 ])
559 lst = list(lst)
560
561 # array and coefficients to test
562 ar = [7.0, -9.8, 3.0, -32.3,
563 -2.0, 5.0, -9.8, 31.3] # array 0..7 real
564 ai = [1.0, -1.8, 3.0, 19.3,
565 4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
566 coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
567 coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
568
569 # store in regfile
570 fprs = [0] * 64
571 for i, a in enumerate(ar):
572 fprs[i+0] = fp64toselectable(a)
573 for i, a in enumerate(ai):
574 fprs[i+8] = fp64toselectable(a)
575 for i, cr in enumerate(coer):
576 fprs[i+16] = fp64toselectable(cr)
577 for i, ci in enumerate(coei):
578 fprs[i+20] = fp64toselectable(ci)
579
580 # set total. err don't know how to calculate how many there are...
581 # do it manually for now
582 VL = 0
583 size = 2
584 n = len(ar)
585 while size <= n:
586 halfsize = size // 2
587 tablestep = n // size
588 for i in range(0, n, size):
589 for j in range(i, i + halfsize):
590 VL += 1
591 size *= 2
592
593 # SVSTATE (calculated VL)
594 svstate = SVP64State()
595 svstate.vl = VL # VL
596 svstate.maxvl = VL # MAXVL
597 print ("SVSTATE", bin(svstate.asint()))
598
599 with Program(lst, bigendian=False) as program:
600 sim = self.run_tst_program(program, svstate=svstate,
601 initial_fprs=fprs)
602 print ("spr svshape0", sim.spr['SVSHAPE0'])
603 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
604 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
605 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
606 print ("spr svshape1", sim.spr['SVSHAPE1'])
607 print ("spr svshape2", sim.spr['SVSHAPE2'])
608 print ("spr svshape3", sim.spr['SVSHAPE3'])
609
610 # work out the results with the twin mul/add-sub, explicit
611 # complex numbers
612 res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
613
614 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
615 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
616 "expected_r", expected_r,
617 "expected_i", expected_i)
618 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
619 # convert to Power single
620 expected_r = DOUBLE2SINGLE(fp64toselectable(expected_r ))
621 expected_r = float(expected_r)
622 actual_r = float(sim.fpr(i))
623 # approximate error calculation, good enough test
624 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
625 # and the rounding is different
626 err = abs(actual_r - expected_r ) / expected_r
627 self.assertTrue(err < 1e-6)
628 # convert to Power single
629 expected_i = DOUBLE2SINGLE(fp64toselectable(expected_i ))
630 expected_i = float(expected_i)
631 actual_i = float(sim.fpr(i+8))
632 # approximate error calculation, good enough test
633 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
634 # and the rounding is different
635 err = abs(actual_i - expected_i ) / expected_i
636 self.assertTrue(err < 1e-6)
637
638 def test_sv_ffadds_fft_scalar(self):
639 """>>> lst = ["sv.ffadds 2.v, 12, 13"
640 ]
641 four in-place vector adds and subs, but done with a scalar
642 pair (fp12, fp13)
643 """
644 lst = SVP64Asm(["sv.ffadds 2.v, 12, 13"
645 ])
646 lst = list(lst)
647
648 fprs = [0] * 32
649 scalar_a = 1.3
650 scalar_b = -2.0
651 fprs[12] = fp64toselectable(scalar_a)
652 fprs[13] = fp64toselectable(scalar_b)
653 res = []
654 # work out the results with the twin add-sub
655 for i in range(4):
656 t = scalar_b + scalar_a
657 u = scalar_b - scalar_a
658 t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
659 u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
660 res.append((t, u))
661 print ("FFT", i, "res", t, u)
662
663 # SVSTATE (in this case, VL=2)
664 svstate = SVP64State()
665 svstate.vl = 4 # VL
666 svstate.maxvl = 4 # MAXVL
667 print ("SVSTATE", bin(svstate.asint()))
668
669 with Program(lst, bigendian=False) as program:
670 sim = self.run_tst_program(program, svstate=svstate,
671 initial_fprs=fprs)
672 # confirm that the results are as expected
673 for i, (t, u) in enumerate(res):
674 a = float(sim.fpr(i+2))
675 b = float(sim.fpr(i+6))
676 t = float(t)
677 u = float(u)
678 print ("FFT", i, "in", a, b, "res", t, u)
679 for i, (t, u) in enumerate(res):
680 self.assertEqual(sim.fpr(i+2), t)
681 self.assertEqual(sim.fpr(i+6), u)
682
683 def test_sv_remap_fpmadds_fft_ldst(self):
684 """>>> lst = ["svshape 8, 1, 1, 1, 0",
685 "svremap 31, 1, 0, 2, 0, 1, 0",
686 "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
687 ]
688 runs a full in-place O(N log2 N) butterfly schedule for
689 Discrete Fourier Transform, using bit-reversed LD/ST
690 """
691 lst = SVP64Asm( ["setvl 0, 0, 8, 0, 1, 1",
692 "sv.lfsbr 0.v, 4(0), 20", # bit-reversed
693 "svshape 8, 1, 1, 1, 0",
694 "svremap 31, 1, 0, 2, 0, 1, 0",
695 "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
696 ])
697 lst = list(lst)
698
699 # array and coefficients to test
700 av = [7.0, -9.8, 3.0, -32.3,
701 -2.0, 5.0, -9.8, 31.3] # array 0..7
702 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
703
704 # store in regfile
705 fprs = [0] * 32
706 for i, c in enumerate(coe):
707 fprs[i+8] = fp64toselectable(c)
708 # store in memory
709 mem = {}
710 val = 0
711 for i, a in enumerate(av):
712 a = SINGLE(fp64toselectable(a)).value
713 shift = (i % 2) == 1
714 if shift == 0:
715 val = a
716 else:
717 mem[(i//2)*8] = val | (a << 32)
718
719 with Program(lst, bigendian=False) as program:
720 sim = self.run_tst_program(program, initial_mem=mem,
721 initial_fprs=fprs)
722 print ("spr svshape0", sim.spr['SVSHAPE0'])
723 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
724 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
725 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
726 print ("spr svshape1", sim.spr['SVSHAPE1'])
727 print ("spr svshape2", sim.spr['SVSHAPE2'])
728 print ("spr svshape3", sim.spr['SVSHAPE3'])
729
730 print ("mem dump")
731 print (sim.mem.dump())
732
733 # work out the results with the twin mul/add-sub
734 res = transform_radix2(av, coe, reverse=True)
735
736 for i, expected in enumerate(res):
737 print ("i", i, float(sim.fpr(i)), "expected", expected)
738 for i, expected in enumerate(res):
739 # convert to Power single
740 expected = DOUBLE2SINGLE(fp64toselectable(expected))
741 expected = float(expected)
742 actual = float(sim.fpr(i))
743 # approximate error calculation, good enough test
744 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
745 # and the rounding is different
746 err = abs(actual - expected) / expected
747 self.assertTrue(err < 1e-6)
748
749 def run_tst_program(self, prog, initial_regs=None,
750 svstate=None,
751 initial_mem=None,
752 initial_fprs=None):
753 if initial_regs is None:
754 initial_regs = [0] * 32
755 simulator = run_tst(prog, initial_regs, mem=initial_mem,
756 initial_fprs=initial_fprs,
757 svstate=svstate)
758
759 print ("GPRs")
760 simulator.gpr.dump()
761 print ("FPRs")
762 simulator.fpr.dump()
763
764 return simulator
765
766
767 if __name__ == "__main__":
768 unittest.main()