convert test_caller_svp64_fft.py to new vector numbering convention
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_fft.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.power_decoder import (create_pdecode)
6 from openpower.simulator.program import Program
7 from openpower.decoder.isa.caller import SVP64State
8 from openpower.decoder.selectable_int import SelectableInt
9 from openpower.decoder.isa.test_caller import run_tst
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from copy import deepcopy
12 from openpower.decoder.helpers import fp64toselectable, SINGLE
13 from openpower.decoder.isafunctions.double2single import ISACallerFnHelper
14
15 # really bad hack. need to access the DOUBLE2SINGLE function auto-generated
16 # from pseudo-code.
17 fph = ISACallerFnHelper(XLEN=64)
18
19
20 def transform_radix2(vec, exptable, reverse=False):
21 """
22 # FFT and convolution test (Python), based on Project Nayuki
23 #
24 # Copyright (c) 2020 Project Nayuki. (MIT License)
25 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
26
27 """
28 # bits of the integer 'val'.
29 def reverse_bits(val, width):
30 result = 0
31 for _ in range(width):
32 result = (result << 1) | (val & 1)
33 val >>= 1
34 return result
35
36 # Initialization
37 n = len(vec)
38 levels = n.bit_length() - 1
39
40 # Copy with bit-reversed permutation
41 if reverse:
42 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
43
44 size = 2
45 while size <= n:
46 halfsize = size // 2
47 tablestep = n // size
48 for i in range(0, n, size):
49 k = 0
50 for j in range(i, i + halfsize):
51 # exact same actual computation, just embedded in
52 # triple-nested for-loops
53 jl, jh = j, j+halfsize
54 vjh = vec[jh]
55 temp1 = vec[jh] * exptable[k]
56 temp2 = vec[jl]
57 vec[jh] = temp2 - temp1
58 vec[jl] = temp2 + temp1
59 print ("xform jl jh k", jl, jh, k,
60 "vj vjh ek", temp2, vjh, exptable[k],
61 "t1, t2", temp1, temp2,
62 "v[jh] v[jl]", vec[jh], vec[jl])
63 k += tablestep
64 size *= 2
65
66 return vec
67
68
69 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
70 """
71 # FFT and convolution test (Python), based on Project Nayuki
72 #
73 # Copyright (c) 2020 Project Nayuki. (MIT License)
74 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
75
76 """
77 # bits of the integer 'val'.
78 def reverse_bits(val, width):
79 result = 0
80 for _ in range(width):
81 result = (result << 1) | (val & 1)
82 val >>= 1
83 return result
84
85 # Initialization
86 n = len(vec_r)
87 levels = n.bit_length() - 1
88
89 # Copy with bit-reversed permutation
90 if reverse:
91 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
92
93 size = 2
94 while size <= n:
95 halfsize = size // 2
96 tablestep = n // size
97 for i in range(0, n, size):
98 k = 0
99 for j in range(i, i + halfsize):
100 # exact same actual computation, just embedded in
101 # triple-nested for-loops
102 jl, jh = j, j+halfsize
103
104 print ("xform jl jh k", jl, jh, k,
105 "vr h l", vec_r[jh], vec_r[jl],
106 "vi h l", vec_i[jh], vec_i[jl])
107 print (" cr k", cos_r[k], "si k", sin_i[k])
108 mul1_r = vec_r[jh] * cos_r[k]
109 mul2_r = vec_i[jh] * sin_i[k]
110 tpre = mul1_r + mul2_r
111 print (" vec_r[jh] * cos_r[k]", mul1_r)
112 print (" vec_i[jh] * sin_i[k]", mul2_r)
113 print (" tpre", tpre)
114 mul1_i = vec_r[jh] * sin_i[k]
115 mul2_i = vec_i[jh] * cos_r[k]
116 tpim = -mul1_i + mul2_i
117 print (" vec_r[jh] * sin_i[k]", mul1_i)
118 print (" vec_i[jh] * cos_r[k]", mul2_i)
119 print (" tpim", tpim)
120 vec_r[jh] = vec_r[jl] - tpre
121 vec_i[jh] = vec_i[jl] - tpim
122 vec_r[jl] += tpre
123 vec_i[jl] += tpim
124
125 print (" xform jl jh k", jl, jh, k,
126 "\n vr h l", vec_r[jh], vec_r[jl],
127 "\n vi h l", vec_i[jh], vec_i[jl])
128 k += tablestep
129 size *= 2
130
131 return vec_r, vec_i
132
133
134 class FFTTestCase(FHDLTestCase):
135
136 def _check_regs(self, sim, expected):
137 for i in range(32):
138 self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
139
140 def test_sv_remap_fpmadds_fft(self):
141 """>>> lst = ["svshape 8, 1, 1, 1, 0",
142 "svremap 31, 1, 0, 2, 0, 1, 0",
143 "sv.ffmadds *2, *2, *2, *10"
144 ]
145 runs a full in-place O(N log2 N) butterfly schedule for
146 Discrete Fourier Transform.
147
148 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
149 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
150
151 there is the *option* to target a different location (non-in-place)
152 just in case.
153
154 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
155 (3 inputs, 2 outputs)
156 """
157 lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
158 "svremap 31, 1, 0, 2, 0, 1, 0",
159 "sv.ffmadds *0, *0, *0, *8"
160 ])
161 lst = list(lst)
162
163 # array and coefficients to test
164 av = [7.0, -9.8, 3.0, -32.3,
165 -2.0, 5.0, -9.8, 31.3] # array 0..7
166 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
167
168 # store in regfile
169 fprs = [0] * 32
170 for i, c in enumerate(coe):
171 fprs[i+8] = fp64toselectable(c)
172 for i, a in enumerate(av):
173 fprs[i+0] = fp64toselectable(a)
174
175 with Program(lst, bigendian=False) as program:
176 sim = self.run_tst_program(program, initial_fprs=fprs)
177 print ("spr svshape0", sim.spr['SVSHAPE0'])
178 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
179 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
180 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
181 print ("spr svshape1", sim.spr['SVSHAPE1'])
182 print ("spr svshape2", sim.spr['SVSHAPE2'])
183 print ("spr svshape3", sim.spr['SVSHAPE3'])
184
185 # work out the results with the twin mul/add-sub
186 res = transform_radix2(av, coe)
187
188 for i, expected in enumerate(res):
189 print ("i", i, float(sim.fpr(i)), "expected", expected)
190 for i, expected in enumerate(res):
191 # convert to Power single
192 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
193 expected = float(expected)
194 actual = float(sim.fpr(i))
195 # approximate error calculation, good enough test
196 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
197 # and the rounding is different
198 err = abs(actual - expected) / expected
199 self.assertTrue(err < 1e-7)
200
201 def test_sv_remap_fpmadds_fft_svstep(self):
202 """>>> lst = SVP64Asm( [
203 "svshape 8, 1, 1, 1, 1",
204 "svremap 31, 1, 0, 2, 0, 1, 0",
205 "sv.ffmadds *0, *0, *0, *8",
206 "setvl. 0, 0, 1, 1, 0, 0",
207 "bc 6, 3, -16"
208 ])
209 runs a full in-place O(N log2 N) butterfly schedule for
210 Discrete Fourier Transform. this version however uses
211 SVP64 "Vertical-First" Mode and so needs an explicit
212 branch, testing CR0.
213
214 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
215 (3 inputs, 2 outputs)
216 """
217 lst = SVP64Asm( [
218 "svshape 8, 1, 1, 1, 1",
219 "svremap 31, 1, 0, 2, 0, 1, 0",
220 "sv.ffmadds *0, *0, *0, *8",
221 "setvl. 0, 0, 1, 1, 0, 0",
222 "bc 6, 3, -16"
223 ])
224 lst = list(lst)
225
226 # array and coefficients to test
227 av = [7.0, -9.8, 3.0, -32.3,
228 -2.0, 5.0, -9.8, 31.3] # array 0..7
229 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
230
231 # store in regfile
232 fprs = [0] * 32
233 for i, c in enumerate(coe):
234 fprs[i+8] = fp64toselectable(c)
235 for i, a in enumerate(av):
236 fprs[i+0] = fp64toselectable(a)
237
238 # set total. err don't know how to calculate how many there are...
239 # do it manually for now
240 VL = 0
241 size = 2
242 n = len(av)
243 while size <= n:
244 halfsize = size // 2
245 tablestep = n // size
246 for i in range(0, n, size):
247 for j in range(i, i + halfsize):
248 VL += 1
249 size *= 2
250
251 # SVSTATE (calculated VL)
252 svstate = SVP64State()
253 svstate.vl = VL # VL
254 svstate.maxvl = VL # MAXVL
255 print ("SVSTATE", bin(svstate.asint()))
256
257 with Program(lst, bigendian=False) as program:
258 sim = self.run_tst_program(program, svstate=svstate,
259 initial_fprs=fprs)
260 print ("spr svshape0", sim.spr['SVSHAPE0'])
261 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
262 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
263 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
264 print ("spr svshape1", sim.spr['SVSHAPE1'])
265 print ("spr svshape2", sim.spr['SVSHAPE2'])
266 print ("spr svshape3", sim.spr['SVSHAPE3'])
267
268 # work out the results with the twin mul/add-sub
269 res = transform_radix2(av, coe)
270
271 for i, expected in enumerate(res):
272 print ("i", i, float(sim.fpr(i)), "expected", expected)
273 for i, expected in enumerate(res):
274 # convert to Power single
275 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
276 expected = float(expected)
277 actual = float(sim.fpr(i))
278 # approximate error calculation, good enough test
279 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
280 # and the rounding is different
281 err = abs(actual - expected) / expected
282 self.assertTrue(err < 1e-7)
283
284 def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
285 """>>> lst = SVP64Asm( [
286 "svshape 8, 1, 1, 1, 1",
287 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
288 "svremap 5, 1, 0, 2, 0, 0, 1",
289 "sv.fmuls 24, *0, *8",
290 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
291 "svremap 26, 0, 0, 0, 0, 1, 1",
292 "sv.ffadds *0, 24, *0",
293 "setvl. 0, 0, 1, 1, 0, 0",
294 "bc 6, 3, -28"
295 ])
296
297 runs a full in-place O(N log2 N) butterfly schedule for
298 Discrete Fourier Transform. also uses "Vertical First"
299 but also uses temporary scalars and ffadds rather than
300 sv.ffmadds.
301
302 this represents an incremental step towards complex FFT
303
304 SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
305
306 * single fmuls FRT, FRA, FRC
307 * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
308 (FRS is implicit / hidden in ff* operations)
309
310 multiply: # sv.fmuls FRT, FRA, FRC
311 temp1 = vec[jh] * exptable[k]
312 temp2 = vec[jl]
313 twin-add: # sv.ffadds FRT(/FRS), FRA, FRB
314 vec[jh] = temp2 - temp1
315 vec[jl] = temp2 + temp1
316
317 also see notes in complex fft test: here svremap is done in
318 "non-persistent" mode (as a demo) whereas in the complex fft
319 svremap is used in "persistent" mode, where by a complete
320 coincidence the REMAP arguments all happen to line up and
321 only one persistent svremap is needed. the exact same trick
322 *could* be applied here but for illustrative purposes it is not.
323 """
324 lst = SVP64Asm( [
325 "svshape 8, 1, 1, 1, 1",
326 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
327 "svremap 5, 1, 0, 2, 0, 0, 0",
328 "sv.fmuls 24, *0, *8",
329 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
330 "svremap 26, 0, 0, 0, 0, 1, 0",
331 "sv.ffadds *0, 24, *0",
332 "setvl. 0, 0, 1, 1, 0, 0",
333 "bc 6, 3, -28"
334 ])
335 lst = list(lst)
336
337 # array and coefficients to test
338 av = [7.0, -9.8, 3.0, -32.3,
339 -2.0, 5.0, -9.8, 31.3] # array 0..7
340 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
341
342 # store in regfile
343 fprs = [0] * 32
344 for i, c in enumerate(coe):
345 fprs[i+8] = fp64toselectable(c)
346 for i, a in enumerate(av):
347 fprs[i+0] = fp64toselectable(a)
348
349 # set total. err don't know how to calculate how many there are...
350 # do it manually for now
351 VL = 0
352 size = 2
353 n = len(av)
354 while size <= n:
355 halfsize = size // 2
356 tablestep = n // size
357 for i in range(0, n, size):
358 for j in range(i, i + halfsize):
359 VL += 1
360 size *= 2
361
362 # SVSTATE (calculated VL)
363 svstate = SVP64State()
364 svstate.vl = VL # VL
365 svstate.maxvl = VL # MAXVL
366 print ("SVSTATE", bin(svstate.asint()))
367
368 with Program(lst, bigendian=False) as program:
369 sim = self.run_tst_program(program, svstate=svstate,
370 initial_fprs=fprs)
371 print ("spr svshape0", sim.spr['SVSHAPE0'])
372 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
373 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
374 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
375 print ("spr svshape1", sim.spr['SVSHAPE1'])
376 print ("spr svshape2", sim.spr['SVSHAPE2'])
377 print ("spr svshape3", sim.spr['SVSHAPE3'])
378
379 # work out the results with the twin mul/add-sub
380 res = transform_radix2(av, coe)
381
382 for i, expected in enumerate(res):
383 print ("i", i, float(sim.fpr(i)), "expected", expected)
384 for i, expected in enumerate(res):
385 # convert to Power single
386 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
387 expected = float(expected)
388 actual = float(sim.fpr(i))
389 # approximate error calculation, good enough test
390 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
391 # and the rounding is different
392 err = abs(actual - expected) / expected
393 self.assertTrue(err < 1e-7)
394
395 def test_sv_fpmadds_fft(self):
396 """>>> lst = ["sv.ffmadds *2, *2, *2, *10"
397 ]
398 four in-place vector mul-adds, four in-place vector mul-subs
399
400 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
401 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
402
403 there is the *option* to target a different location (non-in-place)
404 just in case.
405
406 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
407 FRS to perform the two multiplies. one add, one subtract.
408
409 sv.ffmadds FRT, FRA, FRC, FRB actually does:
410 fmadds FRT , FRA, FRC, FRA
411 fnmsubs FRT+vl, FRA, FRC, FRB+vl
412
413 """
414 lst = SVP64Asm(["sv.ffmadds *2, *2, *2, *10"
415 ])
416 lst = list(lst)
417
418 fprs = [0] * 32
419 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
420 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
421 coe = [-1.0, 4.0, 3.1, 6.2] # coefficients
422 res = []
423 # work out the results with the twin mul/add-sub
424 for i, (a, b, c) in enumerate(zip(av, bv, coe)):
425 fprs[i+2] = fp64toselectable(a)
426 fprs[i+6] = fp64toselectable(b)
427 fprs[i+10] = fp64toselectable(c)
428 mul = a * c
429 t = b + mul
430 u = b - mul
431 t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
432 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
433 res.append((t, u))
434 print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
435
436 # SVSTATE (in this case, VL=2)
437 svstate = SVP64State()
438 svstate.vl = 4 # VL
439 svstate.maxvl = 4 # MAXVL
440 print ("SVSTATE", bin(svstate.asint()))
441
442 with Program(lst, bigendian=False) as program:
443 sim = self.run_tst_program(program, svstate=svstate,
444 initial_fprs=fprs)
445 # confirm that the results are as expected
446 for i, (t, u) in enumerate(res):
447 self.assertEqual(sim.fpr(i+2), t)
448 self.assertEqual(sim.fpr(i+6), u)
449
450 def test_sv_ffadds_fft(self):
451 """>>> lst = ["sv.ffadds *2, *2, *2"
452 ]
453 four in-place vector adds, four in-place vector subs
454
455 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
456 FRS to perform the two multiplies. one add, one subtract.
457
458 sv.ffadds FRT, FRA, FRB actually does:
459 fadds FRT , FRB, FRA
460 fsubs FRT+vl, FRA, FRB+vl
461 """
462 lst = SVP64Asm(["sv.ffadds *2, *2, *2"
463 ])
464 lst = list(lst)
465
466 fprs = [0] * 32
467 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
468 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
469 res = []
470 # work out the results with the twin add-sub
471 for i, (a, b) in enumerate(zip(av, bv)):
472 fprs[i+2] = fp64toselectable(a)
473 fprs[i+6] = fp64toselectable(b)
474 t = b + a
475 u = b - a
476 t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
477 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
478 res.append((t, u))
479 print ("FFT", i, "in", a, b, "res", t, u)
480
481 # SVSTATE (in this case, VL=2)
482 svstate = SVP64State()
483 svstate.vl = 4 # VL
484 svstate.maxvl = 4 # MAXVL
485 print ("SVSTATE", bin(svstate.asint()))
486
487 with Program(lst, bigendian=False) as program:
488 sim = self.run_tst_program(program, svstate=svstate,
489 initial_fprs=fprs)
490 # confirm that the results are as expected
491 for i, (t, u) in enumerate(res):
492 a = float(sim.fpr(i+2))
493 b = float(sim.fpr(i+6))
494 t = float(t)
495 u = float(u)
496 print ("FFT", i, "in", a, b, "res", t, u)
497 for i, (t, u) in enumerate(res):
498 self.assertEqual(sim.fpr(i+2), t)
499 self.assertEqual(sim.fpr(i+6), u)
500
501 def test_sv_remap_fpmadds_fft_svstep_complex(self):
502 """
503 runs a full in-place O(N log2 N) butterfly schedule for
504 Discrete Fourier Transform. this version however uses
505 SVP64 "Vertical-First" Mode and so needs an explicit
506 branch, testing CR0.
507
508 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
509 (3 inputs, 2 outputs)
510
511 complex calculation (FFT):
512
513 tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
514 vec_r[jh] = vec_r[jl] - tpre
515 vec_r[jl] += tpre
516
517 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
518 vec_i[jh] = vec_i[jl] - tpim
519 vec_i[jl] += tpim
520
521 real-only calculation (DFT):
522
523 temp1 = vec[jh] * exptable[k]
524 temp2 = vec[jl]
525 vec[jh] = temp2 - temp1
526 vec[jl] = temp2 + temp1
527
528 note: a rather nice convenience / coincidence. the meaning of
529 these two instructions is:
530 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
531 "svremap 5, 1, 0, 2, 0, 0, 1",
532 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
533 "svremap 26, 0, 0, 0, 0, 1, 1",
534
535 however it turns out that they can be *merged*, and for
536 the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
537 *ignore* their REMAPs (by definition, because you can't REMAP
538 scalar operands), and for the second one (sv.ffads) exactly the
539 right REMAPs are also ignored!
540
541 therefore we can merge:
542 "svremap 5, 1, 0, 2, 0, 0, 1",
543 "svremap 26, 0, 0, 0, 0, 1, 1",
544 into:
545 "svremap 31, 1, 0, 2, 0, 1, 1",
546 and save one instruction.
547 """
548 lst = SVP64Asm( [
549 # set triple butterfly mode with persistent "REMAP"
550 "svshape 8, 1, 1, 1, 1",
551 "svremap 31, 1, 0, 2, 0, 1, 1",
552 # tpre
553 "sv.fmuls 24, *0, *16", # mul1_r = r*cos_r
554 "sv.fmadds 24, *8, *20, 24", # mul2_r = i*sin_i
555 # tpre = mul1_r + mul2_r
556 # tpim
557 "sv.fmuls 26, *0, *20", # mul1_i = r*sin_i
558 "sv.fmsubs 26, *8, *16, 26", # mul2_i = i*cos_r
559 # tpim = mul2_i - mul1_i
560 # vec_r jh/jl
561 "sv.ffadds *0, 24, *0", # vh/vl +/- tpre
562 # vec_i jh/jl
563 "sv.ffadds *8, 26, *8", # vh/vl +- tpim
564
565 # svstep loop
566 "setvl. 0, 0, 1, 1, 0, 0",
567 "bc 6, 3, -56"
568 ])
569 lst = list(lst)
570
571 # array and coefficients to test
572 ar = [7.0, -9.8, 3.0, -32.3,
573 -2.0, 5.0, -9.8, 31.3] # array 0..7 real
574 ai = [1.0, -1.8, 3.0, 19.3,
575 4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
576 coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
577 coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
578
579 # store in regfile
580 fprs = [0] * 64
581 for i, a in enumerate(ar):
582 fprs[i+0] = fp64toselectable(a)
583 for i, a in enumerate(ai):
584 fprs[i+8] = fp64toselectable(a)
585 for i, cr in enumerate(coer):
586 fprs[i+16] = fp64toselectable(cr)
587 for i, ci in enumerate(coei):
588 fprs[i+20] = fp64toselectable(ci)
589
590 # set total. err don't know how to calculate how many there are...
591 # do it manually for now
592 VL = 0
593 size = 2
594 n = len(ar)
595 while size <= n:
596 halfsize = size // 2
597 tablestep = n // size
598 for i in range(0, n, size):
599 for j in range(i, i + halfsize):
600 VL += 1
601 size *= 2
602
603 # SVSTATE (calculated VL)
604 svstate = SVP64State()
605 svstate.vl = VL # VL
606 svstate.maxvl = VL # MAXVL
607 print ("SVSTATE", bin(svstate.asint()))
608
609 with Program(lst, bigendian=False) as program:
610 sim = self.run_tst_program(program, svstate=svstate,
611 initial_fprs=fprs)
612 print ("spr svshape0", sim.spr['SVSHAPE0'])
613 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
614 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
615 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
616 print ("spr svshape1", sim.spr['SVSHAPE1'])
617 print ("spr svshape2", sim.spr['SVSHAPE2'])
618 print ("spr svshape3", sim.spr['SVSHAPE3'])
619
620 # work out the results with the twin mul/add-sub, explicit
621 # complex numbers
622 res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
623
624 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
625 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
626 "expected_r", expected_r,
627 "expected_i", expected_i)
628 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
629 # convert to Power single
630 expected_r = fph.DOUBLE2SINGLE(fp64toselectable(expected_r ))
631 expected_r = float(expected_r)
632 actual_r = float(sim.fpr(i))
633 # approximate error calculation, good enough test
634 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
635 # and the rounding is different
636 err = abs(actual_r - expected_r ) / expected_r
637 self.assertTrue(err < 1e-6)
638 # convert to Power single
639 expected_i = fph.DOUBLE2SINGLE(fp64toselectable(expected_i ))
640 expected_i = float(expected_i)
641 actual_i = float(sim.fpr(i+8))
642 # approximate error calculation, good enough test
643 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
644 # and the rounding is different
645 err = abs(actual_i - expected_i ) / expected_i
646 self.assertTrue(err < 1e-6)
647
648 def test_sv_ffadds_fft_scalar(self):
649 """>>> lst = ["sv.ffadds *2, 12, 13"
650 ]
651 four in-place vector adds and subs, but done with a scalar
652 pair (fp12, fp13)
653 """
654 lst = SVP64Asm(["sv.ffadds *2, 12, 13"
655 ])
656 lst = list(lst)
657
658 fprs = [0] * 32
659 scalar_a = 1.3
660 scalar_b = -2.0
661 fprs[12] = fp64toselectable(scalar_a)
662 fprs[13] = fp64toselectable(scalar_b)
663 res = []
664 # work out the results with the twin add-sub
665 for i in range(4):
666 t = scalar_b + scalar_a
667 u = scalar_b - scalar_a
668 t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
669 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
670 res.append((t, u))
671 print ("FFT", i, "res", t, u)
672
673 # SVSTATE (in this case, VL=2)
674 svstate = SVP64State()
675 svstate.vl = 4 # VL
676 svstate.maxvl = 4 # MAXVL
677 print ("SVSTATE", bin(svstate.asint()))
678
679 with Program(lst, bigendian=False) as program:
680 sim = self.run_tst_program(program, svstate=svstate,
681 initial_fprs=fprs)
682 # confirm that the results are as expected
683 for i, (t, u) in enumerate(res):
684 a = float(sim.fpr(i+2))
685 b = float(sim.fpr(i+6))
686 t = float(t)
687 u = float(u)
688 print ("FFT", i, "in", a, b, "res", t, u)
689 for i, (t, u) in enumerate(res):
690 self.assertEqual(sim.fpr(i+2), t)
691 self.assertEqual(sim.fpr(i+6), u)
692
693 def test_sv_remap_fpmadds_fft_ldst(self):
694 """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
695 "sv.lfssh *0, 4(0), 20", # bit-reversed
696 "svshape 8, 1, 1, 1, 0",
697 "svremap 31, 1, 0, 2, 0, 1, 0",
698 "sv.ffmadds *0, *0, *0, *8"
699
700 runs a full in-place O(N log2 N) butterfly schedule for
701 Discrete Fourier Transform, using bit-reversed LD/ST
702 """
703 lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0",
704 "svremap 1, 0, 0, 0, 0, 0, 0",
705 "sv.lfssh *0, 4(0), 20", # shifted
706 "svshape 8, 1, 1, 1, 0",
707 "svremap 31, 1, 0, 2, 0, 1, 0",
708 "sv.ffmadds *0, *0, *0, *8"
709 ])
710 lst = list(lst)
711
712 # array and coefficients to test
713 av = [7.0, -9.8, 3.0, -32.3,
714 -2.0, 5.0, -9.8, 31.3] # array 0..7
715 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
716
717 # store in regfile
718 fprs = [0] * 32
719 for i, c in enumerate(coe):
720 fprs[i+8] = fp64toselectable(c)
721 # store in memory
722 mem = {}
723 val = 0
724 for i, a in enumerate(av):
725 a = SINGLE(fp64toselectable(a)).value
726 shift = (i % 2) == 1
727 if shift == 0:
728 val = a
729 else:
730 mem[(i//2)*8] = val | (a << 32)
731
732 with Program(lst, bigendian=False) as program:
733 sim = self.run_tst_program(program, initial_mem=mem,
734 initial_fprs=fprs)
735 print ("spr svshape0", sim.spr['SVSHAPE0'])
736 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
737 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
738 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
739 print ("spr svshape1", sim.spr['SVSHAPE1'])
740 print ("spr svshape2", sim.spr['SVSHAPE2'])
741 print ("spr svshape3", sim.spr['SVSHAPE3'])
742
743 print ("mem dump")
744 print (sim.mem.dump())
745
746 # work out the results with the twin mul/add-sub,
747 # note bit-reverse mode requested
748 res = transform_radix2(av, coe, reverse=True)
749
750 for i, expected in enumerate(res):
751 print ("i", i, float(sim.fpr(i)), "expected", expected)
752 for i, expected in enumerate(res):
753 # convert to Power single
754 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
755 expected = float(expected)
756 actual = float(sim.fpr(i))
757 # approximate error calculation, good enough test
758 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
759 # and the rounding is different
760 err = abs(actual - expected) / expected
761 self.assertTrue(err < 1e-6)
762
763 def run_tst_program(self, prog, initial_regs=None,
764 svstate=None,
765 initial_mem=None,
766 initial_fprs=None):
767 if initial_regs is None:
768 initial_regs = [0] * 32
769 simulator = run_tst(prog, initial_regs, mem=initial_mem,
770 initial_fprs=initial_fprs,
771 svstate=svstate)
772
773 print ("GPRs")
774 simulator.gpr.dump()
775 print ("FPRs")
776 simulator.fpr.dump()
777
778 return simulator
779
780
781 if __name__ == "__main__":
782 unittest.main()