pysvp64db: fix traversal
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_fft.py
1 import unittest
2
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.helpers import SINGLE, fp64toselectable
5 from openpower.decoder.isa.caller import SVP64State
6 from openpower.decoder.isa.test_caller import run_tst
7 from openpower.decoder.isafunctions.double2single import (
8 ISACallerFnHelper_double2single)
9 from openpower.decoder.selectable_int import SelectableInt
10 from openpower.simulator.program import Program
11 from openpower.insndb.asm import SVP64Asm
12
13 # really bad hack. need to access the DOUBLE2SINGLE function auto-generated
14 # from pseudo-code.
15 fph = ISACallerFnHelper_double2single(XLEN=64, FPSCR=None)
16 fph.namespace = {'FPSCR': fph.FPSCR,
17 'NIA': None,
18 'XLEN': fph.XLEN,
19 'CIA': None,
20 'SVSTATE': None,
21 }
22
23
24 def transform_radix2(vec, exptable, reverse=False):
25 """
26 # FFT and convolution test (Python), based on Project Nayuki
27 #
28 # Copyright (c) 2020 Project Nayuki. (MIT License)
29 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
30
31 """
32 # bits of the integer 'val'.
33 def reverse_bits(val, width):
34 result = 0
35 for _ in range(width):
36 result = (result << 1) | (val & 1)
37 val >>= 1
38 return result
39
40 # Initialization
41 n = len(vec)
42 levels = n.bit_length() - 1
43
44 # Copy with bit-reversed permutation
45 if reverse:
46 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
47
48 size = 2
49 while size <= n:
50 halfsize = size // 2
51 tablestep = n // size
52 for i in range(0, n, size):
53 k = 0
54 for j in range(i, i + halfsize):
55 # exact same actual computation, just embedded in
56 # triple-nested for-loops
57 jl, jh = j, j+halfsize
58 vjh = vec[jh]
59 temp1 = vec[jh] * exptable[k]
60 temp2 = vec[jl]
61 vec[jh] = temp2 - temp1
62 vec[jl] = temp2 + temp1
63 print("xform jl jh k", jl, jh, k,
64 "vj vjh ek", temp2, vjh, exptable[k],
65 "t1, t2", temp1, temp2,
66 "v[jh] v[jl]", vec[jh], vec[jl])
67 k += tablestep
68 size *= 2
69
70 return vec
71
72
73 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
74 """
75 # FFT and convolution test (Python), based on Project Nayuki
76 #
77 # Copyright (c) 2020 Project Nayuki. (MIT License)
78 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
79
80 """
81 # bits of the integer 'val'.
82 def reverse_bits(val, width):
83 result = 0
84 for _ in range(width):
85 result = (result << 1) | (val & 1)
86 val >>= 1
87 return result
88
89 # Initialization
90 n = len(vec_r)
91 levels = n.bit_length() - 1
92
93 # Copy with bit-reversed permutation
94 if reverse:
95 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
96
97 size = 2
98 while size <= n:
99 halfsize = size // 2
100 tablestep = n // size
101 for i in range(0, n, size):
102 k = 0
103 for j in range(i, i + halfsize):
104 # exact same actual computation, just embedded in
105 # triple-nested for-loops
106 jl, jh = j, j+halfsize
107
108 print("xform jl jh k", jl, jh, k,
109 "vr h l", vec_r[jh], vec_r[jl],
110 "vi h l", vec_i[jh], vec_i[jl])
111 print(" cr k", cos_r[k], "si k", sin_i[k])
112 mul1_r = vec_r[jh] * cos_r[k]
113 mul2_r = vec_i[jh] * sin_i[k]
114 tpre = mul1_r + mul2_r
115 print(" vec_r[jh] * cos_r[k]", mul1_r)
116 print(" vec_i[jh] * sin_i[k]", mul2_r)
117 print(" tpre", tpre)
118 mul1_i = vec_r[jh] * sin_i[k]
119 mul2_i = vec_i[jh] * cos_r[k]
120 tpim = -mul1_i + mul2_i
121 print(" vec_r[jh] * sin_i[k]", mul1_i)
122 print(" vec_i[jh] * cos_r[k]", mul2_i)
123 print(" tpim", tpim)
124 vec_r[jh] = vec_r[jl] - tpre
125 vec_i[jh] = vec_i[jl] - tpim
126 vec_r[jl] += tpre
127 vec_i[jl] += tpim
128
129 print(" xform jl jh k", jl, jh, k,
130 "\n vr h l", vec_r[jh], vec_r[jl],
131 "\n vi h l", vec_i[jh], vec_i[jl])
132 k += tablestep
133 size *= 2
134
135 return vec_r, vec_i
136
137
138 class FFTTestCase(FHDLTestCase):
139
140 def _check_regs(self, sim, expected):
141 for i in range(32):
142 self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
143
144 def test_sv_remap_fpmadds_fft_4(self):
145 """>>> lst = ["svshape 2, 1, 1, 1, 0",
146 "svremap 31, 1, 0, 2, 0, 1, 0",
147 "sv.ffmadds. *2, *10, *2"
148 ]
149 this is a cheap (cheating) way to run a single "ffmadds." to
150 get at least Rc=1 on sv.ffmadds to be activated. the results
151 are not actually tested because there's no checking yet on
152 FP Rc=1
153 """
154 lst = SVP64Asm(["svshape 2, 1, 1, 1, 0",
155 "svremap 31, 1, 0, 2, 0, 1, 0",
156 "sv.ffmadds *0, *8, *0"
157 ])
158 lst = list(lst)
159
160 # array and coefficients to test
161 av = [7.0, -9.8] # array 0..1
162 coe = [3.1] # coefficients
163
164 # store in regfile
165 fprs = [0] * 32
166 for i, c in enumerate(coe):
167 fprs[i+8] = fp64toselectable(c)
168 for i, a in enumerate(av):
169 fprs[i+0] = fp64toselectable(a)
170
171 with Program(lst, bigendian=False) as program:
172 sim = self.run_tst_program(program, initial_fprs=fprs)
173 print("spr svshape0", sim.spr['SVSHAPE0'])
174 print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
175 print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
176 print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
177 print("spr svshape1", sim.spr['SVSHAPE1'])
178 print("spr svshape2", sim.spr['SVSHAPE2'])
179 print("spr svshape3", sim.spr['SVSHAPE3'])
180
181 # work out the results with the twin mul/add-sub
182 res = transform_radix2(av, coe)
183
184 for i, expected in enumerate(res):
185 print("i", i, float(sim.fpr(i)), "expected", expected)
186 for i, expected in enumerate(res):
187 # convert to Power single
188 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
189 expected = float(expected)
190 actual = float(sim.fpr(i))
191 # approximate error calculation, good enough test
192 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
193 # and the rounding is different
194 err = abs(actual - expected) / expected
195 self.assertTrue(err < 1e-7)
196
197 def test_sv_remap_fpmadds_fft(self):
198 """>>> lst = ["svshape 8, 1, 1, 1, 0",
199 "svremap 31, 1, 0, 2, 0, 1, 0",
200 "sv.ffmadds *2, *10, *2"
201 ]
202 runs a full in-place O(N log2 N) butterfly schedule for
203 Discrete Fourier Transform.
204
205 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
206 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
207
208 there is the *option* to target a different location (non-in-place)
209 just in case.
210
211 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
212 (3 inputs, 2 outputs)
213 """
214 lst = SVP64Asm(["svshape 8, 1, 1, 1, 0",
215 "svremap 31, 1, 0, 2, 0, 1, 0",
216 "sv.ffmadds *0, *8, *0"
217 ])
218 lst = list(lst)
219
220 # array and coefficients to test
221 av = [7.0, -9.8, 3.0, -32.3,
222 -2.0, 5.0, -9.8, 31.3] # array 0..7
223 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
224
225 # store in regfile
226 fprs = [0] * 32
227 for i, c in enumerate(coe):
228 fprs[i+8] = fp64toselectable(c)
229 for i, a in enumerate(av):
230 fprs[i+0] = fp64toselectable(a)
231
232 with Program(lst, bigendian=False) as program:
233 sim = self.run_tst_program(program, initial_fprs=fprs)
234 print("spr svshape0", sim.spr['SVSHAPE0'])
235 print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
236 print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
237 print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
238 print("spr svshape1", sim.spr['SVSHAPE1'])
239 print("spr svshape2", sim.spr['SVSHAPE2'])
240 print("spr svshape3", sim.spr['SVSHAPE3'])
241
242 # work out the results with the twin mul/add-sub
243 res = transform_radix2(av, coe)
244
245 for i, expected in enumerate(res):
246 print("i", i, float(sim.fpr(i)), "expected", expected)
247 for i, expected in enumerate(res):
248 # convert to Power single
249 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
250 expected = float(expected)
251 actual = float(sim.fpr(i))
252 # approximate error calculation, good enough test
253 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
254 # and the rounding is different
255 err = abs(actual - expected) / expected
256 self.assertTrue(err < 1e-7)
257
258 def test_sv_remap_fpmadds_fft_svstep(self):
259 """>>> lst = SVP64Asm( [
260 "svshape 8, 1, 1, 1, 1",
261 "svremap 31, 1, 0, 2, 0, 1, 0",
262 "sv.ffmadds *0, *8, *0",
263 "svstep. 12, 0, 1, 0",
264 "bc 6, 3, -16"
265 ])
266 runs a full in-place O(N log2 N) butterfly schedule for
267 Discrete Fourier Transform. this version however uses
268 SVP64 "Vertical-First" Mode and so needs an explicit
269 branch, testing CR0.
270
271 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
272 (3 inputs, 2 outputs)
273 """
274 lst = SVP64Asm([
275 "svshape 8, 1, 1, 1, 1",
276 "svremap 31, 1, 0, 2, 0, 1, 0",
277 "sv.ffmadds *0, *8, *0",
278 "svstep. 27, 0, 1, 0",
279 "bc 6, 3, -16"
280 ])
281 lst = list(lst)
282
283 # array and coefficients to test
284 av = [7.0, -9.8, 3.0, -32.3,
285 -2.0, 5.0, -9.8, 31.3] # array 0..7
286 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
287
288 # store in regfile
289 fprs = [0] * 32
290 for i, c in enumerate(coe):
291 fprs[i+8] = fp64toselectable(c)
292 for i, a in enumerate(av):
293 fprs[i+0] = fp64toselectable(a)
294
295 # set total. err don't know how to calculate how many there are...
296 # do it manually for now
297 VL = 0
298 size = 2
299 n = len(av)
300 while size <= n:
301 halfsize = size // 2
302 tablestep = n // size
303 for i in range(0, n, size):
304 for j in range(i, i + halfsize):
305 VL += 1
306 size *= 2
307
308 # SVSTATE (calculated VL)
309 svstate = SVP64State()
310 svstate.vl = VL # VL
311 svstate.maxvl = VL # MAXVL
312 print("SVSTATE", bin(svstate.asint()))
313
314 with Program(lst, bigendian=False) as program:
315 sim = self.run_tst_program(program, svstate=svstate,
316 initial_fprs=fprs)
317 print("spr svshape0", sim.spr['SVSHAPE0'])
318 print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
319 print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
320 print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
321 print("spr svshape1", sim.spr['SVSHAPE1'])
322 print("spr svshape2", sim.spr['SVSHAPE2'])
323 print("spr svshape3", sim.spr['SVSHAPE3'])
324
325 # work out the results with the twin mul/add-sub
326 res = transform_radix2(av, coe)
327
328 for i, expected in enumerate(res):
329 print("i", i, float(sim.fpr(i)), "expected", expected)
330 for i, expected in enumerate(res):
331 # convert to Power single
332 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
333 expected = float(expected)
334 actual = float(sim.fpr(i))
335 # approximate error calculation, good enough test
336 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
337 # and the rounding is different
338 err = abs(actual - expected) / expected
339 self.assertTrue(err < 1e-7)
340
341 def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
342 """>>> lst = SVP64Asm( [
343 "svshape 8, 1, 1, 1, 1",
344 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
345 "svremap 5, 1, 0, 2, 0, 0, 1",
346 "sv.fmuls 24, *0, *8",
347 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
348 "svremap 26, 0, 0, 0, 0, 1, 1",
349 "sv.ffadds *0, 24, *0",
350 "svstep. 27, 0, 1, 0",
351 "bc 6, 3, -28"
352 ])
353
354 runs a full in-place O(N log2 N) butterfly schedule for
355 Discrete Fourier Transform. also uses "Vertical First"
356 but also uses temporary scalars and ffadds rather than
357 sv.ffmadds.
358
359 this represents an incremental step towards complex FFT
360
361 SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
362
363 * single fmuls FRT, FRA, FRC
364 * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
365 (FRS is implicit / hidden in ff* operations)
366
367 multiply: # sv.fmuls FRT, FRA, FRC
368 temp1 = vec[jh] * exptable[k]
369 temp2 = vec[jl]
370 twin-add: # sv.ffadds FRT(/FRS), FRA, FRB
371 vec[jh] = temp2 - temp1
372 vec[jl] = temp2 + temp1
373
374 also see notes in complex fft test: here svremap is done in
375 "non-persistent" mode (as a demo) whereas in the complex fft
376 svremap is used in "persistent" mode, where by a complete
377 coincidence the REMAP arguments all happen to line up and
378 only one persistent svremap is needed. the exact same trick
379 *could* be applied here but for illustrative purposes it is not.
380 """
381 lst = SVP64Asm([
382 "svshape 8, 1, 1, 1, 1",
383 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
384 "svremap 5, 1, 0, 2, 0, 0, 0",
385 "sv.fmuls 24, *0, *8",
386 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
387 "svremap 26, 0, 0, 0, 0, 1, 0",
388 "sv.ffadds *0, 24, *0",
389 "svstep. 27, 0, 1, 0",
390 "bc 6, 3, -28"
391 ])
392 lst = list(lst)
393
394 # array and coefficients to test
395 av = [7.0, -9.8, 3.0, -32.3,
396 -2.0, 5.0, -9.8, 31.3] # array 0..7
397 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
398
399 # store in regfile
400 fprs = [0] * 32
401 for i, c in enumerate(coe):
402 fprs[i+8] = fp64toselectable(c)
403 for i, a in enumerate(av):
404 fprs[i+0] = fp64toselectable(a)
405
406 # set total. err don't know how to calculate how many there are...
407 # do it manually for now
408 VL = 0
409 size = 2
410 n = len(av)
411 while size <= n:
412 halfsize = size // 2
413 tablestep = n // size
414 for i in range(0, n, size):
415 for j in range(i, i + halfsize):
416 VL += 1
417 size *= 2
418
419 # SVSTATE (calculated VL)
420 svstate = SVP64State()
421 svstate.vl = VL # VL
422 svstate.maxvl = VL # MAXVL
423 print("SVSTATE", bin(svstate.asint()))
424
425 with Program(lst, bigendian=False) as program:
426 sim = self.run_tst_program(program, svstate=svstate,
427 initial_fprs=fprs)
428 print("spr svshape0", sim.spr['SVSHAPE0'])
429 print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
430 print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
431 print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
432 print("spr svshape1", sim.spr['SVSHAPE1'])
433 print("spr svshape2", sim.spr['SVSHAPE2'])
434 print("spr svshape3", sim.spr['SVSHAPE3'])
435
436 # work out the results with the twin mul/add-sub
437 res = transform_radix2(av, coe)
438
439 for i, expected in enumerate(res):
440 print("i", i, float(sim.fpr(i)), "expected", expected)
441 for i, expected in enumerate(res):
442 # convert to Power single
443 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
444 expected = float(expected)
445 actual = float(sim.fpr(i))
446 # approximate error calculation, good enough test
447 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
448 # and the rounding is different
449 err = abs(actual - expected) / expected
450 self.assertTrue(err < 1e-7)
451
452 def test_sv_fpmadds_fft(self):
453 """>>> lst = ["sv.ffmadds *2, *10, *2"
454 ]
455 four in-place vector mul-adds, four in-place vector mul-subs
456
457 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
458 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
459
460 there is the *option* to target a different location (non-in-place)
461 just in case.
462
463 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
464 FRS to perform the two multiplies. one add, one subtract.
465
466 sv.ffmadds FRT, FRA, FRC, FRB actually does:
467 fmadds FRT , FRA, FRC, FRA
468 fnmsubs FRT+vl, FRA, FRC, FRB+vl
469
470 """
471 lst = SVP64Asm(["sv.ffmadds *2, *10, *2"
472 ])
473 lst = list(lst)
474
475 fprs = [0] * 32
476 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
477 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
478 coe = [-1.0, 4.0, 3.1, 6.2] # coefficients
479 res = []
480 # work out the results with the twin mul/add-sub
481 for i, (a, b, c) in enumerate(zip(av, bv, coe)):
482 fprs[i+2] = fp64toselectable(a)
483 fprs[i+6] = fp64toselectable(b)
484 fprs[i+10] = fp64toselectable(c)
485 mul = a * c
486 t = b + mul
487 u = b - mul
488 # convert to Power single
489 t = fph.DOUBLE2SINGLE(fp64toselectable(t))
490 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
491 res.append((t, u))
492 print("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
493
494 # SVSTATE (in this case, VL=2)
495 svstate = SVP64State()
496 svstate.vl = 4 # VL
497 svstate.maxvl = 4 # MAXVL
498 print("SVSTATE", bin(svstate.asint()))
499
500 with Program(lst, bigendian=False) as program:
501 sim = self.run_tst_program(program, svstate=svstate,
502 initial_fprs=fprs)
503 # confirm that the results are as expected
504 for i, (t, u) in enumerate(res):
505 self.assertEqual(sim.fpr(i+2), t)
506 self.assertEqual(sim.fpr(i+6), u)
507
508 def test_sv_ffadds_fft(self):
509 """>>> lst = ["sv.ffadds *2, *2, *2"
510 ]
511 four in-place vector adds, four in-place vector subs
512
513 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
514 FRS to perform the two multiplies. one add, one subtract.
515
516 sv.ffadds FRT, FRA, FRB actually does:
517 fadds FRT , FRB, FRA
518 fsubs FRT+vl, FRA, FRB+vl
519 """
520 lst = SVP64Asm(["sv.ffadds *2, *2, *2"
521 ])
522 lst = list(lst)
523
524 fprs = [0] * 32
525 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
526 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
527 res = []
528 # work out the results with the twin add-sub
529 for i, (a, b) in enumerate(zip(av, bv)):
530 fprs[i+2] = fp64toselectable(a)
531 fprs[i+6] = fp64toselectable(b)
532 t = b + a
533 u = b - a
534 # convert to Power single
535 t = fph.DOUBLE2SINGLE(fp64toselectable(t))
536 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
537 res.append((t, u))
538 print("FFT", i, "in", a, b, "res", t, u)
539
540 # SVSTATE (in this case, VL=2)
541 svstate = SVP64State()
542 svstate.vl = 4 # VL
543 svstate.maxvl = 4 # MAXVL
544 print("SVSTATE", bin(svstate.asint()))
545
546 with Program(lst, bigendian=False) as program:
547 sim = self.run_tst_program(program, svstate=svstate,
548 initial_fprs=fprs)
549 # confirm that the results are as expected
550 for i, (t, u) in enumerate(res):
551 a = float(sim.fpr(i+2))
552 b = float(sim.fpr(i+6))
553 t = float(t)
554 u = float(u)
555 print("FFT", i, "in", a, b, "res", t, u)
556 for i, (t, u) in enumerate(res):
557 self.assertEqual(sim.fpr(i+2), t)
558 self.assertEqual(sim.fpr(i+6), u)
559
560 def test_sv_remap_fpmadds_fft_svstep_complex(self):
561 """
562 runs a full in-place O(N log2 N) butterfly schedule for
563 Discrete Fourier Transform. this version however uses
564 SVP64 "Vertical-First" Mode and so needs an explicit
565 branch, testing CR0.
566
567 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
568 (3 inputs, 2 outputs)
569
570 complex calculation (FFT):
571
572 tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
573 vec_r[jh] = vec_r[jl] - tpre
574 vec_r[jl] += tpre
575
576 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
577 vec_i[jh] = vec_i[jl] - tpim
578 vec_i[jl] += tpim
579
580 real-only calculation (DFT):
581
582 temp1 = vec[jh] * exptable[k]
583 temp2 = vec[jl]
584 vec[jh] = temp2 - temp1
585 vec[jl] = temp2 + temp1
586
587 note: a rather nice convenience / coincidence. the meaning of
588 these two instructions is:
589 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
590 "svremap 5, 1, 0, 2, 0, 0, 1",
591 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
592 "svremap 26, 0, 0, 0, 0, 1, 1",
593
594 however it turns out that they can be *merged*, and for
595 the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
596 *ignore* their REMAPs (by definition, because you can't REMAP
597 scalar operands), and for the second one (sv.ffads) exactly the
598 right REMAPs are also ignored!
599
600 therefore we can merge:
601 "svremap 5, 1, 0, 2, 0, 0, 1",
602 "svremap 26, 0, 0, 0, 0, 1, 1",
603 into:
604 "svremap 31, 1, 0, 2, 0, 1, 1",
605 and save one instruction.
606 """
607 lst = SVP64Asm([
608 # set triple butterfly mode with persistent "REMAP"
609 "svshape 8, 1, 1, 1, 1",
610 "svremap 31, 1, 0, 2, 0, 1, 1",
611 # tpre
612 "sv.fmuls 24, *0, *16", # mul1_r = r*cos_r
613 "sv.fmadds 24, *8, *20, 24", # mul2_r = i*sin_i
614 # tpre = mul1_r + mul2_r
615 # tpim
616 "sv.fmuls 26, *0, *20", # mul1_i = r*sin_i
617 "sv.fmsubs 26, *8, *16, 26", # mul2_i = i*cos_r
618 # tpim = mul2_i - mul1_i
619 # vec_r jh/jl
620 "sv.ffadds *0, 24, *0", # vh/vl +/- tpre
621 # vec_i jh/jl
622 "sv.ffadds *8, 26, *8", # vh/vl +- tpim
623
624 # svstep loop
625 "svstep. 27, 0, 1, 0",
626 "bc 6, 3, -56"
627 ])
628 lst = list(lst)
629
630 # array and coefficients to test
631 ar = [7.0, -9.8, 3.0, -32.3,
632 -2.0, 5.0, -9.8, 31.3] # array 0..7 real
633 ai = [1.0, -1.8, 3.0, 19.3,
634 4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
635 coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
636 coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
637
638 # store in regfile
639 fprs = [0] * 64
640 for i, a in enumerate(ar):
641 fprs[i+0] = fp64toselectable(a)
642 for i, a in enumerate(ai):
643 fprs[i+8] = fp64toselectable(a)
644 for i, cr in enumerate(coer):
645 fprs[i+16] = fp64toselectable(cr)
646 for i, ci in enumerate(coei):
647 fprs[i+20] = fp64toselectable(ci)
648
649 # set total. err don't know how to calculate how many there are...
650 # do it manually for now
651 VL = 0
652 size = 2
653 n = len(ar)
654 while size <= n:
655 halfsize = size // 2
656 tablestep = n // size
657 for i in range(0, n, size):
658 for j in range(i, i + halfsize):
659 VL += 1
660 size *= 2
661
662 # SVSTATE (calculated VL)
663 svstate = SVP64State()
664 svstate.vl = VL # VL
665 svstate.maxvl = VL # MAXVL
666 print("SVSTATE", bin(svstate.asint()))
667
668 with Program(lst, bigendian=False) as program:
669 sim = self.run_tst_program(program, svstate=svstate,
670 initial_fprs=fprs)
671 print("spr svshape0", sim.spr['SVSHAPE0'])
672 print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
673 print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
674 print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
675 print("spr svshape1", sim.spr['SVSHAPE1'])
676 print("spr svshape2", sim.spr['SVSHAPE2'])
677 print("spr svshape3", sim.spr['SVSHAPE3'])
678
679 # work out the results with the twin mul/add-sub, explicit
680 # complex numbers
681 res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
682
683 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
684 print("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
685 "expected_r", expected_r,
686 "expected_i", expected_i)
687 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
688 # convert to Power single
689 expected_r = fph.DOUBLE2SINGLE(fp64toselectable(expected_r))
690 expected_r = float(expected_r)
691 actual_r = float(sim.fpr(i))
692 # approximate error calculation, good enough test
693 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
694 # and the rounding is different
695 err = abs(actual_r - expected_r) / expected_r
696 self.assertTrue(err < 1e-6)
697 # convert to Power single
698 expected_i = fph.DOUBLE2SINGLE(fp64toselectable(expected_i))
699 expected_i = float(expected_i)
700 actual_i = float(sim.fpr(i+8))
701 # approximate error calculation, good enough test
702 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
703 # and the rounding is different
704 err = abs(actual_i - expected_i) / expected_i
705 self.assertTrue(err < 1e-6)
706
707 def test_sv_ffadds_fft_scalar(self):
708 """>>> lst = ["sv.ffadds *2, 12, 13"
709 ]
710 four in-place vector adds and subs, but done with a scalar
711 pair (fp12, fp13)
712 """
713 lst = SVP64Asm(["sv.ffadds *2, 12, 13"
714 ])
715 lst = list(lst)
716
717 fprs = [0] * 32
718 scalar_a = 1.3
719 scalar_b = -2.0
720 fprs[12] = fp64toselectable(scalar_a)
721 fprs[13] = fp64toselectable(scalar_b)
722 res = []
723 # work out the results with the twin add-sub
724 for i in range(4):
725 t = scalar_b + scalar_a
726 u = scalar_b - scalar_a
727 # convert to Power single
728 t = fph.DOUBLE2SINGLE(fp64toselectable(t))
729 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
730 res.append((t, u))
731 print("FFT", i, "res", t, u)
732
733 # SVSTATE (in this case, VL=2)
734 svstate = SVP64State()
735 svstate.vl = 4 # VL
736 svstate.maxvl = 4 # MAXVL
737 print("SVSTATE", bin(svstate.asint()))
738
739 with Program(lst, bigendian=False) as program:
740 sim = self.run_tst_program(program, svstate=svstate,
741 initial_fprs=fprs)
742 # confirm that the results are as expected
743 for i, (t, u) in enumerate(res):
744 a = float(sim.fpr(i+2))
745 b = float(sim.fpr(i+6))
746 t = float(t)
747 u = float(u)
748 print("FFT", i, "in", a, b, "res", t, u)
749 for i, (t, u) in enumerate(res):
750 self.assertEqual(sim.fpr(i+2), t)
751 self.assertEqual(sim.fpr(i+6), u)
752
753 def test_sv_remap_fpmadds_fft_ldst(self):
754 """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
755 "sv.lfs/els *0, 4(0)",
756 "svshape 8, 1, 1, 1, 0",
757 "svremap 31, 1, 0, 2, 0, 1, 0",
758 "sv.ffmadds *0, *8, *0"
759
760 runs a full in-place O(N log2 N) butterfly schedule for
761 Discrete Fourier Transform, using bit-reversed LD/ST
762 """
763 lst = SVP64Asm(["svshape 8, 1, 1, 15, 0",
764 "svremap 1, 0, 0, 0, 0, 0, 0",
765 "sv.lfs/els *0, 4(0)",
766 "svshape 8, 1, 1, 1, 0",
767 "svremap 31, 1, 0, 2, 0, 1, 0",
768 "sv.ffmadds *0, *8, *0"
769 ])
770 lst = list(lst)
771
772 # array and coefficients to test
773 av = [7.0, -9.8, 3.0, -32.3,
774 -2.0, 5.0, -9.8, 31.3] # array 0..7
775 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
776
777 # store in regfile
778 fprs = [0] * 32
779 for i, c in enumerate(coe):
780 fprs[i+8] = fp64toselectable(c)
781 # store in memory
782 mem = {}
783 val = 0
784 for i, a in enumerate(av):
785 a = SINGLE(fp64toselectable(a)).value
786 shift = (i % 2) == 1
787 if shift == 0:
788 val = a
789 else:
790 mem[(i//2)*8] = val | (a << 32)
791
792 with Program(lst, bigendian=False) as program:
793 sim = self.run_tst_program(program, initial_mem=mem,
794 initial_fprs=fprs)
795 print("spr svshape0", sim.spr['SVSHAPE0'])
796 print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
797 print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
798 print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
799 print("spr svshape1", sim.spr['SVSHAPE1'])
800 print("spr svshape2", sim.spr['SVSHAPE2'])
801 print("spr svshape3", sim.spr['SVSHAPE3'])
802
803 print("mem dump")
804 print(sim.mem.dump())
805
806 # work out the results with the twin mul/add-sub,
807 # note bit-reverse mode requested
808 res = transform_radix2(av, coe, reverse=True)
809
810 for i, expected in enumerate(res):
811 print("i", i, float(sim.fpr(i)), "expected", expected)
812 for i, expected in enumerate(res):
813 # convert to Power single
814 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
815 expected = float(expected)
816 actual = float(sim.fpr(i))
817 # approximate error calculation, good enough test
818 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
819 # and the rounding is different
820 err = abs(actual - expected) / expected
821 self.assertTrue(err < 1e-6)
822
823 def run_tst_program(self, prog, initial_regs=None,
824 svstate=None,
825 initial_mem=None,
826 initial_fprs=None):
827 if initial_regs is None:
828 initial_regs = [0] * 32
829 simulator = run_tst(prog, initial_regs, mem=initial_mem,
830 initial_fprs=initial_fprs,
831 svstate=svstate)
832
833 print("GPRs")
834 simulator.gpr.dump()
835 print("FPRs")
836 simulator.fpr.dump()
837
838 return simulator
839
840
841 if __name__ == "__main__":
842 unittest.main()