cf5915adf99b4348316ec7853376362df0ed6893
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_fft.py
1 from nmigen import Module, Signal
2 from nmigen.sim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.power_decoder import (create_pdecode)
6 from openpower.simulator.program import Program
7 from openpower.decoder.isa.caller import SVP64State
8 from openpower.decoder.selectable_int import SelectableInt
9 from openpower.decoder.isa.test_caller import run_tst
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from copy import deepcopy
12 from openpower.decoder.helpers import fp64toselectable, SINGLE
13 from openpower.decoder.isafunctions.double2single import ISACallerFnHelper
14
15 # really bad hack. need to access the DOUBLE2SINGLE function auto-generated
16 # from pseudo-code.
17 fph = ISACallerFnHelper(XLEN=64)
18
19
20 def transform_radix2(vec, exptable, reverse=False):
21 """
22 # FFT and convolution test (Python), based on Project Nayuki
23 #
24 # Copyright (c) 2020 Project Nayuki. (MIT License)
25 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
26
27 """
28 # bits of the integer 'val'.
29 def reverse_bits(val, width):
30 result = 0
31 for _ in range(width):
32 result = (result << 1) | (val & 1)
33 val >>= 1
34 return result
35
36 # Initialization
37 n = len(vec)
38 levels = n.bit_length() - 1
39
40 # Copy with bit-reversed permutation
41 if reverse:
42 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
43
44 size = 2
45 while size <= n:
46 halfsize = size // 2
47 tablestep = n // size
48 for i in range(0, n, size):
49 k = 0
50 for j in range(i, i + halfsize):
51 # exact same actual computation, just embedded in
52 # triple-nested for-loops
53 jl, jh = j, j+halfsize
54 vjh = vec[jh]
55 temp1 = vec[jh] * exptable[k]
56 temp2 = vec[jl]
57 vec[jh] = temp2 - temp1
58 vec[jl] = temp2 + temp1
59 print ("xform jl jh k", jl, jh, k,
60 "vj vjh ek", temp2, vjh, exptable[k],
61 "t1, t2", temp1, temp2,
62 "v[jh] v[jl]", vec[jh], vec[jl])
63 k += tablestep
64 size *= 2
65
66 return vec
67
68
69 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
70 """
71 # FFT and convolution test (Python), based on Project Nayuki
72 #
73 # Copyright (c) 2020 Project Nayuki. (MIT License)
74 # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
75
76 """
77 # bits of the integer 'val'.
78 def reverse_bits(val, width):
79 result = 0
80 for _ in range(width):
81 result = (result << 1) | (val & 1)
82 val >>= 1
83 return result
84
85 # Initialization
86 n = len(vec_r)
87 levels = n.bit_length() - 1
88
89 # Copy with bit-reversed permutation
90 if reverse:
91 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
92
93 size = 2
94 while size <= n:
95 halfsize = size // 2
96 tablestep = n // size
97 for i in range(0, n, size):
98 k = 0
99 for j in range(i, i + halfsize):
100 # exact same actual computation, just embedded in
101 # triple-nested for-loops
102 jl, jh = j, j+halfsize
103
104 print ("xform jl jh k", jl, jh, k,
105 "vr h l", vec_r[jh], vec_r[jl],
106 "vi h l", vec_i[jh], vec_i[jl])
107 print (" cr k", cos_r[k], "si k", sin_i[k])
108 mul1_r = vec_r[jh] * cos_r[k]
109 mul2_r = vec_i[jh] * sin_i[k]
110 tpre = mul1_r + mul2_r
111 print (" vec_r[jh] * cos_r[k]", mul1_r)
112 print (" vec_i[jh] * sin_i[k]", mul2_r)
113 print (" tpre", tpre)
114 mul1_i = vec_r[jh] * sin_i[k]
115 mul2_i = vec_i[jh] * cos_r[k]
116 tpim = -mul1_i + mul2_i
117 print (" vec_r[jh] * sin_i[k]", mul1_i)
118 print (" vec_i[jh] * cos_r[k]", mul2_i)
119 print (" tpim", tpim)
120 vec_r[jh] = vec_r[jl] - tpre
121 vec_i[jh] = vec_i[jl] - tpim
122 vec_r[jl] += tpre
123 vec_i[jl] += tpim
124
125 print (" xform jl jh k", jl, jh, k,
126 "\n vr h l", vec_r[jh], vec_r[jl],
127 "\n vi h l", vec_i[jh], vec_i[jl])
128 k += tablestep
129 size *= 2
130
131 return vec_r, vec_i
132
133
134 class FFTTestCase(FHDLTestCase):
135
136 def _check_regs(self, sim, expected):
137 for i in range(32):
138 self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
139
140 def test_sv_remap_fpmadds_fft_4(self):
141 """>>> lst = ["svshape 2, 1, 1, 1, 0",
142 "svremap 31, 1, 0, 2, 0, 1, 0",
143 "sv.ffmadds. *2, *2, *2, *10"
144 ]
145 this is a cheap (cheating) way to run a single "ffmadds." to
146 get at least Rc=1 on sv.ffmadds to be activated. the results
147 are not actually tested because there's no checking yet on
148 FP Rc=1
149 """
150 lst = SVP64Asm( ["svshape 2, 1, 1, 1, 0",
151 "svremap 31, 1, 0, 2, 0, 1, 0",
152 "sv.ffmadds *0, *0, *0, *8"
153 ])
154 lst = list(lst)
155
156 # array and coefficients to test
157 av = [7.0, -9.8 ] # array 0..1
158 coe = [3.1] # coefficients
159
160 # store in regfile
161 fprs = [0] * 32
162 for i, c in enumerate(coe):
163 fprs[i+8] = fp64toselectable(c)
164 for i, a in enumerate(av):
165 fprs[i+0] = fp64toselectable(a)
166
167 with Program(lst, bigendian=False) as program:
168 sim = self.run_tst_program(program, initial_fprs=fprs)
169 print ("spr svshape0", sim.spr['SVSHAPE0'])
170 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
171 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
172 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
173 print ("spr svshape1", sim.spr['SVSHAPE1'])
174 print ("spr svshape2", sim.spr['SVSHAPE2'])
175 print ("spr svshape3", sim.spr['SVSHAPE3'])
176
177 # work out the results with the twin mul/add-sub
178 res = transform_radix2(av, coe)
179
180 for i, expected in enumerate(res):
181 print ("i", i, float(sim.fpr(i)), "expected", expected)
182 for i, expected in enumerate(res):
183 # convert to Power single
184 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
185 expected = float(expected)
186 actual = float(sim.fpr(i))
187 # approximate error calculation, good enough test
188 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
189 # and the rounding is different
190 err = abs(actual - expected) / expected
191 self.assertTrue(err < 1e-7)
192
193 def test_sv_remap_fpmadds_fft(self):
194 """>>> lst = ["svshape 8, 1, 1, 1, 0",
195 "svremap 31, 1, 0, 2, 0, 1, 0",
196 "sv.ffmadds *2, *2, *2, *10"
197 ]
198 runs a full in-place O(N log2 N) butterfly schedule for
199 Discrete Fourier Transform.
200
201 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
202 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
203
204 there is the *option* to target a different location (non-in-place)
205 just in case.
206
207 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
208 (3 inputs, 2 outputs)
209 """
210 lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
211 "svremap 31, 1, 0, 2, 0, 1, 0",
212 "sv.ffmadds *0, *0, *0, *8"
213 ])
214 lst = list(lst)
215
216 # array and coefficients to test
217 av = [7.0, -9.8, 3.0, -32.3,
218 -2.0, 5.0, -9.8, 31.3] # array 0..7
219 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
220
221 # store in regfile
222 fprs = [0] * 32
223 for i, c in enumerate(coe):
224 fprs[i+8] = fp64toselectable(c)
225 for i, a in enumerate(av):
226 fprs[i+0] = fp64toselectable(a)
227
228 with Program(lst, bigendian=False) as program:
229 sim = self.run_tst_program(program, initial_fprs=fprs)
230 print ("spr svshape0", sim.spr['SVSHAPE0'])
231 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
232 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
233 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
234 print ("spr svshape1", sim.spr['SVSHAPE1'])
235 print ("spr svshape2", sim.spr['SVSHAPE2'])
236 print ("spr svshape3", sim.spr['SVSHAPE3'])
237
238 # work out the results with the twin mul/add-sub
239 res = transform_radix2(av, coe)
240
241 for i, expected in enumerate(res):
242 print ("i", i, float(sim.fpr(i)), "expected", expected)
243 for i, expected in enumerate(res):
244 # convert to Power single
245 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
246 expected = float(expected)
247 actual = float(sim.fpr(i))
248 # approximate error calculation, good enough test
249 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
250 # and the rounding is different
251 err = abs(actual - expected) / expected
252 self.assertTrue(err < 1e-7)
253
254 def test_sv_remap_fpmadds_fft_svstep(self):
255 """>>> lst = SVP64Asm( [
256 "svshape 8, 1, 1, 1, 1",
257 "svremap 31, 1, 0, 2, 0, 1, 0",
258 "sv.ffmadds *0, *0, *0, *8",
259 "setvl. 0, 0, 1, 1, 0, 0",
260 "bc 6, 3, -16"
261 ])
262 runs a full in-place O(N log2 N) butterfly schedule for
263 Discrete Fourier Transform. this version however uses
264 SVP64 "Vertical-First" Mode and so needs an explicit
265 branch, testing CR0.
266
267 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
268 (3 inputs, 2 outputs)
269 """
270 lst = SVP64Asm( [
271 "svshape 8, 1, 1, 1, 1",
272 "svremap 31, 1, 0, 2, 0, 1, 0",
273 "sv.ffmadds *0, *0, *0, *8",
274 "setvl. 0, 0, 1, 1, 0, 0",
275 "bc 6, 3, -16"
276 ])
277 lst = list(lst)
278
279 # array and coefficients to test
280 av = [7.0, -9.8, 3.0, -32.3,
281 -2.0, 5.0, -9.8, 31.3] # array 0..7
282 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
283
284 # store in regfile
285 fprs = [0] * 32
286 for i, c in enumerate(coe):
287 fprs[i+8] = fp64toselectable(c)
288 for i, a in enumerate(av):
289 fprs[i+0] = fp64toselectable(a)
290
291 # set total. err don't know how to calculate how many there are...
292 # do it manually for now
293 VL = 0
294 size = 2
295 n = len(av)
296 while size <= n:
297 halfsize = size // 2
298 tablestep = n // size
299 for i in range(0, n, size):
300 for j in range(i, i + halfsize):
301 VL += 1
302 size *= 2
303
304 # SVSTATE (calculated VL)
305 svstate = SVP64State()
306 svstate.vl = VL # VL
307 svstate.maxvl = VL # MAXVL
308 print ("SVSTATE", bin(svstate.asint()))
309
310 with Program(lst, bigendian=False) as program:
311 sim = self.run_tst_program(program, svstate=svstate,
312 initial_fprs=fprs)
313 print ("spr svshape0", sim.spr['SVSHAPE0'])
314 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
315 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
316 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
317 print ("spr svshape1", sim.spr['SVSHAPE1'])
318 print ("spr svshape2", sim.spr['SVSHAPE2'])
319 print ("spr svshape3", sim.spr['SVSHAPE3'])
320
321 # work out the results with the twin mul/add-sub
322 res = transform_radix2(av, coe)
323
324 for i, expected in enumerate(res):
325 print ("i", i, float(sim.fpr(i)), "expected", expected)
326 for i, expected in enumerate(res):
327 # convert to Power single
328 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
329 expected = float(expected)
330 actual = float(sim.fpr(i))
331 # approximate error calculation, good enough test
332 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
333 # and the rounding is different
334 err = abs(actual - expected) / expected
335 self.assertTrue(err < 1e-7)
336
337 def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
338 """>>> lst = SVP64Asm( [
339 "svshape 8, 1, 1, 1, 1",
340 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
341 "svremap 5, 1, 0, 2, 0, 0, 1",
342 "sv.fmuls 24, *0, *8",
343 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
344 "svremap 26, 0, 0, 0, 0, 1, 1",
345 "sv.ffadds *0, 24, *0",
346 "setvl. 0, 0, 1, 1, 0, 0",
347 "bc 6, 3, -28"
348 ])
349
350 runs a full in-place O(N log2 N) butterfly schedule for
351 Discrete Fourier Transform. also uses "Vertical First"
352 but also uses temporary scalars and ffadds rather than
353 sv.ffmadds.
354
355 this represents an incremental step towards complex FFT
356
357 SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
358
359 * single fmuls FRT, FRA, FRC
360 * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
361 (FRS is implicit / hidden in ff* operations)
362
363 multiply: # sv.fmuls FRT, FRA, FRC
364 temp1 = vec[jh] * exptable[k]
365 temp2 = vec[jl]
366 twin-add: # sv.ffadds FRT(/FRS), FRA, FRB
367 vec[jh] = temp2 - temp1
368 vec[jl] = temp2 + temp1
369
370 also see notes in complex fft test: here svremap is done in
371 "non-persistent" mode (as a demo) whereas in the complex fft
372 svremap is used in "persistent" mode, where by a complete
373 coincidence the REMAP arguments all happen to line up and
374 only one persistent svremap is needed. the exact same trick
375 *could* be applied here but for illustrative purposes it is not.
376 """
377 lst = SVP64Asm( [
378 "svshape 8, 1, 1, 1, 1",
379 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
380 "svremap 5, 1, 0, 2, 0, 0, 0",
381 "sv.fmuls 24, *0, *8",
382 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
383 "svremap 26, 0, 0, 0, 0, 1, 0",
384 "sv.ffadds *0, 24, *0",
385 "setvl. 0, 0, 1, 1, 0, 0",
386 "bc 6, 3, -28"
387 ])
388 lst = list(lst)
389
390 # array and coefficients to test
391 av = [7.0, -9.8, 3.0, -32.3,
392 -2.0, 5.0, -9.8, 31.3] # array 0..7
393 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
394
395 # store in regfile
396 fprs = [0] * 32
397 for i, c in enumerate(coe):
398 fprs[i+8] = fp64toselectable(c)
399 for i, a in enumerate(av):
400 fprs[i+0] = fp64toselectable(a)
401
402 # set total. err don't know how to calculate how many there are...
403 # do it manually for now
404 VL = 0
405 size = 2
406 n = len(av)
407 while size <= n:
408 halfsize = size // 2
409 tablestep = n // size
410 for i in range(0, n, size):
411 for j in range(i, i + halfsize):
412 VL += 1
413 size *= 2
414
415 # SVSTATE (calculated VL)
416 svstate = SVP64State()
417 svstate.vl = VL # VL
418 svstate.maxvl = VL # MAXVL
419 print ("SVSTATE", bin(svstate.asint()))
420
421 with Program(lst, bigendian=False) as program:
422 sim = self.run_tst_program(program, svstate=svstate,
423 initial_fprs=fprs)
424 print ("spr svshape0", sim.spr['SVSHAPE0'])
425 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
426 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
427 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
428 print ("spr svshape1", sim.spr['SVSHAPE1'])
429 print ("spr svshape2", sim.spr['SVSHAPE2'])
430 print ("spr svshape3", sim.spr['SVSHAPE3'])
431
432 # work out the results with the twin mul/add-sub
433 res = transform_radix2(av, coe)
434
435 for i, expected in enumerate(res):
436 print ("i", i, float(sim.fpr(i)), "expected", expected)
437 for i, expected in enumerate(res):
438 # convert to Power single
439 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
440 expected = float(expected)
441 actual = float(sim.fpr(i))
442 # approximate error calculation, good enough test
443 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
444 # and the rounding is different
445 err = abs(actual - expected) / expected
446 self.assertTrue(err < 1e-7)
447
448 def test_sv_fpmadds_fft(self):
449 """>>> lst = ["sv.ffmadds *2, *2, *2, *10"
450 ]
451 four in-place vector mul-adds, four in-place vector mul-subs
452
453 this is the twin "butterfly" mul-add-sub from Cooley-Tukey
454 https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
455
456 there is the *option* to target a different location (non-in-place)
457 just in case.
458
459 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
460 FRS to perform the two multiplies. one add, one subtract.
461
462 sv.ffmadds FRT, FRA, FRC, FRB actually does:
463 fmadds FRT , FRA, FRC, FRA
464 fnmsubs FRT+vl, FRA, FRC, FRB+vl
465
466 """
467 lst = SVP64Asm(["sv.ffmadds *2, *2, *2, *10"
468 ])
469 lst = list(lst)
470
471 fprs = [0] * 32
472 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
473 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
474 coe = [-1.0, 4.0, 3.1, 6.2] # coefficients
475 res = []
476 # work out the results with the twin mul/add-sub
477 for i, (a, b, c) in enumerate(zip(av, bv, coe)):
478 fprs[i+2] = fp64toselectable(a)
479 fprs[i+6] = fp64toselectable(b)
480 fprs[i+10] = fp64toselectable(c)
481 mul = a * c
482 t = b + mul
483 u = b - mul
484 t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
485 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
486 res.append((t, u))
487 print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
488
489 # SVSTATE (in this case, VL=2)
490 svstate = SVP64State()
491 svstate.vl = 4 # VL
492 svstate.maxvl = 4 # MAXVL
493 print ("SVSTATE", bin(svstate.asint()))
494
495 with Program(lst, bigendian=False) as program:
496 sim = self.run_tst_program(program, svstate=svstate,
497 initial_fprs=fprs)
498 # confirm that the results are as expected
499 for i, (t, u) in enumerate(res):
500 self.assertEqual(sim.fpr(i+2), t)
501 self.assertEqual(sim.fpr(i+6), u)
502
503 def test_sv_ffadds_fft(self):
504 """>>> lst = ["sv.ffadds *2, *2, *2"
505 ]
506 four in-place vector adds, four in-place vector subs
507
508 SVP64 "FFT" mode will *automatically* offset FRB and an implicit
509 FRS to perform the two multiplies. one add, one subtract.
510
511 sv.ffadds FRT, FRA, FRB actually does:
512 fadds FRT , FRB, FRA
513 fsubs FRT+vl, FRA, FRB+vl
514 """
515 lst = SVP64Asm(["sv.ffadds *2, *2, *2"
516 ])
517 lst = list(lst)
518
519 fprs = [0] * 32
520 av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
521 bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
522 res = []
523 # work out the results with the twin add-sub
524 for i, (a, b) in enumerate(zip(av, bv)):
525 fprs[i+2] = fp64toselectable(a)
526 fprs[i+6] = fp64toselectable(b)
527 t = b + a
528 u = b - a
529 t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
530 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
531 res.append((t, u))
532 print ("FFT", i, "in", a, b, "res", t, u)
533
534 # SVSTATE (in this case, VL=2)
535 svstate = SVP64State()
536 svstate.vl = 4 # VL
537 svstate.maxvl = 4 # MAXVL
538 print ("SVSTATE", bin(svstate.asint()))
539
540 with Program(lst, bigendian=False) as program:
541 sim = self.run_tst_program(program, svstate=svstate,
542 initial_fprs=fprs)
543 # confirm that the results are as expected
544 for i, (t, u) in enumerate(res):
545 a = float(sim.fpr(i+2))
546 b = float(sim.fpr(i+6))
547 t = float(t)
548 u = float(u)
549 print ("FFT", i, "in", a, b, "res", t, u)
550 for i, (t, u) in enumerate(res):
551 self.assertEqual(sim.fpr(i+2), t)
552 self.assertEqual(sim.fpr(i+6), u)
553
554 def test_sv_remap_fpmadds_fft_svstep_complex(self):
555 """
556 runs a full in-place O(N log2 N) butterfly schedule for
557 Discrete Fourier Transform. this version however uses
558 SVP64 "Vertical-First" Mode and so needs an explicit
559 branch, testing CR0.
560
561 SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
562 (3 inputs, 2 outputs)
563
564 complex calculation (FFT):
565
566 tpre = vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
567 vec_r[jh] = vec_r[jl] - tpre
568 vec_r[jl] += tpre
569
570 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
571 vec_i[jh] = vec_i[jl] - tpim
572 vec_i[jl] += tpim
573
574 real-only calculation (DFT):
575
576 temp1 = vec[jh] * exptable[k]
577 temp2 = vec[jl]
578 vec[jh] = temp2 - temp1
579 vec[jl] = temp2 + temp1
580
581 note: a rather nice convenience / coincidence. the meaning of
582 these two instructions is:
583 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
584 "svremap 5, 1, 0, 2, 0, 0, 1",
585 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
586 "svremap 26, 0, 0, 0, 0, 1, 1",
587
588 however it turns out that they can be *merged*, and for
589 the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
590 *ignore* their REMAPs (by definition, because you can't REMAP
591 scalar operands), and for the second one (sv.ffads) exactly the
592 right REMAPs are also ignored!
593
594 therefore we can merge:
595 "svremap 5, 1, 0, 2, 0, 0, 1",
596 "svremap 26, 0, 0, 0, 0, 1, 1",
597 into:
598 "svremap 31, 1, 0, 2, 0, 1, 1",
599 and save one instruction.
600 """
601 lst = SVP64Asm( [
602 # set triple butterfly mode with persistent "REMAP"
603 "svshape 8, 1, 1, 1, 1",
604 "svremap 31, 1, 0, 2, 0, 1, 1",
605 # tpre
606 "sv.fmuls 24, *0, *16", # mul1_r = r*cos_r
607 "sv.fmadds 24, *8, *20, 24", # mul2_r = i*sin_i
608 # tpre = mul1_r + mul2_r
609 # tpim
610 "sv.fmuls 26, *0, *20", # mul1_i = r*sin_i
611 "sv.fmsubs 26, *8, *16, 26", # mul2_i = i*cos_r
612 # tpim = mul2_i - mul1_i
613 # vec_r jh/jl
614 "sv.ffadds *0, 24, *0", # vh/vl +/- tpre
615 # vec_i jh/jl
616 "sv.ffadds *8, 26, *8", # vh/vl +- tpim
617
618 # svstep loop
619 "setvl. 0, 0, 1, 1, 0, 0",
620 "bc 6, 3, -56"
621 ])
622 lst = list(lst)
623
624 # array and coefficients to test
625 ar = [7.0, -9.8, 3.0, -32.3,
626 -2.0, 5.0, -9.8, 31.3] # array 0..7 real
627 ai = [1.0, -1.8, 3.0, 19.3,
628 4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
629 coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
630 coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
631
632 # store in regfile
633 fprs = [0] * 64
634 for i, a in enumerate(ar):
635 fprs[i+0] = fp64toselectable(a)
636 for i, a in enumerate(ai):
637 fprs[i+8] = fp64toselectable(a)
638 for i, cr in enumerate(coer):
639 fprs[i+16] = fp64toselectable(cr)
640 for i, ci in enumerate(coei):
641 fprs[i+20] = fp64toselectable(ci)
642
643 # set total. err don't know how to calculate how many there are...
644 # do it manually for now
645 VL = 0
646 size = 2
647 n = len(ar)
648 while size <= n:
649 halfsize = size // 2
650 tablestep = n // size
651 for i in range(0, n, size):
652 for j in range(i, i + halfsize):
653 VL += 1
654 size *= 2
655
656 # SVSTATE (calculated VL)
657 svstate = SVP64State()
658 svstate.vl = VL # VL
659 svstate.maxvl = VL # MAXVL
660 print ("SVSTATE", bin(svstate.asint()))
661
662 with Program(lst, bigendian=False) as program:
663 sim = self.run_tst_program(program, svstate=svstate,
664 initial_fprs=fprs)
665 print ("spr svshape0", sim.spr['SVSHAPE0'])
666 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
667 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
668 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
669 print ("spr svshape1", sim.spr['SVSHAPE1'])
670 print ("spr svshape2", sim.spr['SVSHAPE2'])
671 print ("spr svshape3", sim.spr['SVSHAPE3'])
672
673 # work out the results with the twin mul/add-sub, explicit
674 # complex numbers
675 res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
676
677 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
678 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
679 "expected_r", expected_r,
680 "expected_i", expected_i)
681 for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
682 # convert to Power single
683 expected_r = fph.DOUBLE2SINGLE(fp64toselectable(expected_r ))
684 expected_r = float(expected_r)
685 actual_r = float(sim.fpr(i))
686 # approximate error calculation, good enough test
687 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
688 # and the rounding is different
689 err = abs(actual_r - expected_r ) / expected_r
690 self.assertTrue(err < 1e-6)
691 # convert to Power single
692 expected_i = fph.DOUBLE2SINGLE(fp64toselectable(expected_i ))
693 expected_i = float(expected_i)
694 actual_i = float(sim.fpr(i+8))
695 # approximate error calculation, good enough test
696 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
697 # and the rounding is different
698 err = abs(actual_i - expected_i ) / expected_i
699 self.assertTrue(err < 1e-6)
700
701 def test_sv_ffadds_fft_scalar(self):
702 """>>> lst = ["sv.ffadds *2, 12, 13"
703 ]
704 four in-place vector adds and subs, but done with a scalar
705 pair (fp12, fp13)
706 """
707 lst = SVP64Asm(["sv.ffadds *2, 12, 13"
708 ])
709 lst = list(lst)
710
711 fprs = [0] * 32
712 scalar_a = 1.3
713 scalar_b = -2.0
714 fprs[12] = fp64toselectable(scalar_a)
715 fprs[13] = fp64toselectable(scalar_b)
716 res = []
717 # work out the results with the twin add-sub
718 for i in range(4):
719 t = scalar_b + scalar_a
720 u = scalar_b - scalar_a
721 t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
722 u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
723 res.append((t, u))
724 print ("FFT", i, "res", t, u)
725
726 # SVSTATE (in this case, VL=2)
727 svstate = SVP64State()
728 svstate.vl = 4 # VL
729 svstate.maxvl = 4 # MAXVL
730 print ("SVSTATE", bin(svstate.asint()))
731
732 with Program(lst, bigendian=False) as program:
733 sim = self.run_tst_program(program, svstate=svstate,
734 initial_fprs=fprs)
735 # confirm that the results are as expected
736 for i, (t, u) in enumerate(res):
737 a = float(sim.fpr(i+2))
738 b = float(sim.fpr(i+6))
739 t = float(t)
740 u = float(u)
741 print ("FFT", i, "in", a, b, "res", t, u)
742 for i, (t, u) in enumerate(res):
743 self.assertEqual(sim.fpr(i+2), t)
744 self.assertEqual(sim.fpr(i+6), u)
745
746 def test_sv_remap_fpmadds_fft_ldst(self):
747 """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
748 "sv.lfs/els *0, 4(0)",
749 "svshape 8, 1, 1, 1, 0",
750 "svremap 31, 1, 0, 2, 0, 1, 0",
751 "sv.ffmadds *0, *0, *0, *8"
752
753 runs a full in-place O(N log2 N) butterfly schedule for
754 Discrete Fourier Transform, using bit-reversed LD/ST
755 """
756 lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0",
757 "svremap 1, 0, 0, 0, 0, 0, 0",
758 "sv.lfs/els *0, 4(0)",
759 "svshape 8, 1, 1, 1, 0",
760 "svremap 31, 1, 0, 2, 0, 1, 0",
761 "sv.ffmadds *0, *0, *0, *8"
762 ])
763 lst = list(lst)
764
765 # array and coefficients to test
766 av = [7.0, -9.8, 3.0, -32.3,
767 -2.0, 5.0, -9.8, 31.3] # array 0..7
768 coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
769
770 # store in regfile
771 fprs = [0] * 32
772 for i, c in enumerate(coe):
773 fprs[i+8] = fp64toselectable(c)
774 # store in memory
775 mem = {}
776 val = 0
777 for i, a in enumerate(av):
778 a = SINGLE(fp64toselectable(a)).value
779 shift = (i % 2) == 1
780 if shift == 0:
781 val = a
782 else:
783 mem[(i//2)*8] = val | (a << 32)
784
785 with Program(lst, bigendian=False) as program:
786 sim = self.run_tst_program(program, initial_mem=mem,
787 initial_fprs=fprs)
788 print ("spr svshape0", sim.spr['SVSHAPE0'])
789 print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
790 print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
791 print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
792 print ("spr svshape1", sim.spr['SVSHAPE1'])
793 print ("spr svshape2", sim.spr['SVSHAPE2'])
794 print ("spr svshape3", sim.spr['SVSHAPE3'])
795
796 print ("mem dump")
797 print (sim.mem.dump())
798
799 # work out the results with the twin mul/add-sub,
800 # note bit-reverse mode requested
801 res = transform_radix2(av, coe, reverse=True)
802
803 for i, expected in enumerate(res):
804 print ("i", i, float(sim.fpr(i)), "expected", expected)
805 for i, expected in enumerate(res):
806 # convert to Power single
807 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
808 expected = float(expected)
809 actual = float(sim.fpr(i))
810 # approximate error calculation, good enough test
811 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
812 # and the rounding is different
813 err = abs(actual - expected) / expected
814 self.assertTrue(err < 1e-6)
815
816 def run_tst_program(self, prog, initial_regs=None,
817 svstate=None,
818 initial_mem=None,
819 initial_fprs=None):
820 if initial_regs is None:
821 initial_regs = [0] * 32
822 simulator = run_tst(prog, initial_regs, mem=initial_mem,
823 initial_fprs=initial_fprs,
824 svstate=svstate)
825
826 print ("GPRs")
827 simulator.gpr.dump()
828 print ("FPRs")
829 simulator.fpr.dump()
830
831 return simulator
832
833
834 if __name__ == "__main__":
835 unittest.main()