add SVP64 i-DCT unit test for inner butterfly, coefficients pre-computed
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.power_decoder import (create_pdecode)
5 from openpower.simulator.program import Program
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.selectable_int import SelectableInt
8 from openpower.decoder.isa.test_caller import run_tst
9 from openpower.sv.trans.svp64 import SVP64Asm
10 from copy import deepcopy
11 from openpower.decoder.helpers import fp64toselectable, SINGLE
12 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
13 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
14 iterate_dct_inner_butterfly_indices,
15 iterate_dct_outer_butterfly_indices,
16 transform2)
17 import unittest
18 import math
19
20
def transform_inner_radix2_dct(vec, ctable):
    """Python model of the in-place DCT inner butterfly.

    The input is first re-ordered (half-swap via ``halfrev2`` followed by
    a bit-reversed pick), then every index pair produced by
    ``iterate_dct_inner_butterfly_indices`` gets a twin operation:
    sum into the low index, ``(difference) * (1.0/coefficient)`` into the
    high index.

    vec:    list of input values (not modified; a re-ordered copy is used)
    ctable: coefficients, consumed one per butterfly step, in step order

    Returns the transformed (re-ordered) list.
    """
    # Initialization
    size = len(vec)
    print ()
    print ("transform2", size)
    nbits = size.bit_length() - 1

    # reference (read/write) the in-place data in *reverse-bit-order*
    identity = list(range(size))
    bitrev = [identity[reverse_bits(pos, nbits)] for pos in range(size)]

    # and pretend we LDed data in half-swapped *and* bit-reversed order as well
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    data = [swapped[bitrev[pos]] for pos in range(size)]

    ################
    # INNER butterfly
    ################

    # set up an SVSHAPE: the two schedules differ only in "skip"
    class SVSHAPE:
        def __init__(self, skip):
            self.lims = [size, 2, 0]
            self.mode = 0b01
            self.submode2 = 0b01
            self.skip = skip
            self.offset = 0         # experiment with different offset, here
            self.invxyz = [1, 0, 0]  # inversion if desired

    shape_lo = SVSHAPE(0b00)  # j schedule
    shape_hi = SVSHAPE(0b01)  # j+halfstep schedule

    # enumerate over the iterator function, getting new indices
    lo_iter = iterate_dct_inner_butterfly_indices(shape_lo)
    hi_iter = iterate_dct_inner_butterfly_indices(shape_hi)
    step = 0
    for (lo, lo_end), (hi, hi_end) in zip(lo_iter, hi_iter):
        first, second = data[lo], data[hi]
        coeff = ctable[step]
        data[lo] = first + second
        data[hi] = (first - second) * (1.0/coeff)
        print ("coeff", "ci", step,
               "jl", lo, "jh", hi,
               "i/n", (step+0.5), 1.0/coeff,
               "t1, t2", first, second, "res", data[lo], data[hi],
               "end", bin(lo_end), bin(hi_end))
        if lo_end == 0b111:  # all loops end
            break
        step += 1

    return data
82
83
def transform_outer_radix2_dct(vec):
    """Python model of the in-place DCT outer butterfly (iterative sums).

    For each index pair produced by ``iterate_dct_outer_butterfly_indices``
    the value at the second index is added into the first.

    vec: list of values; it is modified in place and also returned.
    """
    # Initialization
    count = len(vec)
    print ()
    print ("transform2", count)

    # outer butterfly: the two schedules differ only in "skip"
    class SVSHAPE:
        def __init__(self, skip):
            self.lims = [count, 3, 0]
            self.submode2 = 0b100
            self.mode = 0b01
            self.skip = skip
            self.offset = 0         # experiment with different offset, here
            self.invxyz = [0, 0, 0]  # inversion if desired

    shape_dst = SVSHAPE(0b00)  # j schedule
    shape_src = SVSHAPE(0b01)  # j+halfstep schedule

    # walk both index schedules in lock-step
    dst_iter = iterate_dct_outer_butterfly_indices(shape_dst)
    src_iter = iterate_dct_outer_butterfly_indices(shape_src)
    for (dst, dst_end), (src, src_end) in zip(dst_iter, src_iter):
        print ("itersum jr", dst, src,
               "end", bin(dst_end), bin(src_end))
        vec[dst] += vec[src]
        if dst_end == 0b111:  # all loops end
            break

    print("transform2 result", vec)

    return vec
129
130
def transform_inner_radix2_idct(vec, ctable):
    """Python model of the in-place inverse-DCT inner butterfly.

    The input is re-ordered with ``halfrev2`` only (no bit-reversal),
    then every index pair produced by
    ``iterate_dct_inner_butterfly_indices`` gets a twin operation:
    sum into the low index, ``(difference) * (1.0/coefficient)`` into the
    high index.

    vec:    list of input values (a re-ordered copy is worked on)
    ctable: coefficients, consumed one per butterfly step, in step order

    Returns the transformed list.
    """
    # Initialization
    size = len(vec)
    print ()
    print ("transform2", size)

    # pretend we LDed data in half-swapped order
    data = halfrev2(vec, False)

    ################
    # INNER butterfly
    ################

    # set up an SVSHAPE: the two schedules differ only in "skip"
    class SVSHAPE:
        def __init__(self, skip):
            self.lims = [size, 0b000001, 0]
            self.mode = 0b11
            self.submode2 = 0b11
            self.skip = skip
            self.offset = 0         # experiment with different offset, here
            self.invxyz = [0, 0, 0]  # inversion if desired

    shape_lo = SVSHAPE(0b00)  # j schedule
    shape_hi = SVSHAPE(0b01)  # j+halfstep schedule

    # enumerate over the iterator function, getting new indices
    lo_iter = iterate_dct_inner_butterfly_indices(shape_lo)
    hi_iter = iterate_dct_inner_butterfly_indices(shape_hi)
    step = 0
    for (lo, lo_end), (hi, hi_end) in zip(lo_iter, hi_iter):
        first, second = data[lo], data[hi]
        coeff = ctable[step]
        data[lo] = first + second
        data[hi] = (first - second) * (1.0/coeff)
        print ("coeff", "ci", step,
               "jl", lo, "jh", hi,
               "i/n", (step+0.5), 1.0/coeff,
               "t1, t2", first, second, "res", data[lo], data[hi],
               "end", bin(lo_end), bin(hi_end))
        if lo_end == 0b111:  # all loops end
            break
        step += 1

    return data
186
187
def transform_outer_radix2_idct(vec):
    """Python model of the in-place inverse-DCT outer butterfly.

    The input is picked in bit-reversed order and then passed through
    ``halfrev2(..., True)``; after that, for each index pair produced by
    ``iterate_dct_outer_butterfly_indices`` the value at the second index
    is added into the first.

    vec: list of input values (a re-ordered copy is worked on)

    Returns the summed list.
    """
    # Initialization
    count = len(vec)
    print ()
    print ("transform2-inv", count)
    nbits = count.bit_length() - 1

    # reference (read/write) the in-place data in *reverse-bit-order*
    identity = list(range(count))
    bitrev = [identity[reverse_bits(pos, nbits)] for pos in range(count)]

    # and pretend we LDed data in half-swapped *and* bit-reversed order as well
    # TODO: merge these two
    picked = [vec[bitrev[pos]] for pos in range(count)]
    data = halfrev2(picked, True)

    # outer butterfly: the two schedules differ only in "skip"
    class SVSHAPE:
        def __init__(self, skip):
            self.lims = [count, 3, 0]
            self.submode2 = 0b011
            self.mode = 0b11
            self.skip = skip
            self.offset = 0         # experiment with different offset, here
            self.invxyz = [1, 0, 1]  # inversion if desired

    shape_dst = SVSHAPE(0b00)  # j schedule
    shape_src = SVSHAPE(0b01)  # j+halfstep schedule

    # walk both index schedules in lock-step
    dst_iter = iterate_dct_outer_butterfly_indices(shape_dst)
    src_iter = iterate_dct_outer_butterfly_indices(shape_src)
    for (dst, dst_end), (src, src_end) in zip(dst_iter, src_iter):
        print ("itersum jr", dst, src,
               "end", bin(dst_end), bin(src_end))
        data[dst] += data[src]
        if dst_end == 0b111:  # all loops end
            break

    print("transform2-inv result", data)

    return data
242
243
class DCTTestCase(FHDLTestCase):
    """SVP64 DCT / inverse-DCT unit tests.

    Each test assembles a short SVP64 program, runs it through
    ``run_tst`` (see ``run_tst_program``) and compares the resulting
    floating-point registers against a pure-Python computation.
    """

    # ------------------------------------------------------------------
    # shared helpers (leading underscore: not collected as tests)
    # ------------------------------------------------------------------

    def _check_regs(self, sim, expected):
        """Assert that GPRs 0..31 of sim equal the 32 values in expected."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    def _dump_svshapes(self, sim):
        """Print the four SVSHAPE SPRs (plus SVSHAPE0 dimensions)."""
        print ("spr svshape0", sim.spr['SVSHAPE0'])
        print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print ("spr svshape1", sim.spr['SVSHAPE1'])
        print ("spr svshape2", sim.spr['SVSHAPE2'])
        print ("spr svshape3", sim.spr['SVSHAPE3'])

    def _check_fprs_close(self, sim, res, tolerance):
        """Compare FPRs 0..len(res)-1 against res by relative error.

        sim:       simulator returned by run_tst_program
        res:       expected values (python floats), one per register
        tolerance: exclusive upper bound on abs((actual-expected)/expected)

        NOTE: divides by the expected value, so every entry of res must
        be non-zero.
        """
        for i, expected in enumerate(res):
            print ("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print ("err", i, err)
            self.assertLess(err, tolerance)

    @staticmethod
    def _cos_table(n, repeat_per_block):
        """Build the list of 2*cos((ci+0.5)*pi/size) DCT coefficients.

        size runs n, n/2, ... 2.  With repeat_per_block true, each
        layer's halfsize coefficients are repeated n//size times (one
        copy per block of that layer); with it false each layer
        contributes its coefficients once only (the "shortened" table).
        """
        table = []
        size = n
        while size >= 2:
            halfsize = size // 2
            copies = n // size if repeat_per_block else 1
            for _ in range(copies):
                for ci in range(halfsize):
                    table.append(
                        math.cos((ci + 0.5) * math.pi / size) * 2.0)
            size //= 2
        return table

    def _check_cos_precompute(self, sim, ctable):
        """Dump intermediates of a COS-precompute run and check FPR 80+.

        Prints memory and the FPR groups starting at 24, 48 and 0, then
        asserts that FPR[80+i] is within 1e-6 (relative) of 1/ctable[i].
        """
        print ("MEM")
        sim.mem.dump()
        print ("ci FP")
        for i in range(len(ctable)):
            actual = float(sim.fpr(i+24))
            print ("i", i, actual)
        print ("size FP")
        for i in range(len(ctable)):
            actual = float(sim.fpr(i+48))
            print ("i", i, actual)
        print ("temps")
        for i in range(len(ctable)):
            actual = float(sim.fpr(i))
            print ("i", i, actual)
        for i, coeff in enumerate(ctable):
            expected = 1.0/coeff
            actual = float(sim.fpr(i+80))
            err = abs((actual - expected) / expected)
            print ("i", i, actual, "1/expect", 1/expected,
                   "expected", expected,
                   "err", err)
            self.assertLess(err, 1e-6)

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_sv_ffadds_dct(self):
        """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ]
            four in-place vector adds, four in-place vector mul-subs

            SVP64 "DCT" mode will *automatically* offset FRB and an implicit
            FRS to perform the two multiplies.  one add, one subtract.

            sv.fdmadds FRT, FRA, FRC, FRB actually does:
                fadds  FRT   , FRB, FRA
                fsubs  FRT+vl, FRA, FRB+vl
        """
        lst = SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # cheat here with these values, they're selected so that
        # rounding errors do not occur. sigh.
        fprs = [0] * 32
        av = [7.0, -0.8, 2.0, -2.3]     # first half of array 0..3
        bv = [-2.0, 2.0, -0.8, 1.4]     # second half of array 4..7
        cv = [-1.0, 0.5, 2.5, -0.25]    # coefficients
        res = []
        # work out the results with the twin add-sub
        for i, (a, b, c) in enumerate(zip(av, bv, cv)):
            fprs[i+0] = fp64toselectable(a)
            fprs[i+4] = fp64toselectable(b)
            fprs[i+8] = fp64toselectable(c)
            # this isn't quite a perfect replication of the
            # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
            # and FPSUB32 directly to be honest.
            t = a + b
            diff = (a - b)
            diff = DOUBLE2SINGLE(fp64toselectable(diff))  # FP32 round
            diff = float(diff)
            u = diff * c
            tc = DOUBLE2SINGLE(fp64toselectable(t))  # convert to Power single
            uc = DOUBLE2SINGLE(fp64toselectable(u))  # from double
            res.append((uc, tc))
            print ("DCT", i, "in", a, b, "c", c, "res", t, u)

        # SVSTATE (in this case, VL=4)
        svstate = SVP64State()
        svstate.vl = 4     # VL
        svstate.maxvl = 4  # MAXVL
        print ("SVSTATE", bin(svstate.asint()))

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            # confirm that the results are as expected
            for i, (exp_lo, exp_hi) in enumerate(res):
                a = float(sim.fpr(i+0))
                b = float(sim.fpr(i+4))
                print ("DCT", i, "in", a, b,
                       "res", float(exp_lo), float(exp_hi))
            for i, (exp_lo, exp_hi) in enumerate(res):
                self.assertEqual(sim.fpr(i+0), exp_lo)
                self.assertEqual(sim.fpr(i+4), exp_hi)

    def test_sv_remap_fpmadds_dct_inner_4(self):
        """>>> lst = ["svshape 4, 1, 1, 2, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                     ]
            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
            for DCT

            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
            (3 inputs, 2 outputs)

            Note that the coefficient (FRC) is not on a "schedule", it
            is straight Vectorised (0123...) because DCT coefficients
            cannot be shared between butterfly layers (due to +0.5)
        """
        lst = SVP64Asm(["svshape 4, 1, 1, 2, 0",
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        n = 4
        coe = [-0.25, 0.5, 3.1, 6.2]  # 4 coefficients
        avi = [7.0, -0.8, 2.0, -2.3]

        # registers are loaded half-swapped and bit-reversed
        levels = n.bit_length() - 1
        ri = [reverse_bits(i, levels) for i in range(n)]
        av = halfrev2(avi, False)
        av = [av[ri[i]] for i in range(n)]

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(coe):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # work out the results with the twin mul/add-sub
            res = transform_inner_radix2_dct(avi, coe)

            self._check_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_idct_inner_4(self):
        """>>> lst = ["svshape 4, 1, 1, 10, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                     ]
            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
            for inverse-DCT

            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
            (3 inputs, 2 outputs)

            Note that the coefficient (FRC) is not on a "schedule", it
            is straight Vectorised (0123...) because DCT coefficients
            cannot be shared between butterfly layers (due to +0.5)
        """
        # NOTE: this method used to be a second definition of
        # test_sv_remap_fpmadds_dct_inner_4, which silently replaced the
        # forward-DCT test of that name so that it never ran.
        lst = SVP64Asm(["svshape 4, 1, 1, 10, 0",
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        coe = [-0.25, 0.5, 3.1, 6.2]  # 4 coefficients
        avi = [7.0, -0.8, 2.0, -2.3]
        av = halfrev2(avi, False)

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(coe):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # work out the results with the twin mul/add-sub
            res = transform_inner_radix2_idct(avi, coe)

            self._check_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_idct_outer_8(self):
        """>>> lst = ["svshape 8, 1, 1, 11, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                     ]
            runs a full in-place 8-long O(N log2 N) outer butterfly schedule
            for inverse-DCT, does the iterative overlapped ADDs

            SVP64 "REMAP" in Butterfly Mode.
        """
        lst = SVP64Asm(["svshape 8, 1, 1, 11, 0",
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fadds 0.v, 0.v, 0.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # registers are loaded bit-reversed, then half-swapped
        n = len(avi)
        levels = n.bit_length() - 1
        ri = [reverse_bits(i, levels) for i in range(n)]
        av = [avi[ri[i]] for i in range(n)]
        av = halfrev2(av, True)

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # outer iterative sum
            res = transform_outer_radix2_idct(avi)

            self._check_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_outer_8(self):
        """>>> lst = ["svshape 8, 1, 1, 3, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                     ]
            runs a full in-place 8-long O(N log2 N) outer butterfly schedule
            for DCT, does the iterative overlapped ADDs

            SVP64 "REMAP" in Butterfly Mode.
        """
        lst = SVP64Asm(["svshape 8, 1, 1, 3, 0",
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fadds 0.v, 0.v, 0.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # outer iterative sum (note: updates av in place)
            res = transform_outer_radix2_dct(av)

            self._check_fprs_close(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_8(self):
        """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
                     "svshape 8, 1, 1, 2, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                     "svshape 8, 1, 1, 3, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                     ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules.
        """
        lst = SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
                        "svshape 8, 1, 1, 2, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                        "svshape 8, 1, 1, 3, 0",
                        "sv.fadds 0.v, 0.v, 0.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
        n = len(avi)
        levels = n.bit_length() - 1
        ri = [reverse_bits(i, levels) for i in range(n)]
        av = halfrev2(avi, False)
        av = [av[ri[i]] for i in range(n)]
        # full-length table: one coefficient per butterfly step
        ctable = self._cos_table(n, True)

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # full DCT computed in python
            res = transform2(avi)

            self._check_fprs_close(sim, res, 1e-5)

    def test_sv_remap_dct_cos_precompute_8(self):
        """pre-computes a DCT COS table, deliberately using a lot of
        registers so as to be able to see what is going on (dumping all
        regs after the run).

        the simpler (scalar) version is in test_caller_transcendentals.py
        (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
        need the new version of fcfids which doesn't spam memory with
        LD/STs.
        """
        lst = SVP64Asm(["svshape 8, 1, 1, 2, 0",
                        "svremap 0, 0, 0, 2, 0, 1, 1",
                        "sv.svstep 4.v, 4, 1",  # svstep get vector of ci
                        "sv.svstep 16.v, 3, 1",  # svstep get vector of step
                        "addi 1, 0, 0x0000",
                        "setvl 0, 0, 12, 0, 1, 1",
                        "sv.std 4.v, 0(1)",
                        "sv.lfd 64.v, 0(1)",
                        "sv.fcfids 48.v, 64.v",
                        "addi 1, 0, 0x0060",
                        "sv.std 16.v, 0(1)",
                        "sv.lfd 12.v, 0(1)",
                        "sv.fcfids 24.v, 12.v",
                        "sv.fadds 0.v, 24.v, 43",  # plus 0.5
                        "sv.fmuls 0.v, 0.v, 41",  # times PI
                        "sv.fdivs 0.v, 0.v, 48.v",  # div size
                        "sv.fcoss 80.v, 0.v",
                        "sv.fdivs 80.v, 43, 80.v",  # div 0.5 / x
                        ])
        lst = list(lst)

        gprs = [0] * 32
        fprs = [0] * 128
        # constants
        fprs[43] = fp64toselectable(0.5)         # 0.5
        fprs[41] = fp64toselectable(math.pi)     # pi
        fprs[44] = fp64toselectable(2.0)         # 2.0

        n = 8
        ctable = self._cos_table(n, True)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
            self._check_cos_precompute(sim, ctable)

    def test_sv_remap_dct_cos_precompute_inner_8(self):
        """pre-computes a DCT COS table, using the shorter costable
        indices schedule.  turns out, some COS values are repeated
        in each layer of the DCT butterfly.

        the simpler (scalar) version is in test_caller_transcendentals.py
        (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
        need the new version of fcfids which doesn't spam memory with
        LD/STs.
        """
        lst = SVP64Asm(["svshape 8, 1, 1, 5, 0",
                        "svremap 0, 0, 0, 2, 0, 1, 1",
                        "sv.svstep 4.v, 3, 1",  # svstep get vector of ci
                        "sv.svstep 16.v, 2, 1",  # svstep get vector of step
                        "addi 1, 0, 0x0000",
                        "setvl 0, 0, 7, 0, 1, 1",
                        "sv.std 4.v, 0(1)",
                        "sv.lfd 64.v, 0(1)",
                        "sv.fcfids 48.v, 64.v",
                        "addi 1, 0, 0x0060",
                        "sv.std 16.v, 0(1)",
                        "sv.lfd 12.v, 0(1)",
                        "sv.fcfids 24.v, 12.v",
                        "sv.fadds 0.v, 24.v, 43",  # plus 0.5
                        "sv.fmuls 0.v, 0.v, 41",  # times PI
                        "sv.fdivs 0.v, 0.v, 48.v",  # div size
                        "sv.fcoss 80.v, 0.v",
                        "sv.fdivs 80.v, 43, 80.v",  # div 0.5 / x
                        ])
        lst = list(lst)

        gprs = [0] * 32
        fprs = [0] * 128
        # constants
        fprs[43] = fp64toselectable(0.5)         # 0.5
        fprs[41] = fp64toselectable(math.pi)     # pi
        fprs[44] = fp64toselectable(2.0)         # 2.0

        n = 8

        # shortened table, built inline so each coefficient can be printed
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            for ci in range(halfsize):
                coeff = math.cos((ci + 0.5) * math.pi / size) * 2.0
                ctable.append(coeff)
                print ("coeff", "ci", ci, "size", size,
                       "i/n", (ci+0.5), 1.0/coeff)
            size //= 2

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
            self._check_cos_precompute(sim, ctable)

    def test_sv_remap_fpmadds_dct_8_mode_4(self):
        """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
                     "svshape 8, 1, 1, 4, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                     "svshape 8, 1, 1, 3, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                     ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules.
            uses shorter tables: FRC also needs to be on a Schedule
        """
        lst = SVP64Asm(["svremap 31, 1, 0, 2, 0, 1, 1",
                        "svshape 8, 1, 1, 4, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                        "svshape 8, 1, 1, 3, 0",
                        "sv.fadds 0.v, 0.v, 0.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
        n = len(avi)
        levels = n.bit_length() - 1
        ri = [reverse_bits(i, levels) for i in range(n)]
        av = halfrev2(avi, False)
        av = [av[ri[i]] for i in range(n)]
        # shortened table: each layer's coefficients appear once
        ctable = self._cos_table(n, False)

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # full DCT computed in python
            res = transform2(avi)

            self._check_fprs_close(sim, res, 1e-5)

    def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self):
        """>>> lst = [# LOAD bit-reversed with half-swap
                      "svshape 8, 1, 1, 6, 0",
                      "svremap 1, 0, 0, 0, 0, 0, 0, 1",
                      "sv.lfsbr 0.v, 4(1), 2",
                      # Inner butterfly, twin +/- MUL-ADD-SUB
                      "svremap 31, 1, 0, 2, 0, 1, 1",
                      "svshape 8, 1, 1, 4, 0",
                      "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                      # Outer butterfly, iterative sum
                      "svshape 8, 1, 1, 3, 0",
                      "sv.fadds 0.v, 0.v, 0.v"
                     ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules, and using
            bit-reversed half-swapped LDs.
            uses shorter pre-loaded COS tables: FRC also needs to be on a
            Schedule
        """
        lst = SVP64Asm(["addi 1, 0, 0x000",
                        "svshape 8, 1, 1, 6, 0",
                        "svremap 1, 0, 0, 0, 0, 0, 0, 1",
                        "sv.lfsbr 0.v, 4(1), 2",
                        "svremap 31, 1, 0, 2, 0, 1, 1",
                        "svshape 8, 1, 1, 4, 0",
                        "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                        "svshape 8, 1, 1, 3, 0",
                        "sv.fadds 0.v, 0.v, 0.v"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
        # LD will bring them in, in the correct order.
        mem = {}
        val = 0
        for i, a in enumerate(avi):
            word = SINGLE(fp64toselectable(a)).value
            if i % 2 == 0:
                val = word  # accumulate for next iteration
            else:
                # even and odd 4-byte in same 8
                mem[(i//2)*8] = val | (word << 32)

        # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
        n = len(avi)
        ctable = self._cos_table(n, False)

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs,
                                       initial_mem=mem)
            self._dump_svshapes(sim)

            # full DCT computed in python
            res = transform2(avi)

            self._check_fprs_close(sim, res, 1e-5)

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None,
                        initial_mem=None,
                        initial_fprs=None):
        """Run prog through run_tst, dump GPRs and FPRs, return the simulator.

        prog:         assembled Program to execute
        initial_regs: initial GPR values (defaults to 32 zeros)
        svstate:      optional initial SVSTATE, passed through to run_tst
        initial_mem:  optional initial memory contents (run_tst's "mem")
        initial_fprs: optional initial FPR values
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        simulator = run_tst(prog, initial_regs, mem=initial_mem,
                            initial_fprs=initial_fprs,
                            svstate=svstate)

        print ("GPRs")
        simulator.gpr.dump()
        print ("FPRs")
        simulator.fpr.dump()

        return simulator
943
944
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()