add i-DCT SVP64 unit test for outer butterfly
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.power_decoder import (create_pdecode)
5 from openpower.simulator.program import Program
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.selectable_int import SelectableInt
8 from openpower.decoder.isa.test_caller import run_tst
9 from openpower.sv.trans.svp64 import SVP64Asm
10 from copy import deepcopy
11 from openpower.decoder.helpers import fp64toselectable, SINGLE
12 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
13 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
14 iterate_dct_inner_butterfly_indices,
15 iterate_dct_outer_butterfly_indices,
16 transform2)
17 import unittest
18 import math
19
20
def transform_inner_radix2_dct(vec, ctable):
    """Pure-python model of the in-place DCT inner butterfly schedule.

    The input is re-ordered (halfrev2 with False, then bit-reversed
    indexing) into a fresh list, after which index pairs are drawn from
    two inner-butterfly schedules that differ only in their "skip" field.
    For the k-th pair a twin add / scaled-subtract is performed using
    ctable[k] as a divisor.

    vec:    sequence of input values
    ctable: coefficients, one per butterfly step (indexed by step number)

    Returns the list holding the butterfly results.
    """
    count = len(vec)
    print ()
    print ("transform2", count)
    nbits = count.bit_length() - 1

    # pretend we LDed data in half-swapped *and* bit-reversed order
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    data = [swapped[reverse_bits(idx, nbits)] for idx in range(count)]

    ################
    # INNER butterfly
    ################

    class SVSHAPE:
        pass

    def make_shape(skip):
        # every schedule gets its own lims/invxyz list objects
        shape = SVSHAPE()
        shape.lims = [count, 2, 0]
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0       # experiment with different offset, here
        shape.invxyz = [1, 0, 0]  # inversion if desired
        return shape

    # j schedule (skip=0b00) and j+halfstep schedule (skip=0b01)
    lo_sched = iterate_dct_inner_butterfly_indices(make_shape(0b00))
    hi_sched = iterate_dct_inner_butterfly_indices(make_shape(0b01))

    # walk both schedules in lock-step, getting new indices each time
    for step, (lo_item, hi_item) in enumerate(zip(lo_sched, hi_sched)):
        lo, lo_end = lo_item
        hi, hi_end = hi_item
        a, b = data[lo], data[hi]
        coeff = ctable[step]
        data[lo] = a + b
        data[hi] = (a - b) * (1.0/coeff)
        print ("coeff", "ci", step,
               "jl", lo, "jh", hi,
               "i/n", (step+0.5), 1.0/coeff,
               "t1, t2", a, b, "res", data[lo], data[hi],
               "end", bin(lo_end), bin(hi_end))
        if lo_end == 0b111:  # all loops end
            break

    return data
82
83
def transform_outer_radix2_dct(vec):
    """Pure-python model of the in-place DCT outer butterfly (iterative sum).

    Index pairs (jl, jh) are drawn from two outer-butterfly schedules that
    differ only in their "skip" field, and data[jl] += data[jh] is applied
    for each pair until the first schedule reports all loops ended.

    vec: sequence of input values.  It is copied first, so the caller's
         sequence is left unmodified.

    Returns a new list holding the accumulated results.
    """
    # work on a private copy: the sums below are applied in place
    data = list(vec)
    n = len(data)
    print ()
    print ("transform2", n)

    # outer butterfly
    xdim = n
    zdim = 0

    class SVSHAPE:
        pass

    # j schedule
    SVSHAPE0 = SVSHAPE()
    SVSHAPE0.lims = [xdim, 3, zdim]
    SVSHAPE0.submode2 = 0b100
    SVSHAPE0.mode = 0b01
    SVSHAPE0.skip = 0b00
    SVSHAPE0.offset = 0       # experiment with different offset, here
    SVSHAPE0.invxyz = [0, 0, 0]  # inversion if desired
    # j+halfstep schedule
    SVSHAPE1 = SVSHAPE()
    SVSHAPE1.lims = [xdim, 3, zdim]
    SVSHAPE1.mode = 0b01
    SVSHAPE1.submode2 = 0b100
    SVSHAPE1.skip = 0b01
    SVSHAPE1.offset = 0       # experiment with different offset, here
    SVSHAPE1.invxyz = [0, 0, 0]  # inversion if desired

    # enumerate over the iterator function, getting new indices
    i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
    i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
    for (jl, jle), (jh, jhe) in zip(i0, i1):
        print ("itersum    jr", jl, jh,
               "end", bin(jle), bin(jhe))
        data[jl] += data[jh]
        if jle == 0b111:  # all loops end
            break

    print("transform2 result", data)

    return data
129
130
def transform_inner_radix2_idct(vec, ctable):
    """Pure-python model of the inverse-DCT inner butterfly schedule.

    The input is first passed through halfrev2(vec, True); index pairs
    are then drawn from two inner-butterfly schedules that differ only in
    their "skip" field.  For the k-th pair a twin add / scaled-subtract
    is performed using ctable[k] as a divisor.

    vec:    sequence of input values
    ctable: coefficients, one per butterfly step (indexed by step number)

    Returns the list holding the butterfly results.
    """
    count = len(vec)
    print ()
    print ("transform2", count)

    # pretend we LDed data in half-swapped order
    data = halfrev2(vec, True)

    ################
    # INNER butterfly
    ################

    class SVSHAPE:
        pass

    def make_shape(skip):
        # every schedule gets its own lims/invxyz list objects
        shape = SVSHAPE()
        shape.lims = [count, 0b000001, 0]
        shape.mode = 0b11
        shape.submode2 = 0b11
        shape.skip = skip
        shape.offset = 0       # experiment with different offset, here
        shape.invxyz = [0, 0, 0]  # inversion if desired
        return shape

    # j schedule (skip=0b00) and j+halfstep schedule (skip=0b01)
    lo_sched = iterate_dct_inner_butterfly_indices(make_shape(0b00))
    hi_sched = iterate_dct_inner_butterfly_indices(make_shape(0b01))

    # walk both schedules in lock-step, getting new indices each time
    for step, (lo_item, hi_item) in enumerate(zip(lo_sched, hi_sched)):
        lo, lo_end = lo_item
        hi, hi_end = hi_item
        a, b = data[lo], data[hi]
        coeff = ctable[step]
        data[lo] = a + b
        data[hi] = (a - b) * (1.0/coeff)
        print ("coeff", "ci", step,
               "jl", lo, "jh", hi,
               "i/n", (step+0.5), 1.0/coeff,
               "t1, t2", a, b, "res", data[lo], data[hi],
               "end", bin(lo_end), bin(hi_end))
        if lo_end == 0b111:  # all loops end
            break

    return data
186
187
def transform_outer_radix2_idct(vec):
    """Pure-python model of the inverse-DCT outer butterfly (iterative sum).

    The input is picked in bit-reversed index order into a fresh list and
    then passed through halfrev2(..., True).  Index pairs (jl, jh) are
    drawn from two outer-butterfly schedules that differ only in their
    "skip" field, and data[jl] += data[jh] is applied for each pair until
    the first schedule reports all loops ended.

    vec: sequence of input values

    Returns the list holding the accumulated results.
    """
    count = len(vec)
    print ()
    print ("transform2-inv", count)
    nbits = count.bit_length() - 1

    # pretend we LDed data in bit-reversed *and* half-swapped order
    # TODO: merge these two
    picked = [vec[reverse_bits(idx, nbits)] for idx in range(count)]
    data = halfrev2(picked, True)

    class SVSHAPE:
        pass

    def make_shape(skip):
        # every schedule gets its own lims/invxyz list objects
        shape = SVSHAPE()
        shape.lims = [count, 3, 0]
        shape.submode2 = 0b011
        shape.mode = 0b11
        shape.skip = skip
        shape.offset = 0       # experiment with different offset, here
        shape.invxyz = [1, 0, 1]  # inversion if desired
        return shape

    # j schedule (skip=0b00) and j+halfstep schedule (skip=0b01)
    lo_sched = iterate_dct_outer_butterfly_indices(make_shape(0b00))
    hi_sched = iterate_dct_outer_butterfly_indices(make_shape(0b01))

    # walk both schedules in lock-step, getting new indices each time
    for lo_item, hi_item in zip(lo_sched, hi_sched):
        lo, lo_end = lo_item
        hi, hi_end = hi_item
        print ("itersum    jr", lo, hi,
               "end", bin(lo_end), bin(hi_end))
        data[lo] += data[hi]
        if lo_end == 0b111:  # all loops end
            break

    print("transform2-inv result", data)

    return data
242
243
class DCTTestCase(FHDLTestCase):
    """SVP64 DCT / inverse-DCT tests, run through the ISA simulator.

    Each test assembles a short SVP64 program, pre-loads FPRs (and in one
    case memory), runs it via run_tst_program and compares the resulting
    FPRs against a pure-python computation of the same values.
    """

    # ---------------------------------------------------------------
    # shared helpers
    # ---------------------------------------------------------------

    def _check_regs(self, sim, expected):
        """Assert GPRs 0..31 of sim equal expected[0..31] as 64-bit values."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    @staticmethod
    def _dct_load_order(vec):
        """Return vec half-swapped (halfrev2, False) then bit-reversed.

        This is the order in which the DCT tests place their input data
        in the register file.
        """
        n = len(vec)
        levels = n.bit_length() - 1
        swapped = halfrev2(vec, False)
        return [swapped[reverse_bits(i, levels)] for i in range(n)]

    @staticmethod
    def _cos_table(n, repeat_per_layer):
        """Build the list of 2*cos((ci+0.5)*pi/size) DCT coefficients.

        size runs n, n/2, ... 2.  With repeat_per_layer set, each layer's
        coefficients are repeated n//size times (for n=8: 4, 2+2, 1+1+1+1
        entries); otherwise each layer appears once (4, 2, 1 entries).
        """
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            repeats = n // size if repeat_per_layer else 1
            for _ in range(repeats):
                for ci in range(halfsize):
                    ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
            size //= 2
        return ctable

    @staticmethod
    def _dump_svshapes(sim):
        """Print the four SVSHAPE SPRs (and SVSHAPE0's dimensions)."""
        print ("spr svshape0", sim.spr['SVSHAPE0'])
        print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print ("spr svshape1", sim.spr['SVSHAPE1'])
        print ("spr svshape2", sim.spr['SVSHAPE2'])
        print ("spr svshape3", sim.spr['SVSHAPE3'])

    def _check_fprs_approx(self, sim, res, tolerance):
        """Assert FPR i is within a relative tolerance of res[i], for all i.

        Each expected value is first rounded to Power single precision.
        The comparison is approximate because the hardware sequence and
        the python model round differently (e.g. FMAC against
        FMUL-plus-FADD-or-FSUB).
        """
        for i, expected in enumerate(res):
            print ("i", i, float(sim.fpr(i)), "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i))
            if expected == 0.0:
                # relative error is undefined: fall back to absolute error
                err = abs(actual)
            else:
                err = abs((actual - expected) / expected)
            print ("err", i, err)
            self.assertLess(err, tolerance)

    def _dump_and_check_cos_table(self, sim, ctable):
        """Dump intermediate FPR vectors, then check FPRs 80+ == 1/ctable.

        The vectors at FPR 24, 48 and 0 are only printed; the vector at
        FPR 80 is compared (relative error < 1e-6) against 1.0/ctable[i].
        """
        print ("MEM")
        sim.mem.dump()
        for title, base in (("ci FP", 24), ("size FP", 48), ("temps", 0)):
            print (title)
            for i in range(len(ctable)):
                actual = float(sim.fpr(i+base))
                print ("i", i, actual)
        for i, coeff in enumerate(ctable):
            expected = 1.0/coeff
            actual = float(sim.fpr(i+80))
            err = abs((actual - expected) / expected)
            print ("i", i, actual, "1/expect", 1/expected,
                   "expected", expected,
                   "err", err)
            self.assertLess(err, 1e-6)

    # ---------------------------------------------------------------
    # tests
    # ---------------------------------------------------------------

    def test_sv_ffadds_dct(self):
        """>>> lst = ["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                        ]
            four in-place vector adds, four in-place vector mul-subs

            SVP64 "DCT" mode will *automatically* offset FRB and an implicit
            FRS to perform the two multiplies.  one add, one subtract.

            sv.fdmadds FRT, FRA, FRC, FRB actually does:
                fadds  FRT   , FRB, FRA
                fsubs  FRT+vl, FRA, FRB+vl

            NOTE(review): the assertions below expect the multiplied
            difference in FPR i and the sum in FPR i+4, which reads as the
            opposite of the pseudo-code above - confirm which is intended.
        """
        lst = list(SVP64Asm(["sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                             ]))

        # cheat here with these values, they're selected so that
        # rounding errors do not occur. sigh.
        fprs = [0] * 32
        av = [7.0, -0.8, 2.0, -2.3]    # first half of array 0..3
        bv = [-2.0, 2.0, -0.8, 1.4]    # second half of array 4..7
        cv = [-1.0, 0.5, 2.5, -0.25]   # coefficients
        res = []
        # work out the results with the twin add-sub
        for i, (a, b, c) in enumerate(zip(av, bv, cv)):
            fprs[i+0] = fp64toselectable(a)
            fprs[i+4] = fp64toselectable(b)
            fprs[i+8] = fp64toselectable(c)
            # this isn't quite a perfect replication of the
            # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
            # and FPSUB32 directly to be honest.
            t = a + b
            diff = (a - b)
            diff = DOUBLE2SINGLE(fp64toselectable(diff))  # FP32 round
            diff = float(diff)
            u = diff * c
            tc = DOUBLE2SINGLE(fp64toselectable(t))  # convert to Power single
            uc = DOUBLE2SINGLE(fp64toselectable(u))  # from double
            res.append((uc, tc))
            print ("DCT", i, "in", a, b, "c", c, "res", t, u)

        # SVSTATE (in this case, VL=4)
        svstate = SVP64State()
        svstate.vl = 4     # VL
        svstate.maxvl = 4  # MAXVL
        print ("SVSTATE", bin(svstate.asint()))

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            # show the register contents next to the expected results
            for i, (mulsub, total) in enumerate(res):
                a = float(sim.fpr(i+0))
                b = float(sim.fpr(i+4))
                print ("DCT", i, "in", a, b, "res",
                       float(mulsub), float(total))
            # confirm that the results are as expected
            for i, (mulsub, total) in enumerate(res):
                self.assertEqual(sim.fpr(i+0), mulsub)
                self.assertEqual(sim.fpr(i+4), total)

    def test_sv_remap_fpmadds_dct_inner_4(self):
        """>>> lst = ["svshape 4, 1, 1, 2, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                    ]
            runs a full in-place 4-long O(N log2 N) inner butterfly schedule
            for DCT

            SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
            (3 inputs, 2 outputs)

            Note that the coefficient (FRC) is not on a "schedule", it
            is straight Vectorised (0123...) because DCT coefficients
            cannot be shared between butterfly layers (due to +0.5)
        """
        lst = list(SVP64Asm(["svshape 4, 1, 1, 2, 0",
                             "svremap 27, 1, 0, 2, 0, 1, 0",
                             "sv.fdmadds 0.v, 0.v, 0.v, 8.v"
                             ]))

        # array and coefficients to test
        coe = [-0.25, 0.5, 3.1, 6.2]    # 4 coefficients
        avi = [7.0, -0.8, 2.0, -2.3]
        av = self._dct_load_order(avi)

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(coe):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # work out the results with the twin mul/add-sub
            res = transform_inner_radix2_dct(avi, coe)

            self._check_fprs_approx(sim, res, 1e-6)

    def test_sv_remap_fpmadds_idct_outer_8(self):
        """>>> lst = ["svshape 8, 1, 1, 11, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                    ]
            runs a full in-place 8-long O(N log2 N) outer butterfly schedule
            for inverse-DCT, does the iterative overlapped ADDs

            SVP64 "REMAP" in Butterfly Mode.
        """
        lst = list(SVP64Asm(["svshape 8, 1, 1, 11, 0",
                             "svremap 27, 1, 0, 2, 0, 1, 0",
                             "sv.fadds 0.v, 0.v, 0.v"
                             ]))

        # array to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # inverse-DCT load order: bit-reversed first, then half-swapped
        n = len(avi)
        levels = n.bit_length() - 1
        av = [avi[reverse_bits(i, levels)] for i in range(n)]
        av = halfrev2(av, True)

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # outer iterative sum
            res = transform_outer_radix2_idct(avi)

            self._check_fprs_approx(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_outer_8(self):
        """>>> lst = ["svshape 8, 1, 1, 3, 0",
                     "svremap 27, 1, 0, 2, 0, 1, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                    ]
            runs a full in-place 8-long O(N log2 N) outer butterfly schedule
            for DCT, does the iterative overlapped ADDs

            SVP64 "REMAP" in Butterfly Mode.
        """
        lst = list(SVP64Asm(["svshape 8, 1, 1, 3, 0",
                             "svremap 27, 1, 0, 2, 0, 1, 0",
                             "sv.fadds 0.v, 0.v, 0.v"
                             ]))

        # array to test
        av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # outer iterative sum
            res = transform_outer_radix2_dct(av)

            self._check_fprs_approx(sim, res, 1e-6)

    def test_sv_remap_fpmadds_dct_8(self):
        """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
                     "svshape 8, 1, 1, 2, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                     "svshape 8, 1, 1, 3, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                    ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules.
        """
        lst = list(SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
                             "svshape 8, 1, 1, 2, 0",
                             "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                             "svshape 8, 1, 1, 3, 0",
                             "sv.fadds 0.v, 0.v, 0.v"
                             ]))

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
        av = self._dct_load_order(avi)
        # full-length table: coefficients repeated within each layer
        ctable = self._cos_table(len(avi), True)

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # reference result: full DCT
            res = transform2(avi)

            self._check_fprs_approx(sim, res, 1e-5)

    def test_sv_remap_dct_cos_precompute_8(self):
        """pre-computes a DCT COS table, deliberately using a lot of
        registers so as to be able to see what is going on (dumping all
        regs after the run).

        the simpler (scalar) version is in test_caller_transcendentals.py
        (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
        need the new version of fcfids which doesn't spam memory with
        LD/STs.
        """
        # data flow: GPR 4.v -> mem -> FPR 64.v -> 48.v (the divisor, "size")
        #            GPR 16.v -> mem -> FPR 12.v -> 24.v (gets 0.5 added, "ci")
        lst = list(SVP64Asm(["svshape 8, 1, 1, 2, 0",
                             "svremap 0, 0, 0, 2, 0, 1, 1",
                             "sv.svstep 4.v, 4, 1",    # svstep vector -> size
                             "sv.svstep 16.v, 3, 1",   # svstep vector -> ci
                             "addi 1, 0, 0x0000",
                             "setvl 0, 0, 12, 0, 1, 1",
                             "sv.std 4.v, 0(1)",
                             "sv.lfd 64.v, 0(1)",
                             "sv.fcfids 48.v, 64.v",
                             "addi 1, 0, 0x0060",
                             "sv.std 16.v, 0(1)",
                             "sv.lfd 12.v, 0(1)",
                             "sv.fcfids 24.v, 12.v",
                             "sv.fadds 0.v, 24.v, 43",     # plus 0.5
                             "sv.fmuls 0.v, 0.v, 41",      # times PI
                             "sv.fdivs 0.v, 0.v, 48.v",    # div size
                             "sv.fcoss 80.v, 0.v",
                             "sv.fdivs 80.v, 43, 80.v",    # div 0.5 / x
                             ]))

        gprs = [0] * 32
        fprs = [0] * 128
        # constants
        fprs[43] = fp64toselectable(0.5)      # 0.5
        fprs[41] = fp64toselectable(math.pi)  # pi
        fprs[44] = fp64toselectable(2.0)      # 2.0

        # full-length table (12 entries for n=8)
        ctable = self._cos_table(8, True)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
            self._dump_and_check_cos_table(sim, ctable)

    def test_sv_remap_dct_cos_precompute_inner_8(self):
        """pre-computes a DCT COS table, using the shorter costable
        indices schedule.  turns out, some COS values are repeated
        in each layer of the DCT butterfly.

        the simpler (scalar) version is in test_caller_transcendentals.py
        (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
        need the new version of fcfids which doesn't spam memory with
        LD/STs.
        """
        # data flow: GPR 4.v -> mem -> FPR 64.v -> 48.v (the divisor, "size")
        #            GPR 16.v -> mem -> FPR 12.v -> 24.v (gets 0.5 added, "ci")
        lst = list(SVP64Asm(["svshape 8, 1, 1, 5, 0",
                             "svremap 0, 0, 0, 2, 0, 1, 1",
                             "sv.svstep 4.v, 3, 1",    # svstep vector -> size
                             "sv.svstep 16.v, 2, 1",   # svstep vector -> ci
                             "addi 1, 0, 0x0000",
                             "setvl 0, 0, 7, 0, 1, 1",
                             "sv.std 4.v, 0(1)",
                             "sv.lfd 64.v, 0(1)",
                             "sv.fcfids 48.v, 64.v",
                             "addi 1, 0, 0x0060",
                             "sv.std 16.v, 0(1)",
                             "sv.lfd 12.v, 0(1)",
                             "sv.fcfids 24.v, 12.v",
                             "sv.fadds 0.v, 24.v, 43",     # plus 0.5
                             "sv.fmuls 0.v, 0.v, 41",      # times PI
                             "sv.fdivs 0.v, 0.v, 48.v",    # div size
                             "sv.fcoss 80.v, 0.v",
                             "sv.fdivs 80.v, 43, 80.v",    # div 0.5 / x
                             ]))

        gprs = [0] * 32
        fprs = [0] * 128
        # constants
        fprs[43] = fp64toselectable(0.5)      # 0.5
        fprs[41] = fp64toselectable(math.pi)  # pi
        fprs[44] = fp64toselectable(2.0)      # 2.0

        n = 8

        # shortened table (each layer once), built inline so that every
        # coefficient can be printed together with its ci and size
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            for ci in range(halfsize):
                coeff = math.cos((ci + 0.5) * math.pi / size) * 2.0
                ctable.append(coeff)
                print ("coeff", "ci", ci, "size", size,
                       "i/n", (ci+0.5), 1.0/coeff)
            size //= 2

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
            self._dump_and_check_cos_table(sim, ctable)

    def test_sv_remap_fpmadds_dct_8_mode_4(self):
        """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
                     "svshape 8, 1, 1, 4, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                     "svshape 8, 1, 1, 3, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                    ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules.
            uses shorter tables: FRC also needs to be on a Schedule
        """
        lst = list(SVP64Asm(["svremap 31, 1, 0, 2, 0, 1, 1",
                             "svshape 8, 1, 1, 4, 0",
                             "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                             "svshape 8, 1, 1, 3, 0",
                             "sv.fadds 0.v, 0.v, 0.v"
                             ]))

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
        av = self._dct_load_order(avi)
        # shortened table: each layer's coefficients appear once
        ctable = self._cos_table(len(avi), False)

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i+0] = fp64toselectable(a)
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # reference result: full DCT
            res = transform2(avi)

            self._check_fprs_approx(sim, res, 1e-5)

    def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self):
        """>>> lst = ["addi 1, 0, 0x000",
                     # LOAD bit-reversed with half-swap
                     "svshape 8, 1, 1, 6, 0",
                     "svremap 1, 0, 0, 0, 0, 0, 0, 1",
                     "sv.lfsbr 0.v, 4(1), 2",
                     # Inner butterfly, twin +/- MUL-ADD-SUB
                     "svremap 31, 1, 0, 2, 0, 1, 1",
                     "svshape 8, 1, 1, 4, 0",
                     "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                     # Outer butterfly, iterative sum
                     "svshape 8, 1, 1, 3, 0",
                     "sv.fadds 0.v, 0.v, 0.v"
                    ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules, and using
            bit-reversed half-swapped LDs.
            uses shorter pre-loaded COS tables: FRC also needs to be on a
            Schedule
        """
        lst = list(SVP64Asm(["addi 1, 0, 0x000",
                             "svshape 8, 1, 1, 6, 0",
                             "svremap 1, 0, 0, 0, 0, 0, 0, 1",
                             "sv.lfsbr 0.v, 4(1), 2",
                             "svremap 31, 1, 0, 2, 0, 1, 1",
                             "svshape 8, 1, 1, 4, 0",
                             "sv.fdmadds 0.v, 0.v, 0.v, 8.v",
                             "svshape 8, 1, 1, 3, 0",
                             "sv.fadds 0.v, 0.v, 0.v"
                             ]))

        # array to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
        # LD will bring them in, in the correct order.
        mem = {}
        val = 0
        for i, a in enumerate(avi):
            a = SINGLE(fp64toselectable(a)).value
            if i % 2 == 0:
                val = a  # accumulate for next iteration
            else:
                # even and odd 4-byte in same 8
                mem[(i//2)*8] = val | (a << 32)

        # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
        ctable = self._cos_table(len(avi), False)

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs,
                                       initial_mem=mem)
            self._dump_svshapes(sim)

            # reference result: full DCT
            res = transform2(avi)

            self._check_fprs_approx(sim, res, 1e-5)

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None,
                        initial_mem=None,
                        initial_fprs=None):
        """Run prog through run_tst, dump GPRs and FPRs, return the simulator.

        initial_regs defaults to 32 zeroed GPRs; svstate, initial_mem and
        initial_fprs are handed to run_tst unchanged.
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        simulator = run_tst(prog, initial_regs, mem=initial_mem,
                            initial_fprs=initial_fprs,
                            svstate=svstate)

        print ("GPRs")
        simulator.gpr.dump()
        print ("FPRs")
        simulator.fpr.dump()

        return simulator
881
882
# run every test case in this module when executed as a script
if __name__ == "__main__":
    unittest.main()