pysvp64db: fix traversal
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 import math
2 import unittest
3
4 from nmutil.formaltest import FHDLTestCase
5 from openpower.decoder.helpers import SINGLE, fp64toselectable
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.isa.remap_dct_yield import (
8 halfrev2, inverse_transform2, iterate_dct_inner_butterfly_indices,
9 iterate_dct_outer_butterfly_indices, reverse_bits, transform2)
10 from openpower.decoder.isa.test_caller import run_tst
11 from openpower.decoder.isafunctions.double2single import (
12 ISACallerFnHelper_double2single)
13 from openpower.decoder.selectable_int import SelectableInt
14 from openpower.simulator.program import Program
15 from openpower.insndb.asm import SVP64Asm
16
# HACK: DOUBLE2SINGLE is auto-generated from pseudo-code and is only
# reachable as a method of this helper class, so one instance is created at
# import time and its namespace is filled in by hand.
fph = ISACallerFnHelper_double2single(XLEN=64, FPSCR=None)
fph.namespace = dict(
    FPSCR=fph.FPSCR,
    NIA=None,
    XLEN=fph.XLEN,
    CIA=None,
    SVSTATE=None,
)
26
27
def transform_inner_radix2_dct(vec, ctable):
    """Pure-python reference for the DCT inner butterfly.

    The input is reordered with ``halfrev2(vec, False)`` and then read
    through a bit-reversed index table, imitating data that was loaded
    half-swapped and bit-reversed.  Index pairs are taken in lock-step from
    two ``iterate_dct_inner_butterfly_indices`` schedules (``skip`` 0b00 and
    0b01); the k-th pair ``(jl, jh)`` is combined with ``ctable[k]``:

        vec[jl] = t1 + t2
        vec[jh] = (t1 - t2) * (1.0 / ctable[k])

    The butterfly updates are applied to a freshly built list, which is
    returned.  Progress is printed for every pair.
    """
    # Initialization
    size = len(vec)
    print()
    print("transform2", size)
    nbits = size.bit_length() - 1

    # table used to read the in-place data in *reverse-bit-order*
    bitrev = [reverse_bits(idx, nbits) for idx in range(size)]

    # pretend the data was LDed in half-swapped *and* bit-reversed order
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    vec = [swapped[src] for src in bitrev]

    ################
    # INNER butterfly
    ################

    # minimal stand-in for an SVSHAPE: just a bag of attributes
    class SVSHAPE:
        pass

    def make_schedule(skip):
        # the two schedules differ only in their "skip" field
        shape = SVSHAPE()
        shape.lims = [size, 2, 1]  # xdim, (ydim slot), zdim
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0  # experiment with different offset, here
        shape.invxyz = [1, 0, 0]  # inversion if desired
        return shape

    lower = iterate_dct_inner_butterfly_indices(make_schedule(0b00))  # j
    upper = iterate_dct_inner_butterfly_indices(make_schedule(0b01))  # j+half

    # walk both schedules together, one coefficient per index pair
    for k, ((lo, lo_end), (hi, hi_end)) in enumerate(zip(lower, upper)):
        t1, t2 = vec[lo], vec[hi]
        coeff = ctable[k]
        vec[lo] = t1 + t2
        vec[hi] = (t1 - t2) * (1.0/coeff)
        print("coeff", "ci", k,
              "jl", lo, "jh", hi,
              "i/n", (k+0.5), 1.0/coeff,
              "t1, t2", t1, t2, "res", vec[lo], vec[hi],
              "end", bin(lo_end), bin(hi_end))
        if lo_end == 0b111:  # all loops end
            break

    return vec
89
90
def transform_outer_radix2_dct(vec):
    """Pure-python reference for the DCT outer butterfly (iterative sums).

    Index pairs are taken in lock-step from two
    ``iterate_dct_outer_butterfly_indices`` schedules (``skip`` 0b00 and
    0b01) and ``vec[jl] += vec[jh]`` is performed for each pair.

    NOTE: ``vec`` is updated in place; the same list is also returned.
    """
    # Initialization
    size = len(vec)
    print()
    print("transform2", size)

    # minimal stand-in for an SVSHAPE: just a bag of attributes
    class SVSHAPE:
        pass

    def make_schedule(skip):
        # the two schedules differ only in their "skip" field
        shape = SVSHAPE()
        shape.lims = [size, 3, 1]  # xdim, (ydim slot), zdim
        shape.submode2 = 0b100
        shape.mode = 0b01
        shape.skip = skip
        shape.offset = 0  # experiment with different offset, here
        shape.invxyz = [0, 0, 0]  # inversion if desired
        return shape

    # outer butterfly
    lower = iterate_dct_outer_butterfly_indices(make_schedule(0b00))  # j
    upper = iterate_dct_outer_butterfly_indices(make_schedule(0b01))  # j+half

    for (lo, lo_end), (hi, hi_end) in zip(lower, upper):
        print("itersum jr", lo, hi,
              "end", bin(lo_end), bin(hi_end))
        vec[lo] += vec[hi]
        if lo_end == 0b111:  # all loops end
            break

    print("transform2 result", vec)

    return vec
136
137
def transform_inner_radix2_idct(vec, ctable):
    """Pure-python reference for the inverse-DCT inner butterfly.

    The input is first reordered with ``halfrev2(vec, False)`` (imitating a
    half-swapped load).  Index pairs are then taken in lock-step from two
    ``iterate_dct_inner_butterfly_indices`` schedules (``skip`` 0b00 and
    0b01); the k-th pair ``(jl, jh)`` is combined with ``ctable[k]``:

        vec[jl] = t1 + t2 / ctable[k]
        vec[jh] = t1 - t2 / ctable[k]

    The updates are applied to the list returned by ``halfrev2``, which is
    returned.  Progress is printed for every pair.
    """
    # Initialization
    size = len(vec)
    print()
    print("transform2", size)

    # pretend the data was LDed in half-swapped order
    vec = halfrev2(vec, False)

    ################
    # INNER butterfly
    ################

    # minimal stand-in for an SVSHAPE: just a bag of attributes
    class SVSHAPE:
        pass

    def make_schedule(skip):
        # the two schedules differ only in their "skip" field
        shape = SVSHAPE()
        shape.lims = [size, 0b000001, 1]
        shape.mode = 0b11
        shape.submode2 = 0b11
        shape.skip = skip
        shape.offset = 0  # experiment with different offset, here
        shape.invxyz = [0, 0, 0]  # inversion if desired
        return shape

    lower = iterate_dct_inner_butterfly_indices(make_schedule(0b00))  # j
    upper = iterate_dct_inner_butterfly_indices(make_schedule(0b01))  # j+half

    # walk both schedules together, one coefficient per index pair
    for k, ((lo, lo_end), (hi, hi_end)) in enumerate(zip(lower, upper)):
        t1, t2 = vec[lo], vec[hi]
        coeff = ctable[k]
        scaled = t2/coeff
        vec[lo] = t1 + scaled
        vec[hi] = t1 - scaled
        print("coeff", "ci", k,
              "jl", lo, "jh", hi,
              "i/n", (k+0.5), 1.0/coeff,
              "t1, t2", t1, t2, "res", vec[lo], vec[hi],
              "end", bin(lo_end), bin(hi_end))
        if lo_end == 0b111:  # all loops end
            break

    return vec
193
194
def transform_outer_radix2_idct(vec):
    """Pure-python reference for the inverse-DCT outer butterfly.

    The input is read through a bit-reversed index table and then passed
    through ``halfrev2(..., True)``.  Index pairs are taken in lock-step
    from two ``iterate_dct_outer_butterfly_indices`` schedules (``skip``
    0b00 and 0b01) and ``vec[jh] += vec[jl]`` is performed for each pair.

    The sums are applied to the reordered list, which is returned.
    """
    # Initialization
    size = len(vec)
    print()
    print("transform2-inv", size)
    nbits = size.bit_length() - 1

    # table used to read the in-place data in *reverse-bit-order*
    bitrev = [reverse_bits(idx, nbits) for idx in range(size)]

    # pretend the data was LDed in half-swapped *and* bit-reversed order
    # TODO: merge these two
    reordered = [vec[src] for src in bitrev]
    vec = halfrev2(reordered, True)

    # minimal stand-in for an SVSHAPE: just a bag of attributes
    class SVSHAPE:
        pass

    def make_schedule(skip):
        # the two schedules differ only in their "skip" field
        shape = SVSHAPE()
        shape.lims = [size, 2, 1]  # xdim, (ydim slot), zdim
        shape.submode2 = 0b011
        shape.mode = 0b11
        shape.skip = skip
        shape.offset = 0  # experiment with different offset, here
        shape.invxyz = [1, 0, 1]  # inversion if desired
        return shape

    # outer butterfly
    lower = iterate_dct_outer_butterfly_indices(make_schedule(0b00))  # j
    upper = iterate_dct_outer_butterfly_indices(make_schedule(0b01))  # j+half

    for (lo, lo_end), (hi, hi_end) in zip(lower, upper):
        print("itersum jr", lo, hi,
              "end", bin(lo_end), bin(hi_end))
        vec[hi] += vec[lo]
        if lo_end == 0b111:  # all loops end
            break

    print("transform2-inv result", vec)

    return vec
249
250
class DCTTestCase(FHDLTestCase):
    """Simulator tests for SVP64 DCT / inverse-DCT instruction sequences.

    Each test assembles a short SVP64 program, runs it with ``run_tst`` and
    compares the resulting FPRs against a pure-python reference result.
    """

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------

    def _check_regs(self, sim, expected):
        """Assert GPRs 0..31 of ``sim`` equal ``expected`` (64-bit wide)."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    @staticmethod
    def _dump_svshapes(sim):
        """Print SVSHAPE0..3 of ``sim``, plus SVSHAPE0's dimension sizes."""
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

    @staticmethod
    def _relative_error(actual, expected):
        """Return ``|actual - expected| / |expected|``.

        When ``expected`` is exactly zero the relative error is undefined,
        so the absolute error ``|actual|`` is returned instead of raising
        ZeroDivisionError.
        """
        if expected == 0.0:
            return abs(actual)
        return abs((actual - expected) / expected)

    def _assert_fprs_match(self, sim, res, stride, tolerance):
        """Compare FPRs ``0, stride, 2*stride, ...`` of ``sim`` with ``res``.

        Every reference value is first converted to Power single precision.
        The comparison is an approximate (relative-error) one: we are
        comparing FMAC against FMUL-plus-FADD-or-FSUB and the rounding is
        different, so exact equality cannot be expected.
        """
        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
            expected = float(expected)
            actual = float(sim.fpr(i*stride))
            err = self._relative_error(actual, expected)
            print("err", i, err)
            self.assertLess(err, tolerance)

    @staticmethod
    def _pack_fp32_pairs(values):
        """Pack ``values`` as FP32s, two per 8-byte memory word.

        Returns a dict keyed by byte address (multiples of 8).  The low
        32 bits of each word hold the even-indexed element and the high
        32 bits the odd-indexed element that follows it.  A trailing
        unpaired element would not be stored; the tests here always pass
        an even number of values.
        """
        mem = {}
        low = 0
        for i, value in enumerate(values):
            bits = SINGLE(fp64toselectable(value)).value
            if i % 2 == 0:
                low = bits  # hold until the odd partner arrives
            else:
                # even and odd 4-byte in same 8
                mem[(i//2)*8] = low | (bits << 32)
        return mem

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_sv_ffadds_dct(self):
        """>>> lst = ["sv.fdmadds *0, *8, *0"
                     ]
            four in-place vector adds, four in-place vector mul-subs

            With VL=4, a in FPR0-3, b in FPR4-7 and coefficients c in
            FPR8-11, the test expects afterwards, for each element i:
                FPR[i]   == (a[i] - b[i]) * c[i]    (as Power single)
                FPR[i+4] == a[i] + b[i]             (as Power single)
        """
        lst = SVP64Asm(["sv.fdmadds *0, *8, *0"
                        ])
        lst = list(lst)

        # cheat here with these values, they're selected so that
        # rounding errors do not occur. sigh.
        fprs = [0] * 32
        av = [7.0, -0.8, 2.0, -2.3]  # first half of array 0..3
        bv = [-2.0, 2.0, -0.8, 1.4]  # second half of array 4..7
        cv = [-1.0, 0.5, 2.5, -0.25]  # coefficients
        res = []
        # work out the results with the twin add-sub
        for i, (a, b, c) in enumerate(zip(av, bv, cv)):
            fprs[i+0] = fp64toselectable(a)
            fprs[i+4] = fp64toselectable(b)
            fprs[i+8] = fp64toselectable(c)
            # this isn't quite a perfect replication of the
            # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
            # and FPSUB32 directly to be honest.
            t = a + b
            diff = (a - b)
            diff = fph.DOUBLE2SINGLE(fp64toselectable(diff))  # FP32 round
            diff = float(diff)
            u = diff * c
            tc = fph.DOUBLE2SINGLE(fp64toselectable(t))  # cvt to Power single
            uc = fph.DOUBLE2SINGLE(fp64toselectable(u))  # from double
            # stored as (mul-sub result, sum): FPR[i] and FPR[i+4]
            res.append((uc, tc))
            print("DCT", i, "in", a, b, "c", c, "res", t, u)

        # SVSTATE (in this case, VL=4)
        svstate = SVP64State()
        svstate.vl = 4  # VL
        svstate.maxvl = 4  # MAXVL
        print("SVSTATE", bin(svstate.asint()))

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            # confirm that the results are as expected
            for i, (mulsub, total) in enumerate(res):
                a = float(sim.fpr(i+0))
                b = float(sim.fpr(i+4))
                print("DCT", i, "in", a, b, "res",
                      float(mulsub), float(total))
            for i, (mulsub, total) in enumerate(res):
                self.assertEqual(sim.fpr(i+0), mulsub)
                self.assertEqual(sim.fpr(i+4), total)

    def test_sv_remap_fpmadds_idct_outer_8(self, stride=2):
        """>>> lst = ["svshape 8, 1, 1, 11, 0",
                      "svremap 27, 0, 1, 2, 1, 0, 0",
                      "sv.fadds *0, *0, *0"
                     ]
            runs a full in-place 8-long O(N log2 N) outer butterfly schedule
            for inverse-DCT, does the iterative overlapped ADDs

            SVP64 "REMAP" in Butterfly Mode.  The third svshape operand is
            filled in from ``stride``, which also spaces the FPRs used.
        """
        lst = SVP64Asm(["svshape 8, 1, %d, 11, 0" % stride,  # outer butterfly
                        "svremap 27, 0, 1, 2, 1, 0, 0",
                        "sv.fadds *0, *0, *0"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # the registers get the data bit-reversed then half-swapped
        n = len(avi)
        levels = n.bit_length() - 1
        ri = list(range(n))
        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
        av = [avi[ri[i]] for i in range(n)]
        av = halfrev2(av, True)

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i*stride+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # outer iterative sum
            res = transform_outer_radix2_idct(avi)

            self._assert_fprs_match(sim, res, stride, 1e-6)

    def test_sv_remap_fpmadds_dct_outer_8(self, stride=2):
        """>>> lst = ["svshape 8, 1, 1, 3, 0",
                      "svremap 27, 1, 0, 2, 0, 1, 0",
                      "sv.fadds *0, *0, *0"
                     ]
            runs a full in-place 8-long O(N log2 N) outer butterfly schedule
            for DCT, does the iterative overlapped ADDs

            SVP64 "REMAP" in Butterfly Mode.  The third svshape operand is
            filled in from ``stride``, which also spaces the FPRs used.
        """
        lst = SVP64Asm(["svshape 8, 1, %d, 3, 0" % stride,
                        "svremap 27, 1, 0, 2, 0, 1, 0",
                        "sv.fadds *0, *0, *0"
                        ])
        lst = list(lst)

        # array and coefficients to test
        av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i*stride+0] = fp64toselectable(a)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # outer iterative sum (updates av in place; the register
            # image was already taken from it above)
            res = transform_outer_radix2_dct(av)

            self._assert_fprs_match(sim, res, stride, 1e-6)

    def test_sv_remap_dct_cos_precompute_inner_8(self):
        """pre-computes a DCT COS table, using the shorter costable
        indices schedule.  turns out, some COS values are repeated
        in each layer of the DCT butterfly.

        the simpler (scalar) version is in test_caller_transcendentals.py
        (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
        need the new version of fcfids which doesn't spam memory with
        LD/STs.
        """
        lst = SVP64Asm(["svshape 8, 1, 1, 5, 0",
                        "svremap 0, 0, 0, 2, 0, 1, 1",
                        "sv.svstep *4, 0, 3, 1",  # svstep get vector of ci
                        "sv.svstep *16, 0, 2, 1",  # svstep get vector of step
                        "addi 1, 0, 0x0000",
                        "setvl 0, 0, 7, 0, 1, 1",
                        "sv.std *4, 0(1)",
                        "sv.lfd *64, 0(1)",
                        "sv.fcfids *48, *64",
                        "addi 1, 0, 0x0060",
                        "sv.std *16, 0(1)",
                        "sv.lfd *12, 0(1)",
                        "sv.fcfids *24, *12",
                        "sv.fadds *0, *24, 43",  # plus 0.5
                        "sv.fmuls *0, *0, 41",  # times PI
                        "sv.fdivs *0, *0, *48",  # div size
                        "sv.fcoss *80, *0",
                        "sv.fdivs *80, 43, *80",  # div 0.5 / x
                        ])
        lst = list(lst)

        gprs = [0] * 32
        fprs = [0] * 128
        # constants
        fprs[43] = fp64toselectable(0.5)  # 0.5
        fprs[41] = fp64toselectable(math.pi)  # pi
        fprs[44] = fp64toselectable(2.0)  # 2.0

        n = 8

        # reference table: sizes n, n/2, ... 2, half of each size per layer
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            for ci in range(halfsize):
                coeff = math.cos((ci + 0.5) * math.pi / size) * 2.0
                ctable.append(coeff)
                print("coeff", "ci", ci, "size", size,
                      "i/n", (ci+0.5), 1.0/coeff)
            size //= 2

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
            print("MEM")
            sim.mem.dump()
            # show the intermediate vectors, one register bank at a time
            for label, base in (("ci FP", 24), ("size FP", 48), ("temps", 0)):
                print(label)
                for i in range(len(ctable)):
                    actual = float(sim.fpr(i+base))
                    print("i", i, actual)
            for i, coeff in enumerate(ctable):
                expected = 1.0/coeff
                actual = float(sim.fpr(i+80))
                err = self._relative_error(actual, expected)
                print("i", i, actual, "1/expect", 1/expected,
                      "expected", expected,
                      "err", err)
                self.assertLess(err, 1e-6)

    def test_sv_remap_fpmadds_dct_8_mode_4(self, stride=2):
        """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
                      "svshape 8, 1, 1, 4, 0",
                      "sv.fdmadds *0, *16, *0",
                      "svshape 8, 1, 1, 3, 0",
                      "sv.fadds *0, *0, *0"
                     ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules.
            uses shorter tables: FRC also needs to be on a Schedule

            The third svshape operand is filled in from ``stride``, which
            also spaces the data FPRs used.
        """
        lst = SVP64Asm(["svremap 31, 1, 0, 2, 0, 1, 1",
                        "svshape 8, 1, %d, 4, 0" % stride,
                        "sv.fdmadds *0, *16, *0",
                        "svshape 8, 1, %d, 3, 0" % stride,
                        "sv.fadds *0, *0, *0"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
        n = len(avi)
        levels = n.bit_length() - 1
        ri = list(range(n))
        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
        av = halfrev2(avi, False)
        av = [av[ri[i]] for i in range(n)]
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            for ci in range(halfsize):
                ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
            size //= 2

        # store in regfile
        fprs = [0] * 32
        for i, a in enumerate(av):
            fprs[i*stride+0] = fp64toselectable(a)
        for i, c in enumerate(ctable):
            fprs[i+16] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs)
            self._dump_svshapes(sim)

            # reference result for the whole transform
            res = transform2(avi)

            self._assert_fprs_match(sim, res, stride, 1e-5)

    def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self, stride=1):
        """>>> lst = [# LOAD bit-reversed with half-swap
                      "svshape 8, 1, 1, 6, 0",
                      "svremap 1, 0, 0, 0, 0, 0, 0",
                      "sv.lfs/els *0, 4(1)",
                      # Inner butterfly, twin +/- MUL-ADD-SUB
                      "svremap 31, 1, 0, 2, 0, 1, 1",
                      "svshape 8, 1, 1, 4, 0",
                      "sv.fdmadds *0, *32, *0",
                      # Outer butterfly, iterative sum
                      "svshape 8, 1, 1, 3, 0",
                      "sv.fadds *0, *0, *0"
                     ]
            runs a full in-place 8-long O(N log2 N) DCT, both
            inner and outer butterfly "REMAP" schedules, and using
            bit-reversed half-swapped LDs.
            uses shorter pre-loaded COS tables: FRC also needs to be on a
            Schedule
        """
        lst = SVP64Asm(["addi 1, 0, 0x000",
                        "svshape 8, 1, %d, 6, 0" % stride,
                        "svremap 1, 0, 0, 0, 0, 0, 0",
                        "sv.lfs/els *0, 4(1)",
                        "svremap 31, 1, 0, 2, 0, 1, 1",
                        "svshape 8, 1, %d, 4, 0" % stride,
                        "sv.fdmadds *0, *32, *0",
                        "svshape 8, 1, %d, 3, 0" % stride,
                        "sv.fadds *0, *0, *0"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
        # LD will bring them in, in the correct order.
        mem = self._pack_fp32_pairs(avi)

        # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
        n = len(avi)
        ctable = []
        size = n
        while size >= 2:
            halfsize = size // 2
            for ci in range(halfsize):
                ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
            size //= 2

        # store in regfile
        fprs = [0] * 64
        for i, c in enumerate(ctable):
            fprs[i+32] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs,
                                       initial_mem=mem)
            self._dump_svshapes(sim)

            # reference result for the whole transform
            res = transform2(avi)

            self._assert_fprs_match(sim, res, stride, 1e-5)

    def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self):
        """>>> lst = [# LOAD bit-reversed with half-swap
                      "svshape 8, 1, 1, 14, 0",
                      "svremap 1, 0, 0, 0, 0, 0, 0",
                      "sv.lfs/els *0, 4(1)",
                      # Outer butterfly, iterative sum
                      "svremap 31, 0, 1, 2, 1, 0, 1",
                      "svshape 8, 1, 1, 11, 0",
                      "sv.fadds *0, *0, *0",
                      # Inner butterfly, twin +/- MUL-ADD-SUB
                      "svshape 8, 1, 1, 12, 0",
                      "sv.ffmadds *0, *8, *0"
                     ]
            runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
            inner and outer butterfly "REMAP" schedules, and using
            bit-reversed half-swapped LDs.
            uses shorter pre-loaded COS tables: FRC also needs to be on a
            Schedule in the sv.ffmadds instruction
        """
        lst = SVP64Asm(["addi 1, 0, 0x000",
                        "svshape 8, 1, 1, 14, 0",
                        "svremap 1, 0, 0, 0, 0, 0, 0",
                        "sv.lfs/els *0, 4(1)",
                        "svremap 31, 0, 1, 2, 1, 0, 1",
                        "svshape 8, 1, 1, 11, 0",
                        "sv.fadds *0, *0, *0",
                        "svshape 8, 1, 1, 12, 0",
                        "sv.ffmadds *0, *8, *0"
                        ])
        lst = list(lst)

        # array and coefficients to test
        avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

        # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
        # LD will bring them in, in the correct order.
        # the first element is divided by 2 before being stored; avi itself
        # is left untouched for the reference computation below.
        mem = self._pack_fp32_pairs([avi[0] / 2.0] + avi[1:])

        # calculate the (shortened) COS tables, 1 2 4 (sizes ascending)
        n = len(avi)
        ctable = []
        size = 2
        while size <= n:
            halfsize = size // 2
            for ci in range(halfsize):
                ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0)
            size *= 2

        # store in regfile
        fprs = [0] * 32
        for i, c in enumerate(ctable):
            fprs[i+8] = fp64toselectable(1.0 / c)  # invert

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, initial_fprs=fprs,
                                       initial_mem=mem)
            self._dump_svshapes(sim)

            # reference result for the whole inverse transform
            res = inverse_transform2(avi)

            # results are in consecutive FPRs (stride of 1)
            self._assert_fprs_match(sim, res, 1, 1e-5)

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None,
                        initial_mem=None,
                        initial_fprs=None):
        """Run ``prog`` through ``run_tst`` and return the simulator.

        ``initial_regs`` defaults to 32 zeroed GPRs.  The GPR and FPR files
        are dumped to stdout after the run.
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        simulator = run_tst(prog, initial_regs, mem=initial_mem,
                            initial_fprs=initial_fprs,
                            svstate=svstate)

        print("GPRs")
        simulator.gpr.dump()
        print("FPRs")
        simulator.fpr.dump()

        return simulator
776
777
# run every test in this module when the file is executed as a script
if __name__ == "__main__":
    unittest.main()