c9312ef4540c538ad0dd56e012fd3a6015fe288d
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_dct.py
1 from nmigen import Module, Signal
2 from nmigen.sim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 from openpower.decoder.power_decoder import (create_pdecode)
5 from openpower.simulator.program import Program
6 from openpower.decoder.isa.caller import SVP64State
7 from openpower.decoder.selectable_int import SelectableInt
8 from openpower.decoder.isa.test_caller import run_tst
9 from openpower.sv.trans.svp64 import SVP64Asm
10 from copy import deepcopy
11 from openpower.decoder.helpers import fp64toselectable, SINGLE
12 from openpower.decoder.isafunctions.double2single import ISACallerFnHelper
13 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
14 iterate_dct_inner_butterfly_indices,
15 iterate_dct_outer_butterfly_indices,
16 transform2, inverse_transform2)
17 from openpower.decoder.isa.fastdctlee import inverse_transform_iter
18 import unittest
19 import math
20
# HACK: the DOUBLE2SINGLE function is auto-generated from pseudo-code.
# A module-level helper instance (XLEN=64) is created here so that the
# code below can call it as fph.DOUBLE2SINGLE(...).
fph = ISACallerFnHelper(XLEN=64)
24
25
def transform_inner_radix2_dct(vec, ctable):
    """Python reference for the inner butterfly stage of the radix-2 DCT.

    The input is passed through halfrev2 and then permuted into
    bit-reversed order (standing in for the order in which a LD would
    have delivered the data).  Index pairs are then taken from two
    iterate_dct_inner_butterfly_indices schedules (differing only in
    "skip") and a sum / scaled-difference butterfly is applied to each
    pair.

    vec:     sequence of input values
    ctable:  coefficients, indexed by butterfly operation number; the
             difference is multiplied by the reciprocal of the coefficient
    returns: newly-built list holding the butterfly results
    """
    n = len(vec)
    print()
    print("transform2", n)
    levels = n.bit_length() - 1

    # bit-reversed permutation of the element indices
    bitrev = [reverse_bits(idx, levels) for idx in range(n)]

    # pretend the data was LDed half-swapped *and* bit-reversed
    # TODO: merge these two
    swapped = halfrev2(vec, False)
    vec = [swapped[src] for src in bitrev]

    # minimal stand-in for an SVSHAPE SPR: a bag of attributes
    class SVSHAPE:
        pass

    def make_shape(skip):
        shape = SVSHAPE()
        shape.lims = [n, 2, 1]      # xdim=n, zdim=1
        shape.mode = 0b01
        shape.submode2 = 0b01
        shape.skip = skip
        shape.offset = 0            # experiment with different offset, here
        shape.invxyz = [1, 0, 0]    # inversion if desired
        return shape

    shape_j = make_shape(0b00)      # j schedule
    shape_jh = make_shape(0b01)     # j+halfstep schedule

    # walk both schedules in lock-step, one butterfly per index pair
    lower = iterate_dct_inner_butterfly_indices(shape_j)
    upper = iterate_dct_inner_butterfly_indices(shape_jh)
    for k, (lo, hi) in enumerate(zip(lower, upper)):
        jl, jle = lo
        jh, jhe = hi
        t1, t2 = vec[jl], vec[jh]
        coeff = ctable[k]
        vec[jl] = t1 + t2
        vec[jh] = (t1 - t2) * (1.0/coeff)
        print("coeff", "ci", k,
              "jl", jl, "jh", jh,
              "i/n", (k+0.5), 1.0/coeff,
              "t1, t2", t1, t2, "res", vec[jl], vec[jh],
              "end", bin(jle), bin(jhe))
        if jle == 0b111:            # all loops end
            break

    return vec
87
88
def transform_outer_radix2_dct(vec):
    """Python reference for the outer butterfly stage of the radix-2 DCT.

    Index pairs are taken from two iterate_dct_outer_butterfly_indices
    schedules (differing only in "skip") and, for each pair, the element
    at the second index is added into the element at the first.

    vec:     list of values; it is updated in place
    returns: the same list object, after the iterative sums
    """
    n = len(vec)
    print()
    print("transform2", n)

    # minimal stand-in for an SVSHAPE SPR: a bag of attributes
    class SVSHAPE:
        pass

    def make_shape(skip):
        shape = SVSHAPE()
        shape.lims = [n, 3, 1]      # xdim=n, zdim=1
        shape.submode2 = 0b100
        shape.mode = 0b01
        shape.skip = skip
        shape.offset = 0            # experiment with different offset, here
        shape.invxyz = [0, 0, 0]    # inversion if desired
        return shape

    shape_j = make_shape(0b00)      # j schedule
    shape_jh = make_shape(0b01)     # j+halfstep schedule

    # walk both schedules in lock-step, accumulating as we go
    lower = iterate_dct_outer_butterfly_indices(shape_j)
    upper = iterate_dct_outer_butterfly_indices(shape_jh)
    for k, (lo, hi) in enumerate(zip(lower, upper)):
        jl, jle = lo
        jh, jhe = hi
        print("itersum jr", jl, jh,
              "end", bin(jle), bin(jhe))
        vec[jl] += vec[jh]
        if jle == 0b111:            # all loops end
            break

    print("transform2 result", vec)

    return vec
134
135
def transform_inner_radix2_idct(vec, ctable):
    """Python reference for the inner butterfly stage of the inverse DCT.

    The input is passed through halfrev2 (standing in for a half-swapped
    LD), then index pairs from two iterate_dct_inner_butterfly_indices
    schedules (differing only in "skip") drive a butterfly in which the
    second element is divided by the coefficient before being added to,
    and subtracted from, the first.

    vec:     sequence of input values
    ctable:  coefficients, indexed by butterfly operation number
    returns: the list produced by halfrev2, updated with the results
    """
    n = len(vec)
    print()
    print("transform2", n)

    # pretend we LDed data in half-swapped order
    vec = halfrev2(vec, False)

    # minimal stand-in for an SVSHAPE SPR: a bag of attributes
    class SVSHAPE:
        pass

    def make_shape(skip):
        shape = SVSHAPE()
        shape.lims = [n, 0b000001, 1]   # xdim=n
        shape.mode = 0b11
        shape.submode2 = 0b11
        shape.skip = skip
        shape.offset = 0            # experiment with different offset, here
        shape.invxyz = [0, 0, 0]    # inversion if desired
        return shape

    shape_j = make_shape(0b00)      # j schedule
    shape_jh = make_shape(0b01)     # j+halfstep schedule

    # walk both schedules in lock-step, one butterfly per index pair
    lower = iterate_dct_inner_butterfly_indices(shape_j)
    upper = iterate_dct_inner_butterfly_indices(shape_jh)
    for k, (lo, hi) in enumerate(zip(lower, upper)):
        jl, jle = lo
        jh, jhe = hi
        t1, t2 = vec[jl], vec[jh]
        coeff = ctable[k]
        vec[jl] = t1 + t2/coeff
        vec[jh] = t1 - t2/coeff
        print("coeff", "ci", k,
              "jl", jl, "jh", jh,
              "i/n", (k+0.5), 1.0/coeff,
              "t1, t2", t1, t2, "res", vec[jl], vec[jh],
              "end", bin(jle), bin(jhe))
        if jle == 0b111:            # all loops end
            break

    return vec
191
192
def transform_outer_radix2_idct(vec):
    """Python reference for the outer butterfly stage of the inverse DCT.

    The input is permuted into bit-reversed order and passed through
    halfrev2 (standing in for the order in which a LD would have
    delivered the data).  Index pairs from two
    iterate_dct_outer_butterfly_indices schedules (differing only in
    "skip") are then used to add the element at the first index into
    the element at the second.

    vec:     sequence of input values
    returns: the list produced by halfrev2, updated with the sums
    """
    n = len(vec)
    print()
    print("transform2-inv", n)
    levels = n.bit_length() - 1

    # bit-reversed permutation of the element indices
    bitrev = [reverse_bits(idx, levels) for idx in range(n)]

    # pretend the data was LDed bit-reversed *and* half-swapped
    # TODO: merge these two
    vec = halfrev2([vec[src] for src in bitrev], True)

    # minimal stand-in for an SVSHAPE SPR: a bag of attributes
    class SVSHAPE:
        pass

    def make_shape(skip):
        shape = SVSHAPE()
        shape.lims = [n, 2, 1]      # xdim=n, zdim=1
        shape.submode2 = 0b011
        shape.mode = 0b11
        shape.skip = skip
        shape.offset = 0            # experiment with different offset, here
        shape.invxyz = [1, 0, 1]    # inversion if desired
        return shape

    shape_j = make_shape(0b00)      # j schedule
    shape_jh = make_shape(0b01)     # j+halfstep schedule

    # walk both schedules in lock-step, accumulating as we go
    lower = iterate_dct_outer_butterfly_indices(shape_j)
    upper = iterate_dct_outer_butterfly_indices(shape_jh)
    for k, (lo, hi) in enumerate(zip(lower, upper)):
        jl, jle = lo
        jh, jhe = hi
        print("itersum jr", jl, jh,
              "end", bin(jle), bin(jhe))
        vec[jh] += vec[jl]
        if jle == 0b111:            # all loops end
            break

    print("transform2-inv result", vec)

    return vec
247
248
249 class DCTTestCase(FHDLTestCase):
250
def _check_regs(self, sim, expected):
    """Assert that GPRs 0..31 of sim equal the given values.

    sim:      simulator object providing gpr(regnum)
    expected: indexable holding (at least) 32 values, each of which is
              wrapped in a 64-bit SelectableInt before comparison
    """
    for regnum in range(32):
        actual = sim.gpr(regnum)
        wanted = SelectableInt(expected[regnum], 64)
        self.assertEqual(actual, wanted)
254
def test_sv_ffadds_dct(self):
    """>>> lst = ["sv.fdmadds *0, *0, *0, *8"
                 ]

    Runs one sv.fdmadds over four elements (VL=MAXVL=4) with no REMAP.
    FPRs 0-3 hold "a", FPRs 4-7 hold "b" and FPRs 8-11 hold the
    coefficients "c".

    After the run, FPR i is expected to equal the FP32-rounded value of
    (a-b)*c (with a-b itself rounded to FP32 first) and FPR i+4 the
    FP32-rounded value of a+b.
    """
    lst = list(SVP64Asm(["sv.fdmadds *0, *0, *0, *8"]))

    # cheat here with these values, they're selected so that
    # rounding errors do not occur. sigh.
    av = [7.0, -0.8, 2.0, -2.3]     # loaded into FPRs 0..3
    bv = [-2.0, 2.0, -0.8, 1.4]     # loaded into FPRs 4..7
    cv = [-1.0, 0.5, 2.5, -0.25]    # coefficients, FPRs 8..11

    fprs = [0] * 32
    res = []
    # load the register file and work out the expected results
    for i, (a, b, c) in enumerate(zip(av, bv, cv)):
        fprs[i] = fp64toselectable(a)
        fprs[i+4] = fp64toselectable(b)
        fprs[i+8] = fp64toselectable(c)
        # this isn't quite a perfect replication of the
        # FP32 mul-add-sub.  better really to use FPMUL32, FPADD32
        # and FPSUB32 directly to be honest.
        total = a + b
        # FP32-round the difference before multiplying
        diff = float(fph.DOUBLE2SINGLE(fp64toselectable(a - b)))
        scaled = diff * c
        # convert both results from double to Power single
        total_sp = fph.DOUBLE2SINGLE(fp64toselectable(total))
        scaled_sp = fph.DOUBLE2SINGLE(fp64toselectable(scaled))
        res.append((scaled_sp, total_sp))
        print("DCT", i, "in", a, b, "c", c, "res", total, scaled)

    # SVSTATE (in this case, VL=4)
    svstate = SVP64State()
    svstate.vl = 4      # VL
    svstate.maxvl = 4   # MAXVL
    print("SVSTATE", bin(svstate.asint()))

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, svstate=svstate,
                                   initial_fprs=fprs)
        # dump actual register contents next to the expected values
        for i, (first, second) in enumerate(res):
            a = float(sim.fpr(i+0))
            b = float(sim.fpr(i+4))
            print("DCT", i, "in", a, b, "res", float(first), float(second))
        # confirm that the results are as expected
        for i, (first, second) in enumerate(res):
            self.assertEqual(sim.fpr(i+0), first)
            self.assertEqual(sim.fpr(i+4), second)
315
def test_sv_remap_fpmadds_dct_inner_4_stride_2(self):
    """Inner DCT butterfly, 4-long, results spaced two registers apart.

    This method used to be named ..._stride_1 as well, which meant it was
    overwritten by the definition below and the stride=2 case never ran.
    """
    self.sv_remap_fpmadds_dct_inner_4(stride=2)

def test_sv_remap_fpmadds_dct_inner_4_stride_1(self):
    """Inner DCT butterfly, 4-long, results in consecutive registers."""
    self.sv_remap_fpmadds_dct_inner_4(stride=1)
321
def sv_remap_fpmadds_dct_inner_4(self, stride=2):
    """>>> lst = ["svshape 4, 1, <stride>, 2, 0",
                  "svremap 27, 1, 0, 2, 0, 1, 0",
                  "sv.fdmadds *0, *0, *0, *16"
                 ]

    Runs a full in-place 4-long O(N log2 N) inner butterfly schedule
    for DCT and compares the FPRs against transform_inner_radix2_dct.

    SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
    (3 inputs, 2 outputs)

    Note that the coefficient (FRC) is not on a "schedule", it
    is straight Vectorised (0123...) because DCT coefficients
    cannot be shared between butterfly layers (due to +0.5)

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instruction.
    """
    lst = list(SVP64Asm(["svshape 4, 1, %d, 2, 0" % stride,
                         "svremap 27, 1, 0, 2, 0, 1, 0",
                         "sv.fdmadds *0, *0, *0, *16",
                         ]))

    # array and coefficients to test
    n = 4
    avi = [7.0, -0.8, 2.0, -2.3]
    coe = [-0.25, 0.5, 3.1, 6.2]    # 4 coefficients

    # register contents: half-swapped then bit-reversed, the same
    # pre-ordering that transform_inner_radix2_dct applies internally
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2(avi, False)
    av = [av[j] for j in ri]

    # store in regfile
    fprs = [0] * 64
    for i, c in enumerate(coe):
        fprs[i+16] = fp64toselectable(1.0 / c)  # invert
    for i, a in enumerate(av):
        fprs[i*stride] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # work out the results with the twin mul/add-sub
        res = transform_inner_radix2_dct(avi, coe)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-6)
389
def test_sv_remap_fpmadds_idct_inner_4_stride_2(self):
    """Inner inverse-DCT butterfly, 4-long, two registers apart.

    This method used to be named ..._stride_1 as well, which meant it was
    overwritten by the definition below and the stride=2 case never ran.
    """
    self.sv_remap_fpmadds_idct_inner_4(stride=2)

def test_sv_remap_fpmadds_idct_inner_4_stride_1(self):
    """Inner inverse-DCT butterfly, 4-long, consecutive registers."""
    self.sv_remap_fpmadds_idct_inner_4(stride=1)
395
def sv_remap_fpmadds_idct_inner_4(self, stride=2):
    """>>> lst = ["svshape 4, 1, <stride>, 10, 0",
                  "svremap 27, 0, 1, 2, 1, 0, 0",
                  "sv.ffmadds *0, *0, *0, *16"
                 ]

    Runs a full in-place 4-long O(N log2 N) inner butterfly schedule
    for inverse-DCT and compares the FPRs against
    transform_inner_radix2_idct.

    SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
    (3 inputs, 2 outputs)

    Note that the coefficient (FRC) is not on a "schedule", it
    is straight Vectorised (0123...) because DCT coefficients
    cannot be shared between butterfly layers (due to +0.5)

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instruction.
    """
    lst = list(SVP64Asm(["svshape 4, 1, %d, 10, 0" % stride,
                         "svremap 27, 0, 1, 2, 1, 0, 0",
                         "sv.ffmadds *0, *0, *0, *16",
                         ]))

    # array and coefficients to test
    coe = [-0.25, 0.5, 3.1, 6.2]    # 4 coefficients
    avi = [7.0, -0.8, 2.0, -2.3]
    # register contents: half-swapped, the same pre-ordering that
    # transform_inner_radix2_idct applies internally
    av = halfrev2(avi, False)

    # store in regfile
    fprs = [0] * 64
    for i, c in enumerate(coe):
        fprs[i+16] = fp64toselectable(1.0 / c)  # invert
    for i, a in enumerate(av):
        fprs[i*stride] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # work out the results with the twin mul/add-sub
        res = transform_inner_radix2_idct(avi, coe)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-6)
458
def test_sv_remap_fpmadds_idct_outer_8(self, stride=2):
    """>>> lst = ["svshape 8, 1, <stride>, 11, 0",
                  "svremap 27, 0, 1, 2, 1, 0, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    Runs a full in-place 8-long O(N log2 N) outer butterfly schedule
    for inverse-DCT, does the iterative overlapped ADDs, and compares
    the FPRs against transform_outer_radix2_idct.

    SVP64 "REMAP" in Butterfly Mode.

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instruction.
    """
    lst = list(SVP64Asm(["svshape 8, 1, %d, 11, 0" % stride,  # outer butterfly
                         "svremap 27, 0, 1, 2, 1, 0, 0",
                         "sv.fadds *0, *0, *0",
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # register contents: bit-reversed then half-swapped, the same
    # pre-ordering that transform_outer_radix2_idct applies internally
    n = len(avi)
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2([avi[j] for j in ri], True)

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i*stride] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # outer iterative sum
        res = transform_outer_radix2_idct(avi)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test:
            # the reference sums are computed in python floats and only
            # rounded to single at the end
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-6)
517
def test_sv_remap_fpmadds_dct_outer_8(self, stride=2):
    """>>> lst = ["svshape 8, 1, <stride>, 3, 0",
                  "svremap 27, 1, 0, 2, 0, 1, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    Runs a full in-place 8-long O(N log2 N) outer butterfly schedule
    for DCT, does the iterative overlapped ADDs, and compares the FPRs
    against transform_outer_radix2_dct.

    SVP64 "REMAP" in Butterfly Mode.

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instruction.
    """
    lst = list(SVP64Asm(["svshape 8, 1, %d, 3, 0" % stride,
                         "svremap 27, 1, 0, 2, 0, 1, 0",
                         "sv.fadds *0, *0, *0",
                         ]))

    # array to test
    av = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i*stride] = fp64toselectable(a)

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # outer iterative sum (note: updates av in place)
        res = transform_outer_radix2_dct(av)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test:
            # the reference sums are computed in python floats and only
            # rounded to single at the end
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-6)
569
def test_sv_remap_fpmadds_idct_8(self, stride=2):
    """>>> lst = ["svremap 27, 0, 1, 2, 1, 0, 1",
                  "svshape 8, 1, <stride>, 11, 0",
                  "sv.fadds *0, *0, *0",
                  "svshape 8, 1, <stride>, 10, 0",
                  "sv.ffmadds *0, *0, *0, *16"
                 ]

    Runs a full in-place 8-long O(N log2 N) inverse-DCT, both
    inner and outer butterfly "REMAP" schedules, and compares the FPRs
    against inverse_transform2.

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instructions.
    """
    lst = list(SVP64Asm(["svremap 27, 0, 1, 2, 1, 0, 1",
                         "svshape 8, 1, %d, 11, 0" % stride,
                         "sv.fadds *0, *0, *0",
                         "svshape 8, 1, %d, 10, 0" % stride,
                         "sv.ffmadds *0, *0, *0, *16",
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
    n = len(avi)

    # register contents: bit-reversed then half-swapped
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2([avi[j] for j in ri], True)

    # divide first value by 2.0, manually.  rev and halfrev should
    # not have moved it
    av[0] /= 2.0

    print("input data pre idct", av)

    # COS table: layers of size 2, 4, ... n; within a layer the same
    # size/2 coefficients are repeated n/size times
    ctable = []
    size = 2
    while size <= n:
        layer = [math.cos((ci + 0.5) * math.pi / size) * 2.0
                 for ci in range(size // 2)]
        for _ in range(n // size):
            ctable.extend(layer)
        size *= 2

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i*stride] = fp64toselectable(a)
    for i, c in enumerate(ctable):
        fprs[i+16] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # inverse DCT reference
        res = inverse_transform2(avi)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i*stride, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-5)
652
def test_sv_remap_fpmadds_dct_8(self, stride=2):
    """>>> lst = ["svremap 27, 1, 0, 2, 0, 1, 1",
                  "svshape 8, 1, <stride>, 2, 0",
                  "sv.fdmadds *0, *0, *0, *16",
                  "svshape 8, 1, <stride>, 3, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    Runs a full in-place 8-long O(N log2 N) DCT, both
    inner and outer butterfly "REMAP" schedules, and compares the FPRs
    against transform2.

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instructions.
    """
    lst = list(SVP64Asm(["svremap 27, 1, 0, 2, 0, 1, 1",
                         "svshape 8, 1, %d, 2, 0" % stride,
                         "sv.fdmadds *0, *0, *0, *16",
                         "svshape 8, 1, %d, 3, 0" % stride,
                         "sv.fadds *0, *0, *0",
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
    n = len(avi)

    # register contents: half-swapped then bit-reversed
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2(avi, False)
    av = [av[j] for j in ri]

    # COS table: layers of size n, n/2, ... 2; within a layer the same
    # size/2 coefficients are repeated n/size times
    ctable = []
    size = n
    while size >= 2:
        layer = [math.cos((ci + 0.5) * math.pi / size) * 2.0
                 for ci in range(size // 2)]
        for _ in range(n // size):
            ctable.extend(layer)
        size //= 2

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i*stride] = fp64toselectable(a)
    for i, c in enumerate(ctable):
        fprs[i+16] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # full DCT reference
        res = transform2(avi)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-5)
722
def test_sv_remap_dct_cos_precompute_8(self):
    """pre-computes a DCT COS table, deliberately using a lot of
    registers so as to be able to see what is going on (dumping all
    regs after the run).

    FPRs 80 onwards are compared against 1/(2*cos((ci+0.5)*pi/size))
    for the full (12-entry, n=8) table computed in python.

    the simpler (scalar) version is in test_caller_transcendentals.py
    (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
    need the new version of fcfids which doesn't spam memory with
    LD/STs.
    """
    lst = list(SVP64Asm(["svshape 8, 1, 1, 2, 0",
                         "svremap 0, 0, 0, 2, 0, 1, 1",
                         "sv.svstep *4, 4, 1",   # svstep get vector of ci
                         "sv.svstep *16, 3, 1",  # svstep get vector of step
                         "addi 1, 0, 0x0000",
                         "setvl 0, 0, 12, 0, 1, 1",
                         "sv.std *4, 0(1)",
                         "sv.lfd *64, 0(1)",
                         "sv.fcfids *48, *64",
                         "addi 1, 0, 0x0060",
                         "sv.std *16, 0(1)",
                         "sv.lfd *12, 0(1)",
                         "sv.fcfids *24, *12",
                         "sv.fadds *0, *24, 43",     # plus 0.5
                         "sv.fmuls *0, *0, 41",      # times PI
                         "sv.fdivs *0, *0, *48",     # div size
                         "sv.fcoss *80, *0",
                         "sv.fdivs *80, 43, *80",    # div 0.5 / x
                         ]))

    gprs = [0] * 32
    fprs = [0] * 128
    # constants
    fprs[43] = fp64toselectable(0.5)        # 0.5
    fprs[41] = fp64toselectable(math.pi)    # pi
    fprs[44] = fp64toselectable(2.0)        # 2.0

    n = 8

    # reference COS table: layers of size n, n/2, ... 2; within a layer
    # the same size/2 coefficients are repeated n/size times
    ctable = []
    size = n
    while size >= 2:
        layer = [math.cos((ci + 0.5) * math.pi / size) * 2.0
                 for ci in range(size // 2)]
        for _ in range(n // size):
            ctable.extend(layer)
        size //= 2

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
        print("MEM")
        sim.mem.dump()
        # dump the intermediate register groups, one entry per coefficient
        for title, base in (("ci FP", 24), ("size FP", 48), ("temps", 0)):
            print(title)
            for i in range(len(ctable)):
                actual = float(sim.fpr(i+base))
                print("i", i, actual)
        # final results: reciprocal of each table entry, relative error
        for i, coeff in enumerate(ctable):
            expected = 1.0/coeff
            actual = float(sim.fpr(i+80))
            err = abs((actual - expected) / expected)
            print("i", i, actual, "1/expect", 1/expected,
                  "expected", expected,
                  "err", err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-6)
796
def test_sv_remap_dct_cos_precompute_inner_8(self):
    """pre-computes a DCT COS table, using the shorter costable
    indices schedule.  turns out, some COS values are repeated
    in each layer of the DCT butterfly.

    FPRs 80 onwards are compared against 1/(2*cos((ci+0.5)*pi/size))
    for the shortened (7-entry, n=8) table computed in python.

    the simpler (scalar) version is in test_caller_transcendentals.py
    (test_fp_coss_cvt), this is the SVP64 variant.  TODO: really
    need the new version of fcfids which doesn't spam memory with
    LD/STs.
    """
    lst = list(SVP64Asm(["svshape 8, 1, 1, 5, 0",
                         "svremap 0, 0, 0, 2, 0, 1, 1",
                         "sv.svstep *4, 3, 1",   # svstep get vector of ci
                         "sv.svstep *16, 2, 1",  # svstep get vector of step
                         "addi 1, 0, 0x0000",
                         "setvl 0, 0, 7, 0, 1, 1",
                         "sv.std *4, 0(1)",
                         "sv.lfd *64, 0(1)",
                         "sv.fcfids *48, *64",
                         "addi 1, 0, 0x0060",
                         "sv.std *16, 0(1)",
                         "sv.lfd *12, 0(1)",
                         "sv.fcfids *24, *12",
                         "sv.fadds *0, *24, 43",     # plus 0.5
                         "sv.fmuls *0, *0, 41",      # times PI
                         "sv.fdivs *0, *0, *48",     # div size
                         "sv.fcoss *80, *0",
                         "sv.fdivs *80, 43, *80",    # div 0.5 / x
                         ]))

    gprs = [0] * 32
    fprs = [0] * 128
    # constants
    fprs[43] = fp64toselectable(0.5)        # 0.5
    fprs[41] = fp64toselectable(math.pi)    # pi
    fprs[44] = fp64toselectable(2.0)        # 2.0

    n = 8

    # reference (shortened) COS table: one run of size/2 coefficients
    # per layer, for layers of size n, n/2, ... 2
    ctable = []
    size = n
    while size >= 2:
        for ci in range(size // 2):
            coeff = math.cos((ci + 0.5) * math.pi / size) * 2.0
            ctable.append(coeff)
            print("coeff", "ci", ci, "size", size,
                  "i/n", (ci+0.5), 1.0/coeff)
        size //= 2

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, gprs, initial_fprs=fprs)
        print("MEM")
        sim.mem.dump()
        # dump the intermediate register groups, one entry per coefficient
        for title, base in (("ci FP", 24), ("size FP", 48), ("temps", 0)):
            print(title)
            for i in range(len(ctable)):
                actual = float(sim.fpr(i+base))
                print("i", i, actual)
        # final results: reciprocal of each table entry, relative error
        for i, coeff in enumerate(ctable):
            expected = 1.0/coeff
            actual = float(sim.fpr(i+80))
            err = abs((actual - expected) / expected)
            print("i", i, actual, "1/expect", 1/expected,
                  "expected", expected,
                  "err", err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-6)
872
def test_sv_remap_fpmadds_dct_8_mode_4(self, stride=2):
    """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1",
                  "svshape 8, 1, <stride>, 4, 0",
                  "sv.fdmadds *0, *0, *0, *16",
                  "svshape 8, 1, <stride>, 3, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    Runs a full in-place 8-long O(N log2 N) DCT, both
    inner and outer butterfly "REMAP" schedules, and compares the FPRs
    against transform2.
    uses shorter tables: FRC also needs to be on a Schedule

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instructions.
    """
    lst = list(SVP64Asm(["svremap 31, 1, 0, 2, 0, 1, 1",
                         "svshape 8, 1, %d, 4, 0" % stride,
                         "sv.fdmadds *0, *0, *0, *16",
                         "svshape 8, 1, %d, 3, 0" % stride,
                         "sv.fadds *0, *0, *0",
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]
    n = len(avi)

    # register contents: half-swapped then bit-reversed
    levels = n.bit_length() - 1
    ri = [reverse_bits(i, levels) for i in range(n)]
    av = halfrev2(avi, False)
    av = [av[j] for j in ri]

    # shortened COS table: one run of size/2 coefficients per layer,
    # for layers of size n, n/2, ... 2
    ctable = []
    size = n
    while size >= 2:
        ctable.extend(math.cos((ci + 0.5) * math.pi / size) * 2.0
                      for ci in range(size // 2))
        size //= 2

    # store in regfile
    fprs = [0] * 32
    for i, a in enumerate(av):
        fprs[i*stride] = fp64toselectable(a)
    for i, c in enumerate(ctable):
        fprs[i+16] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # full DCT reference
        res = transform2(avi)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)
        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-5)
942
def test_sv_remap_fpmadds_ldbrev_dct_8_mode_4(self, stride=1):
    """>>> lst = ["addi 1, 0, 0x000",
                  # LOAD bit-reversed with half-swap
                  "svshape 8, 1, <stride>, 6, 0",
                  "svremap 1, 0, 0, 0, 0, 0, 0",
                  "sv.lfs/els *0, 4(1)",
                  # Inner butterfly, twin +/- MUL-ADD-SUB
                  "svremap 31, 1, 0, 2, 0, 1, 1",
                  "svshape 8, 1, <stride>, 4, 0",
                  "sv.fdmadds *0, *0, *0, *32",
                  # Outer butterfly, iterative sum
                  "svshape 8, 1, <stride>, 3, 0",
                  "sv.fadds *0, *0, *0"
                 ]

    Runs a full in-place 8-long O(N log2 N) DCT, both
    inner and outer butterfly "REMAP" schedules, and using
    bit-reversed half-swapped LDs.  The FPRs are compared against
    transform2.
    uses shorter pre-loaded COS tables: FRC also needs to be on a
    Schedule

    stride: spacing between the FPRs holding consecutive data elements
            (element i lives in FPR i*stride); also substituted into
            the svshape instructions.
    """
    lst = list(SVP64Asm(["addi 1, 0, 0x000",
                         "svshape 8, 1, %d, 6, 0" % stride,
                         "svremap 1, 0, 0, 0, 0, 0, 0",
                         "sv.lfs/els *0, 4(1)",
                         "svremap 31, 1, 0, 2, 0, 1, 1",
                         "svshape 8, 1, %d, 4, 0" % stride,
                         "sv.fdmadds *0, *0, *0, *32",
                         "svshape 8, 1, %d, 3, 0" % stride,
                         "sv.fadds *0, *0, *0",
                         ]))

    # array to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
    # LD will bring them in, in the correct order.
    words = [SINGLE(fp64toselectable(a)).value for a in avi]
    mem = {}
    for dword, (even, odd) in enumerate(zip(words[0::2], words[1::2])):
        # even element in the low 4 bytes, odd element in the high 4
        mem[dword*8] = even | (odd << 32)

    # calculate the (shortened) COS tables, 4 2 1 not 4 2+2 1+1+1+1
    n = len(avi)
    ctable = []
    size = n
    while size >= 2:
        ctable.extend(math.cos((ci + 0.5) * math.pi / size) * 2.0
                      for ci in range(size // 2))
        size //= 2

    # store in regfile
    fprs = [0] * 64
    for i, c in enumerate(ctable):
        fprs[i+32] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs,
                                   initial_mem=mem)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # full DCT reference
        res = transform2(avi)

        for i, expected in enumerate(res):
            print("i", i*stride, float(sim.fpr(i*stride)),
                  "expected", expected)

        for i, expected in enumerate(res):
            # convert to Power single
            expected = float(fph.DOUBLE2SINGLE(fp64toselectable(expected)))
            actual = float(sim.fpr(i*stride))
            # approximate (relative) error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess reports both values on failure
            self.assertLess(err, 1e-5)
1033
def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self):
    """>>> lst = [# LOAD bit-reversed with half-swap
                  "svshape 8, 1, 1, 14, 0",
                  "svremap 1, 0, 0, 0, 0, 0, 0",
                  "sv.lfs/els *0, 4(1)",
                  # Outer butterfly, iterative sum
                  "svremap 31, 0, 1, 2, 1, 0, 1",
                  "svshape 8, 1, 1, 11, 0",
                  "sv.fadds *0, *0, *0",
                  # Inner butterfly, twin +/- MUL-ADD-SUB
                  "svshape 8, 1, 1, 12, 0",
                  "sv.ffmadds *0, *0, *0, *8"
                 ]

    runs a full in-place 8-long O(N log2 N) Inverse-DCT, both
    inner and outer butterfly "REMAP" schedules, and using
    bit-reversed half-swapped LDs.
    uses shorter pre-loaded COS tables: FRC also needs to be on a
    Schedule in the sv.ffmadds instruction

    The listing above omits the leading ``addi 1, 0, 0x000``.
    """
    lst = SVP64Asm(["addi 1, 0, 0x000",
                    "svshape 8, 1, 1, 14, 0",
                    "svremap 1, 0, 0, 0, 0, 0, 0",
                    "sv.lfs/els *0, 4(1)",
                    "svremap 31, 0, 1, 2, 1, 0, 1",
                    "svshape 8, 1, 1, 11, 0",
                    "sv.fadds *0, *0, *0",
                    "svshape 8, 1, 1, 12, 0",
                    "sv.ffmadds *0, *0, *0, *8"
                    ])
    lst = list(lst)

    # array and coefficients to test
    avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2]

    # first element is divided by 2 before being stored; avi itself is
    # left untouched because it is also the input to the reference below
    scaled = [avi[0] / 2.0] + avi[1:]

    # store in memory, in standard (expected) order, FP32s (2 per 8-bytes)
    # LD will bring them in, in the correct order.
    singles = [SINGLE(fp64toselectable(a)).value for a in scaled]
    mem = {}
    for word, (lo, hi) in enumerate(zip(singles[0::2], singles[1::2])):
        # even element in the low 4 bytes, odd element in the high 4 bytes
        mem[word * 8] = lo | (hi << 32)

    # calculate the (shortened) COS tables, 1 2 4 (smallest size first)
    n = len(avi)
    ctable = []
    size = 2
    while size <= n:
        ctable.extend(math.cos((ci + 0.5) * math.pi / size) * 2.0
                      for ci in range(size // 2))
        size *= 2

    # store the reciprocal of each coefficient in the regfile, from FPR 8
    # upwards (matches the *8 operand of sv.ffmadds)
    fprs = [0] * 32
    for i, c in enumerate(ctable):
        fprs[8 + i] = fp64toselectable(1.0 / c)  # invert

    with Program(lst, bigendian=False) as program:
        sim = self.run_tst_program(program, initial_fprs=fprs,
                                   initial_mem=mem)
        print("spr svshape0", sim.spr['SVSHAPE0'])
        print(" xdimsz", sim.spr['SVSHAPE0'].xdimsz)
        print(" ydimsz", sim.spr['SVSHAPE0'].ydimsz)
        print(" zdimsz", sim.spr['SVSHAPE0'].zdimsz)
        print("spr svshape1", sim.spr['SVSHAPE1'])
        print("spr svshape2", sim.spr['SVSHAPE2'])
        print("spr svshape3", sim.spr['SVSHAPE3'])

        # reference result, computed in python
        res = inverse_transform2(avi)

        for i, expected in enumerate(res):
            print("i", i, float(sim.fpr(i)), "expected", expected)

        for i, expected in enumerate(res):
            # convert to Power single
            expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
            expected = float(expected)
            actual = float(sim.fpr(i))
            # approximate error calculation, good enough test
            # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
            # and the rounding is different
            err = abs((actual - expected) / expected)
            print("err", i, err)
            # assertLess (rather than assertTrue) so that a failure
            # reports the offending values
            self.assertLess(err, 1e-5,
                            "FPR %d: actual %r, expected %r" %
                            (i, actual, expected))
1125
def run_tst_program(self, prog, initial_regs=None,
                    svstate=None,
                    initial_mem=None,
                    initial_fprs=None):
    """Run ``prog`` through ``run_tst`` and return the resulting simulator.

    ``initial_regs`` falls back to 32 zeroed entries when not given.
    ``svstate``, ``initial_mem`` and ``initial_fprs`` are handed to
    ``run_tst`` unchanged.  The GPR and FPR register files are dumped
    (via their ``dump()`` methods) before returning.
    """
    regs = [0] * 32 if initial_regs is None else initial_regs
    sim = run_tst(prog, regs, mem=initial_mem,
                  initial_fprs=initial_fprs,
                  svstate=svstate)

    # heading first, then the matching register file, GPRs before FPRs
    for heading, regfile_name in (("GPRs", "gpr"), ("FPRs", "fpr")):
        print(heading)
        getattr(sim, regfile_name).dump()

    return sim
1142
1143
# Script entry point: when this file is executed directly (rather than
# imported), hand control to unittest's command-line runner.
if __name__ == "__main__":
    unittest.main()