src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 import unittest
   2
   3 from nmutil.formaltest import FHDLTestCase
   4 from openpower.decoder.helpers import SINGLE, fp64toselectable
   5 from openpower.decoder.isa.caller import SVP64State
   6 from openpower.decoder.isa.test_caller import run_tst
   7 from openpower.decoder.isafunctions.double2single import (
   8             ISACallerFnHelper_double2single)
   9 from openpower.decoder.selectable_int import SelectableInt
  10 from openpower.simulator.program import Program
  11 from openpower.insndb.asm import SVP64Asm
  12
  13 # really bad hack.  need to access the DOUBLE2SINGLE function auto-generated
  14 # from pseudo-code.
  15 fph = ISACallerFnHelper_double2single(XLEN=64, FPSCR=None)
  16 fph.namespace = {'FPSCR': fph.FPSCR,
  17                  'NIA': None,
  18                  'XLEN': fph.XLEN,
  19                  'CIA': None,
  20                  'SVSTATE': None,
  21                 }
  22
  23
  24 def transform_radix2(vec, exptable, reverse=False):
  25     """
  26     # FFT and convolution test (Python), based on Project Nayuki
  27     #
  28     # Copyright (c) 2020 Project Nayuki. (MIT License)
  29     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  30
  31     """
  32     # bits of the integer 'val'.
  33     def reverse_bits(val, width):
  34         result = 0
  35         for _ in range(width):
  36             result = (result << 1) | (val & 1)
  37             val >>= 1
  38         return result
  39
  40     # Initialization
  41     n = len(vec)
  42     levels = n.bit_length() - 1
  43
  44     # Copy with bit-reversed permutation
  45     if reverse:
  46         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  47
  48     size = 2
  49     while size <= n:
  50         halfsize = size // 2
  51         tablestep = n // size
  52         for i in range(0, n, size):
  53             k = 0
  54             for j in range(i, i + halfsize):
  55                 # exact same actual computation, just embedded in
  56                 # triple-nested for-loops
  57                 jl, jh = j, j+halfsize
  58                 vjh = vec[jh]
  59                 temp1 = vec[jh] * exptable[k]
  60                 temp2 = vec[jl]
  61                 vec[jh] = temp2 - temp1
  62                 vec[jl] = temp2 + temp1
  63                 print("xform jl jh k", jl, jh, k,
  64                       "vj vjh ek", temp2, vjh, exptable[k],
  65                       "t1, t2", temp1, temp2,
  66                       "v[jh] v[jl]", vec[jh], vec[jl])
  67                 k += tablestep
  68         size *= 2
  69
  70     return vec
  71
  72
  73 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
  74     """
  75     # FFT and convolution test (Python), based on Project Nayuki
  76     #
  77     # Copyright (c) 2020 Project Nayuki. (MIT License)
  78     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  79
  80     """
  81     # bits of the integer 'val'.
  82     def reverse_bits(val, width):
  83         result = 0
  84         for _ in range(width):
  85             result = (result << 1) | (val & 1)
  86             val >>= 1
  87         return result
  88
  89     # Initialization
  90     n = len(vec_r)
  91     levels = n.bit_length() - 1
  92
  93     # Copy with bit-reversed permutation
  94     if reverse:
  95         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  96
  97     size = 2
  98     while size <= n:
  99         halfsize = size // 2
 100         tablestep = n // size
 101         for i in range(0, n, size):
 102             k = 0
 103             for j in range(i, i + halfsize):
 104                 # exact same actual computation, just embedded in
 105                 # triple-nested for-loops
 106                 jl, jh = j, j+halfsize
 107
 108                 print("xform jl jh k", jl, jh, k,
 109                       "vr h l", vec_r[jh], vec_r[jl],
 110                       "vi h l", vec_i[jh], vec_i[jl])
 111                 print("    cr k", cos_r[k], "si k", sin_i[k])
 112                 mul1_r = vec_r[jh] * cos_r[k]
 113                 mul2_r = vec_i[jh] * sin_i[k]
 114                 tpre = mul1_r + mul2_r
 115                 print("        vec_r[jh] * cos_r[k]", mul1_r)
 116                 print("        vec_i[jh] * sin_i[k]", mul2_r)
 117                 print("    tpre", tpre)
 118                 mul1_i = vec_r[jh] * sin_i[k]
 119                 mul2_i = vec_i[jh] * cos_r[k]
 120                 tpim = -mul1_i + mul2_i
 121                 print("        vec_r[jh] * sin_i[k]", mul1_i)
 122                 print("        vec_i[jh] * cos_r[k]", mul2_i)
 123                 print("    tpim", tpim)
 124                 vec_r[jh] = vec_r[jl] - tpre
 125                 vec_i[jh] = vec_i[jl] - tpim
 126                 vec_r[jl] += tpre
 127                 vec_i[jl] += tpim
 128
 129                 print("    xform jl jh k", jl, jh, k,
 130                       "\n       vr h l", vec_r[jh], vec_r[jl],
 131                       "\n       vi h l", vec_i[jh], vec_i[jl])
 132                 k += tablestep
 133         size *= 2
 134
 135     return vec_r, vec_i
 136
 137
 138 class FFTTestCase(FHDLTestCase):
 139
 140     def _check_regs(self, sim, expected):
 141         for i in range(32):
 142             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 143
 144     def test_sv_remap_fpmadds_fft_4(self):
 145         """>>> lst = ["svshape 2, 1, 1, 1, 0",
 146                      "svremap 31, 1, 0, 2, 0, 1, 0",
 147                       "sv.ffmadds. *2, *10, *2"
 148                      ]
 149         this is a cheap (cheating) way to run a single "ffmadds." to
 150         get at least Rc=1 on sv.ffmadds to be activated. the results
 151         are not actually tested because there's no checking yet on
 152         FP Rc=1
 153         """
 154         lst = SVP64Asm(["svshape 2, 1, 1, 1, 0",
 155                         "svremap 31, 1, 0, 2, 0, 1, 0",
 156                         "sv.ffmadds *0, *8, *0"
 157                         ])
 158         lst = list(lst)
 159
 160         # array and coefficients to test
 161         av = [7.0, -9.8]  # array 0..1
 162         coe = [3.1]  # coefficients
 163
 164         # store in regfile
 165         fprs = [0] * 32
 166         for i, c in enumerate(coe):
 167             fprs[i+8] = fp64toselectable(c)
 168         for i, a in enumerate(av):
 169             fprs[i+0] = fp64toselectable(a)
 170
 171         with Program(lst, bigendian=False) as program:
 172             sim = self.run_tst_program(program, initial_fprs=fprs)
 173             print("spr svshape0", sim.spr['SVSHAPE0'])
 174             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 175             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 176             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 177             print("spr svshape1", sim.spr['SVSHAPE1'])
 178             print("spr svshape2", sim.spr['SVSHAPE2'])
 179             print("spr svshape3", sim.spr['SVSHAPE3'])
 180
 181             # work out the results with the twin mul/add-sub
 182             res = transform_radix2(av, coe)
 183
 184             for i, expected in enumerate(res):
 185                 print("i", i, float(sim.fpr(i)), "expected", expected)
 186             for i, expected in enumerate(res):
 187                 # convert to Power single
 188                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 189                 expected = float(expected)
 190                 actual = float(sim.fpr(i))
 191                 # approximate error calculation, good enough test
 192                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 193                 # and the rounding is different
 194                 err = abs(actual - expected) / expected
 195                 self.assertTrue(err < 1e-7)
 196
 197     def test_sv_remap_fpmadds_fft(self):
 198         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 199                      "svremap 31, 1, 0, 2, 0, 1, 0",
 200                       "sv.ffmadds *2, *10, *2"
 201                      ]
 202             runs a full in-place O(N log2 N) butterfly schedule for
 203             Discrete Fourier Transform.
 204
 205             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 206             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 207
 208             there is the *option* to target a different location (non-in-place)
 209             just in case.
 210
 211             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 212             (3 inputs, 2 outputs)
 213         """
 214         lst = SVP64Asm(["svshape 8, 1, 1, 1, 0",
 215                         "svremap 31, 1, 0, 2, 0, 1, 0",
 216                         "sv.ffmadds *0, *8, *0"
 217                         ])
 218         lst = list(lst)
 219
 220         # array and coefficients to test
 221         av = [7.0, -9.8, 3.0, -32.3,
 222               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 223         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 224
 225         # store in regfile
 226         fprs = [0] * 32
 227         for i, c in enumerate(coe):
 228             fprs[i+8] = fp64toselectable(c)
 229         for i, a in enumerate(av):
 230             fprs[i+0] = fp64toselectable(a)
 231
 232         with Program(lst, bigendian=False) as program:
 233             sim = self.run_tst_program(program, initial_fprs=fprs)
 234             print("spr svshape0", sim.spr['SVSHAPE0'])
 235             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 236             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 237             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 238             print("spr svshape1", sim.spr['SVSHAPE1'])
 239             print("spr svshape2", sim.spr['SVSHAPE2'])
 240             print("spr svshape3", sim.spr['SVSHAPE3'])
 241
 242             # work out the results with the twin mul/add-sub
 243             res = transform_radix2(av, coe)
 244
 245             for i, expected in enumerate(res):
 246                 print("i", i, float(sim.fpr(i)), "expected", expected)
 247             for i, expected in enumerate(res):
 248                 # convert to Power single
 249                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 250                 expected = float(expected)
 251                 actual = float(sim.fpr(i))
 252                 # approximate error calculation, good enough test
 253                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 254                 # and the rounding is different
 255                 err = abs(actual - expected) / expected
 256                 self.assertTrue(err < 1e-7)
 257
 258     def test_sv_remap_fpmadds_fft_svstep(self):
 259         """>>> lst = SVP64Asm( [
 260                             "svshape 8, 1, 1, 1, 1",
 261                              "svremap 31, 1, 0, 2, 0, 1, 0",
 262                             "sv.ffmadds *0, *8, *0",
 263                             "svstep. 12, 0, 1, 0",
 264                             "bc 6, 3, -16"
 265                             ])
 266             runs a full in-place O(N log2 N) butterfly schedule for
 267             Discrete Fourier Transform.  this version however uses
 268             SVP64 "Vertical-First" Mode and so needs an explicit
 269             branch, testing CR0.
 270
 271             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 272             (3 inputs, 2 outputs)
 273         """
 274         lst = SVP64Asm([
 275             "svshape 8, 1, 1, 1, 1",
 276             "svremap 31, 1, 0, 2, 0, 1, 0",
 277             "sv.ffmadds *0, *8, *0",
 278             "svstep. 27, 0, 1, 0",
 279             "bc 6, 3, -16"
 280         ])
 281         lst = list(lst)
 282
 283         # array and coefficients to test
 284         av = [7.0, -9.8, 3.0, -32.3,
 285               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 286         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 287
 288         # store in regfile
 289         fprs = [0] * 32
 290         for i, c in enumerate(coe):
 291             fprs[i+8] = fp64toselectable(c)
 292         for i, a in enumerate(av):
 293             fprs[i+0] = fp64toselectable(a)
 294
 295         # set total. err don't know how to calculate how many there are...
 296         # do it manually for now
 297         VL = 0
 298         size = 2
 299         n = len(av)
 300         while size <= n:
 301             halfsize = size // 2
 302             tablestep = n // size
 303             for i in range(0, n, size):
 304                 for j in range(i, i + halfsize):
 305                     VL += 1
 306             size *= 2
 307
 308         # SVSTATE (calculated VL)
 309         svstate = SVP64State()
 310         svstate.vl = VL  # VL
 311         svstate.maxvl = VL  # MAXVL
 312         print("SVSTATE", bin(svstate.asint()))
 313
 314         with Program(lst, bigendian=False) as program:
 315             sim = self.run_tst_program(program, svstate=svstate,
 316                                        initial_fprs=fprs)
 317             print("spr svshape0", sim.spr['SVSHAPE0'])
 318             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 319             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 320             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 321             print("spr svshape1", sim.spr['SVSHAPE1'])
 322             print("spr svshape2", sim.spr['SVSHAPE2'])
 323             print("spr svshape3", sim.spr['SVSHAPE3'])
 324
 325             # work out the results with the twin mul/add-sub
 326             res = transform_radix2(av, coe)
 327
 328             for i, expected in enumerate(res):
 329                 print("i", i, float(sim.fpr(i)), "expected", expected)
 330             for i, expected in enumerate(res):
 331                 # convert to Power single
 332                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 333                 expected = float(expected)
 334                 actual = float(sim.fpr(i))
 335                 # approximate error calculation, good enough test
 336                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 337                 # and the rounding is different
 338                 err = abs(actual - expected) / expected
 339                 self.assertTrue(err < 1e-7)
 340
 341     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 342         """>>> lst = SVP64Asm( [
 343                         "svshape 8, 1, 1, 1, 1",
 344                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 345                          "svremap 5, 1, 0, 2, 0, 0, 1",
 346                          "sv.fmuls 24, *0, *8",
 347                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 348                          "svremap 26, 0, 0, 0, 0, 1, 1",
 349                         "sv.ffadds *0, 24, *0",
 350                         "svstep. 27, 0, 1, 0",
 351                         "bc 6, 3, -28"
 352                             ])
 353
 354             runs a full in-place O(N log2 N) butterfly schedule for
 355             Discrete Fourier Transform.  also uses "Vertical First"
 356             but also uses temporary scalars and ffadds rather than
 357             sv.ffmadds.
 358
 359             this represents an incremental step towards complex FFT
 360
 361             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 362
 363             * single fmuls FRT, FRA, FRC
 364             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 365               (FRS is implicit / hidden in ff* operations)
 366
 367             multiply:                         # sv.fmuls FRT, FRA, FRC
 368                 temp1 = vec[jh] * exptable[k]
 369                 temp2 = vec[jl]
 370             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 371                 vec[jh] = temp2 - temp1
 372                 vec[jl] = temp2 + temp1
 373
 374             also see notes in complex fft test: here svremap is done in
 375             "non-persistent" mode (as a demo) whereas in the complex fft
 376             svremap is used in "persistent" mode, where by a complete
 377             coincidence the REMAP arguments all happen to line up and
 378             only one persistent svremap is needed.  the exact same trick
 379             *could* be applied here but for illustrative purposes it is not.
 380         """
 381         lst = SVP64Asm([
 382             "svshape 8, 1, 1, 1, 1",
 383             # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 384             "svremap 5, 1, 0, 2, 0, 0, 0",
 385             "sv.fmuls 24, *0, *8",
 386             # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 387             "svremap 26, 0, 0, 0, 0, 1, 0",
 388             "sv.ffadds *0, 24, *0",
 389             "svstep. 27, 0, 1, 0",
 390             "bc 6, 3, -28"
 391         ])
 392         lst = list(lst)
 393
 394         # array and coefficients to test
 395         av = [7.0, -9.8, 3.0, -32.3,
 396               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 397         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 398
 399         # store in regfile
 400         fprs = [0] * 32
 401         for i, c in enumerate(coe):
 402             fprs[i+8] = fp64toselectable(c)
 403         for i, a in enumerate(av):
 404             fprs[i+0] = fp64toselectable(a)
 405
 406         # set total. err don't know how to calculate how many there are...
 407         # do it manually for now
 408         VL = 0
 409         size = 2
 410         n = len(av)
 411         while size <= n:
 412             halfsize = size // 2
 413             tablestep = n // size
 414             for i in range(0, n, size):
 415                 for j in range(i, i + halfsize):
 416                     VL += 1
 417             size *= 2
 418
 419         # SVSTATE (calculated VL)
 420         svstate = SVP64State()
 421         svstate.vl = VL  # VL
 422         svstate.maxvl = VL  # MAXVL
 423         print("SVSTATE", bin(svstate.asint()))
 424
 425         with Program(lst, bigendian=False) as program:
 426             sim = self.run_tst_program(program, svstate=svstate,
 427                                        initial_fprs=fprs)
 428             print("spr svshape0", sim.spr['SVSHAPE0'])
 429             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 430             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 431             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 432             print("spr svshape1", sim.spr['SVSHAPE1'])
 433             print("spr svshape2", sim.spr['SVSHAPE2'])
 434             print("spr svshape3", sim.spr['SVSHAPE3'])
 435
 436             # work out the results with the twin mul/add-sub
 437             res = transform_radix2(av, coe)
 438
 439             for i, expected in enumerate(res):
 440                 print("i", i, float(sim.fpr(i)), "expected", expected)
 441             for i, expected in enumerate(res):
 442                 # convert to Power single
 443                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 444                 expected = float(expected)
 445                 actual = float(sim.fpr(i))
 446                 # approximate error calculation, good enough test
 447                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 448                 # and the rounding is different
 449                 err = abs(actual - expected) / expected
 450                 self.assertTrue(err < 1e-7)
 451
 452     def test_sv_fpmadds_fft(self):
 453         """>>> lst = ["sv.ffmadds *2, *10, *2"
 454                         ]
 455             four in-place vector mul-adds, four in-place vector mul-subs
 456
 457             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 458             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 459
 460             there is the *option* to target a different location (non-in-place)
 461             just in case.
 462
 463             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 464             FRS to perform the two multiplies.  one add, one subtract.
 465
 466             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 467                 fmadds  FRT   , FRA, FRC, FRA
 468                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 469
 470         """
 471         lst = SVP64Asm(["sv.ffmadds *2, *10, *2"
 472                         ])
 473         lst = list(lst)
 474
 475         fprs = [0] * 32
 476         av = [7.0, -9.8, 2.0, -32.3]  # first half of array 0..3
 477         bv = [-2.0, 2.0, -9.8, 32.3]  # second half of array 4..7
 478         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 479         res = []
 480         # work out the results with the twin mul/add-sub
 481         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 482             fprs[i+2] = fp64toselectable(a)
 483             fprs[i+6] = fp64toselectable(b)
 484             fprs[i+10] = fp64toselectable(c)
 485             mul = a * c
 486             t = b + mul
 487             u = b - mul
 488             # convert to Power single
 489             t = fph.DOUBLE2SINGLE(fp64toselectable(t))
 490             u = fph.DOUBLE2SINGLE(fp64toselectable(u))  # from double
 491             res.append((t, u))
 492             print("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 493
 494         # SVSTATE (in this case, VL=2)
 495         svstate = SVP64State()
 496         svstate.vl = 4  # VL
 497         svstate.maxvl = 4  # MAXVL
 498         print("SVSTATE", bin(svstate.asint()))
 499
 500         with Program(lst, bigendian=False) as program:
 501             sim = self.run_tst_program(program, svstate=svstate,
 502                                        initial_fprs=fprs)
 503             # confirm that the results are as expected
 504             for i, (t, u) in enumerate(res):
 505                 self.assertEqual(sim.fpr(i+2), t)
 506                 self.assertEqual(sim.fpr(i+6), u)
 507
 508     def test_sv_ffadds_fft(self):
 509         """>>> lst = ["sv.ffadds *2, *2, *2"
 510                         ]
 511             four in-place vector adds, four in-place vector subs
 512
 513             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 514             FRS to perform the two multiplies.  one add, one subtract.
 515
 516             sv.ffadds FRT, FRA, FRB  actually does:
 517                 fadds FRT   , FRB, FRA
 518                 fsubs FRT+vl, FRA, FRB+vl
 519         """
 520         lst = SVP64Asm(["sv.ffadds *2, *2, *2"
 521                         ])
 522         lst = list(lst)
 523
 524         fprs = [0] * 32
 525         av = [7.0, -9.8, 2.0, -32.3]  # first half of array 0..3
 526         bv = [-2.0, 2.0, -9.8, 32.3]  # second half of array 4..7
 527         res = []
 528         # work out the results with the twin add-sub
 529         for i, (a, b) in enumerate(zip(av, bv)):
 530             fprs[i+2] = fp64toselectable(a)
 531             fprs[i+6] = fp64toselectable(b)
 532             t = b + a
 533             u = b - a
 534             # convert to Power single
 535             t = fph.DOUBLE2SINGLE(fp64toselectable(t))
 536             u = fph.DOUBLE2SINGLE(fp64toselectable(u))  # from double
 537             res.append((t, u))
 538             print("FFT", i, "in", a, b, "res", t, u)
 539
 540         # SVSTATE (in this case, VL=2)
 541         svstate = SVP64State()
 542         svstate.vl = 4  # VL
 543         svstate.maxvl = 4  # MAXVL
 544         print("SVSTATE", bin(svstate.asint()))
 545
 546         with Program(lst, bigendian=False) as program:
 547             sim = self.run_tst_program(program, svstate=svstate,
 548                                        initial_fprs=fprs)
 549             # confirm that the results are as expected
 550             for i, (t, u) in enumerate(res):
 551                 a = float(sim.fpr(i+2))
 552                 b = float(sim.fpr(i+6))
 553                 t = float(t)
 554                 u = float(u)
 555                 print("FFT", i, "in", a, b, "res", t, u)
 556             for i, (t, u) in enumerate(res):
 557                 self.assertEqual(sim.fpr(i+2), t)
 558                 self.assertEqual(sim.fpr(i+6), u)
 559
 560     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 561         """
 562             runs a full in-place O(N log2 N) butterfly schedule for
 563             Discrete Fourier Transform.  this version however uses
 564             SVP64 "Vertical-First" Mode and so needs an explicit
 565             branch, testing CR0.
 566
 567             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 568             (3 inputs, 2 outputs)
 569
 570             complex calculation (FFT):
 571
 572                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 573                 vec_r[jh] = vec_r[jl] - tpre
 574                 vec_r[jl] += tpre
 575
 576                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 577                 vec_i[jh] = vec_i[jl] - tpim
 578                 vec_i[jl] += tpim
 579
 580             real-only calculation (DFT):
 581
 582                 temp1 = vec[jh] * exptable[k]
 583                 temp2 = vec[jl]
 584                 vec[jh] = temp2 - temp1
 585                 vec[jl] = temp2 + temp1
 586
 587             note: a rather nice convenience / coincidence. the meaning of
 588             these two instructions is:
 589                 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 590                 "svremap 5, 1, 0, 2, 0, 0, 1",
 591                 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 592                 "svremap 26, 0, 0, 0, 0, 1, 1",
 593
 594             however it turns out that they can be *merged*, and for
 595             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
 596             *ignore* their REMAPs (by definition, because you can't REMAP
 597             scalar operands), and for the second one (sv.ffads) exactly the
 598             right REMAPs are also ignored!
 599
 600             therefore we can merge:
 601                 "svremap 5, 1, 0, 2, 0, 0, 1",
 602                 "svremap 26, 0, 0, 0, 0, 1, 1",
 603             into:
 604                 "svremap 31, 1, 0, 2, 0, 1, 1",
 605             and save one instruction.
 606         """
 607         lst = SVP64Asm([
 608             # set triple butterfly mode with persistent "REMAP"
 609             "svshape 8, 1, 1, 1, 1",
 610             "svremap 31, 1, 0, 2, 0, 1, 1",
 611             # tpre
 612             "sv.fmuls 24, *0, *16",    # mul1_r = r*cos_r
 613             "sv.fmadds 24, *8, *20, 24",  # mul2_r = i*sin_i
 614             # tpre = mul1_r + mul2_r
 615             # tpim
 616             "sv.fmuls 26, *0, *20",    # mul1_i = r*sin_i
 617             "sv.fmsubs 26, *8, *16, 26",  # mul2_i = i*cos_r
 618             # tpim = mul2_i - mul1_i
 619             # vec_r jh/jl
 620             "sv.ffadds *0, 24, *0",    # vh/vl +/- tpre
 621             # vec_i jh/jl
 622             "sv.ffadds *8, 26, *8",    # vh/vl +- tpim
 623
 624             # svstep loop
 625             "svstep. 27, 0, 1, 0",
 626             "bc 6, 3, -56"
 627         ])
 628         lst = list(lst)
 629
 630         # array and coefficients to test
 631         ar = [7.0, -9.8, 3.0, -32.3,
 632               -2.0, 5.0, -9.8, 31.3]  # array 0..7 real
 633         ai = [1.0, -1.8, 3.0, 19.3,
 634               4.0, -2.0, -0.8, 1.3]  # array 0..7 imaginary
 635         coer = [-0.25, 0.5, 3.1, 6.2]  # coefficients real
 636         coei = [0.21, -0.1, 1.1, -4.0]  # coefficients imaginary
 637
 638         # store in regfile
 639         fprs = [0] * 64
 640         for i, a in enumerate(ar):
 641             fprs[i+0] = fp64toselectable(a)
 642         for i, a in enumerate(ai):
 643             fprs[i+8] = fp64toselectable(a)
 644         for i, cr in enumerate(coer):
 645             fprs[i+16] = fp64toselectable(cr)
 646         for i, ci in enumerate(coei):
 647             fprs[i+20] = fp64toselectable(ci)
 648
 649         # set total. err don't know how to calculate how many there are...
 650         # do it manually for now
 651         VL = 0
 652         size = 2
 653         n = len(ar)
 654         while size <= n:
 655             halfsize = size // 2
 656             tablestep = n // size
 657             for i in range(0, n, size):
 658                 for j in range(i, i + halfsize):
 659                     VL += 1
 660             size *= 2
 661
 662         # SVSTATE (calculated VL)
 663         svstate = SVP64State()
 664         svstate.vl = VL  # VL
 665         svstate.maxvl = VL  # MAXVL
 666         print("SVSTATE", bin(svstate.asint()))
 667
 668         with Program(lst, bigendian=False) as program:
 669             sim = self.run_tst_program(program, svstate=svstate,
 670                                        initial_fprs=fprs)
 671             print("spr svshape0", sim.spr['SVSHAPE0'])
 672             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 673             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 674             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 675             print("spr svshape1", sim.spr['SVSHAPE1'])
 676             print("spr svshape2", sim.spr['SVSHAPE2'])
 677             print("spr svshape3", sim.spr['SVSHAPE3'])
 678
 679             # work out the results with the twin mul/add-sub, explicit
 680             # complex numbers
 681             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 682
 683             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 684                 print("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 685                       "expected_r", expected_r,
 686                       "expected_i", expected_i)
 687             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 688                 # convert to Power single
 689                 expected_r = fph.DOUBLE2SINGLE(fp64toselectable(expected_r))
 690                 expected_r = float(expected_r)
 691                 actual_r = float(sim.fpr(i))
 692                 # approximate error calculation, good enough test
 693                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 694                 # and the rounding is different
 695                 err = abs(actual_r - expected_r) / expected_r
 696                 self.assertTrue(err < 1e-6)
 697                 # convert to Power single
 698                 expected_i = fph.DOUBLE2SINGLE(fp64toselectable(expected_i))
 699                 expected_i = float(expected_i)
 700                 actual_i = float(sim.fpr(i+8))
 701                 # approximate error calculation, good enough test
 702                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 703                 # and the rounding is different
 704                 err = abs(actual_i - expected_i) / expected_i
 705                 self.assertTrue(err < 1e-6)
 706
 707     def test_sv_ffadds_fft_scalar(self):
 708         """>>> lst = ["sv.ffadds *2, 12, 13"
 709                         ]
 710             four in-place vector adds and subs, but done with a scalar
 711             pair (fp12, fp13)
 712         """
 713         lst = SVP64Asm(["sv.ffadds *2, 12, 13"
 714                         ])
 715         lst = list(lst)
 716
 717         fprs = [0] * 32
 718         scalar_a = 1.3
 719         scalar_b = -2.0
 720         fprs[12] = fp64toselectable(scalar_a)
 721         fprs[13] = fp64toselectable(scalar_b)
 722         res = []
 723         # work out the results with the twin add-sub
 724         for i in range(4):
 725             t = scalar_b + scalar_a
 726             u = scalar_b - scalar_a
 727             # convert to Power single
 728             t = fph.DOUBLE2SINGLE(fp64toselectable(t))
 729             u = fph.DOUBLE2SINGLE(fp64toselectable(u))  # from double
 730             res.append((t, u))
 731             print("FFT", i, "res", t, u)
 732
 733         # SVSTATE (in this case, VL=2)
 734         svstate = SVP64State()
 735         svstate.vl = 4  # VL
 736         svstate.maxvl = 4  # MAXVL
 737         print("SVSTATE", bin(svstate.asint()))
 738
 739         with Program(lst, bigendian=False) as program:
 740             sim = self.run_tst_program(program, svstate=svstate,
 741                                        initial_fprs=fprs)
 742             # confirm that the results are as expected
 743             for i, (t, u) in enumerate(res):
 744                 a = float(sim.fpr(i+2))
 745                 b = float(sim.fpr(i+6))
 746                 t = float(t)
 747                 u = float(u)
 748                 print("FFT", i, "in", a, b, "res", t, u)
 749             for i, (t, u) in enumerate(res):
 750                 self.assertEqual(sim.fpr(i+2), t)
 751                 self.assertEqual(sim.fpr(i+6), u)
 752
 753     def test_sv_remap_fpmadds_fft_ldst(self):
 754         """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
 755                          "sv.lfs/els *0, 4(0)",
 756                          "svshape 8, 1, 1, 1, 0",
 757                          "svremap 31, 1, 0, 2, 0, 1, 0",
 758                          "sv.ffmadds *0, *8, *0"
 759
 760             runs a full in-place O(N log2 N) butterfly schedule for
 761             Discrete Fourier Transform, using bit-reversed LD/ST
 762         """
 763         lst = SVP64Asm(["svshape 8, 1, 1, 15, 0",
 764                         "svremap 1, 0, 0, 0, 0, 0, 0",
 765                         "sv.lfs/els *0, 4(0)",
 766                         "svshape 8, 1, 1, 1, 0",
 767                         "svremap 31, 1, 0, 2, 0, 1, 0",
 768                         "sv.ffmadds *0, *8, *0"
 769                         ])
 770         lst = list(lst)
 771
 772         # array and coefficients to test
 773         av = [7.0, -9.8, 3.0, -32.3,
 774               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 775         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 776
 777         # store in regfile
 778         fprs = [0] * 32
 779         for i, c in enumerate(coe):
 780             fprs[i+8] = fp64toselectable(c)
 781         # store in memory
 782         mem = {}
 783         val = 0
 784         for i, a in enumerate(av):
 785             a = SINGLE(fp64toselectable(a)).value
 786             shift = (i % 2) == 1
 787             if shift == 0:
 788                 val = a
 789             else:
 790                 mem[(i//2)*8] = val | (a << 32)
 791
 792         with Program(lst, bigendian=False) as program:
 793             sim = self.run_tst_program(program, initial_mem=mem,
 794                                        initial_fprs=fprs)
 795             print("spr svshape0", sim.spr['SVSHAPE0'])
 796             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 797             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 798             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 799             print("spr svshape1", sim.spr['SVSHAPE1'])
 800             print("spr svshape2", sim.spr['SVSHAPE2'])
 801             print("spr svshape3", sim.spr['SVSHAPE3'])
 802
 803             print("mem dump")
 804             print(sim.mem.dump())
 805
 806             # work out the results with the twin mul/add-sub,
 807             # note bit-reverse mode requested
 808             res = transform_radix2(av, coe, reverse=True)
 809
 810             for i, expected in enumerate(res):
 811                 print("i", i, float(sim.fpr(i)), "expected", expected)
 812             for i, expected in enumerate(res):
 813                 # convert to Power single
 814                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 815                 expected = float(expected)
 816                 actual = float(sim.fpr(i))
 817                 # approximate error calculation, good enough test
 818                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 819                 # and the rounding is different
 820                 err = abs(actual - expected) / expected
 821                 self.assertTrue(err < 1e-6)
 822
 823     def run_tst_program(self, prog, initial_regs=None,
 824                         svstate=None,
 825                         initial_mem=None,
 826                         initial_fprs=None):
 827         if initial_regs is None:
 828             initial_regs = [0] * 32
 829         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 830                             initial_fprs=initial_fprs,
 831                             svstate=svstate)
 832
 833         print("GPRs")
 834         simulator.gpr.dump()
 835         print("FPRs")
 836         simulator.fpr.dump()
 837
 838         return simulator
 839
 840
 841 if __name__ == "__main__":
 842     unittest.main()