src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 import unittest
   2
   3 from nmutil.formaltest import FHDLTestCase
   4 from openpower.decoder.helpers import SINGLE, fp64toselectable
   5 from openpower.decoder.isa.caller import SVP64State
   6 from openpower.decoder.isa.test_caller import run_tst
   7 from openpower.decoder.isafunctions.double2single import ISACallerFnHelper
   8 from openpower.decoder.selectable_int import SelectableInt
   9 from openpower.simulator.program import Program
  10 from openpower.sv.trans.svp64 import SVP64Asm
  11
# really bad hack.  need to access the DOUBLE2SINGLE function auto-generated
# from pseudo-code.  one helper instance (XLEN=64) is created at module
# level and used by the tests below as fph.DOUBLE2SINGLE(...).
fph = ISACallerFnHelper(XLEN=64)
  15
  16
  17 def transform_radix2(vec, exptable, reverse=False):
  18     """
  19     # FFT and convolution test (Python), based on Project Nayuki
  20     #
  21     # Copyright (c) 2020 Project Nayuki. (MIT License)
  22     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  23
  24     """
  25     # bits of the integer 'val'.
  26     def reverse_bits(val, width):
  27         result = 0
  28         for _ in range(width):
  29             result = (result << 1) | (val & 1)
  30             val >>= 1
  31         return result
  32
  33     # Initialization
  34     n = len(vec)
  35     levels = n.bit_length() - 1
  36
  37     # Copy with bit-reversed permutation
  38     if reverse:
  39         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  40
  41     size = 2
  42     while size <= n:
  43         halfsize = size // 2
  44         tablestep = n // size
  45         for i in range(0, n, size):
  46             k = 0
  47             for j in range(i, i + halfsize):
  48                 # exact same actual computation, just embedded in
  49                 # triple-nested for-loops
  50                 jl, jh = j, j+halfsize
  51                 vjh = vec[jh]
  52                 temp1 = vec[jh] * exptable[k]
  53                 temp2 = vec[jl]
  54                 vec[jh] = temp2 - temp1
  55                 vec[jl] = temp2 + temp1
  56                 print("xform jl jh k", jl, jh, k,
  57                       "vj vjh ek", temp2, vjh, exptable[k],
  58                       "t1, t2", temp1, temp2,
  59                       "v[jh] v[jl]", vec[jh], vec[jl])
  60                 k += tablestep
  61         size *= 2
  62
  63     return vec
  64
  65
  66 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
  67     """
  68     # FFT and convolution test (Python), based on Project Nayuki
  69     #
  70     # Copyright (c) 2020 Project Nayuki. (MIT License)
  71     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  72
  73     """
  74     # bits of the integer 'val'.
  75     def reverse_bits(val, width):
  76         result = 0
  77         for _ in range(width):
  78             result = (result << 1) | (val & 1)
  79             val >>= 1
  80         return result
  81
  82     # Initialization
  83     n = len(vec_r)
  84     levels = n.bit_length() - 1
  85
  86     # Copy with bit-reversed permutation
  87     if reverse:
  88         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  89
  90     size = 2
  91     while size <= n:
  92         halfsize = size // 2
  93         tablestep = n // size
  94         for i in range(0, n, size):
  95             k = 0
  96             for j in range(i, i + halfsize):
  97                 # exact same actual computation, just embedded in
  98                 # triple-nested for-loops
  99                 jl, jh = j, j+halfsize
 100
 101                 print("xform jl jh k", jl, jh, k,
 102                       "vr h l", vec_r[jh], vec_r[jl],
 103                       "vi h l", vec_i[jh], vec_i[jl])
 104                 print("    cr k", cos_r[k], "si k", sin_i[k])
 105                 mul1_r = vec_r[jh] * cos_r[k]
 106                 mul2_r = vec_i[jh] * sin_i[k]
 107                 tpre = mul1_r + mul2_r
 108                 print("        vec_r[jh] * cos_r[k]", mul1_r)
 109                 print("        vec_i[jh] * sin_i[k]", mul2_r)
 110                 print("    tpre", tpre)
 111                 mul1_i = vec_r[jh] * sin_i[k]
 112                 mul2_i = vec_i[jh] * cos_r[k]
 113                 tpim = -mul1_i + mul2_i
 114                 print("        vec_r[jh] * sin_i[k]", mul1_i)
 115                 print("        vec_i[jh] * cos_r[k]", mul2_i)
 116                 print("    tpim", tpim)
 117                 vec_r[jh] = vec_r[jl] - tpre
 118                 vec_i[jh] = vec_i[jl] - tpim
 119                 vec_r[jl] += tpre
 120                 vec_i[jl] += tpim
 121
 122                 print("    xform jl jh k", jl, jh, k,
 123                       "\n       vr h l", vec_r[jh], vec_r[jl],
 124                       "\n       vi h l", vec_i[jh], vec_i[jl])
 125                 k += tablestep
 126         size *= 2
 127
 128     return vec_r, vec_i
 129
 130
 131 class FFTTestCase(FHDLTestCase):
 132
 133     def _check_regs(self, sim, expected):
 134         for i in range(32):
 135             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 136
 137     def test_sv_remap_fpmadds_fft_4(self):
 138         """>>> lst = ["svshape 2, 1, 1, 1, 0",
 139                      "svremap 31, 1, 0, 2, 0, 1, 0",
 140                       "sv.ffmadds. *2, *2, *2, *10"
 141                      ]
 142         this is a cheap (cheating) way to run a single "ffmadds." to
 143         get at least Rc=1 on sv.ffmadds to be activated. the results
 144         are not actually tested because there's no checking yet on
 145         FP Rc=1
 146         """
 147         lst = SVP64Asm(["svshape 2, 1, 1, 1, 0",
 148                         "svremap 31, 1, 0, 2, 0, 1, 0",
 149                         "sv.ffmadds *0, *0, *0, *8"
 150                         ])
 151         lst = list(lst)
 152
 153         # array and coefficients to test
 154         av = [7.0, -9.8]  # array 0..1
 155         coe = [3.1]  # coefficients
 156
 157         # store in regfile
 158         fprs = [0] * 32
 159         for i, c in enumerate(coe):
 160             fprs[i+8] = fp64toselectable(c)
 161         for i, a in enumerate(av):
 162             fprs[i+0] = fp64toselectable(a)
 163
 164         with Program(lst, bigendian=False) as program:
 165             sim = self.run_tst_program(program, initial_fprs=fprs)
 166             print("spr svshape0", sim.spr['SVSHAPE0'])
 167             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 168             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 169             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 170             print("spr svshape1", sim.spr['SVSHAPE1'])
 171             print("spr svshape2", sim.spr['SVSHAPE2'])
 172             print("spr svshape3", sim.spr['SVSHAPE3'])
 173
 174             # work out the results with the twin mul/add-sub
 175             res = transform_radix2(av, coe)
 176
 177             for i, expected in enumerate(res):
 178                 print("i", i, float(sim.fpr(i)), "expected", expected)
 179             for i, expected in enumerate(res):
 180                 # convert to Power single
 181                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 182                 expected = float(expected)
 183                 actual = float(sim.fpr(i))
 184                 # approximate error calculation, good enough test
 185                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 186                 # and the rounding is different
 187                 err = abs(actual - expected) / expected
 188                 self.assertTrue(err < 1e-7)
 189
 190     def test_sv_remap_fpmadds_fft(self):
 191         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 192                      "svremap 31, 1, 0, 2, 0, 1, 0",
 193                       "sv.ffmadds *2, *2, *2, *10"
 194                      ]
 195             runs a full in-place O(N log2 N) butterfly schedule for
 196             Discrete Fourier Transform.
 197
 198             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 199             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 200
 201             there is the *option* to target a different location (non-in-place)
 202             just in case.
 203
 204             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 205             (3 inputs, 2 outputs)
 206         """
 207         lst = SVP64Asm(["svshape 8, 1, 1, 1, 0",
 208                         "svremap 31, 1, 0, 2, 0, 1, 0",
 209                         "sv.ffmadds *0, *0, *0, *8"
 210                         ])
 211         lst = list(lst)
 212
 213         # array and coefficients to test
 214         av = [7.0, -9.8, 3.0, -32.3,
 215               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 216         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 217
 218         # store in regfile
 219         fprs = [0] * 32
 220         for i, c in enumerate(coe):
 221             fprs[i+8] = fp64toselectable(c)
 222         for i, a in enumerate(av):
 223             fprs[i+0] = fp64toselectable(a)
 224
 225         with Program(lst, bigendian=False) as program:
 226             sim = self.run_tst_program(program, initial_fprs=fprs)
 227             print("spr svshape0", sim.spr['SVSHAPE0'])
 228             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 229             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 230             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 231             print("spr svshape1", sim.spr['SVSHAPE1'])
 232             print("spr svshape2", sim.spr['SVSHAPE2'])
 233             print("spr svshape3", sim.spr['SVSHAPE3'])
 234
 235             # work out the results with the twin mul/add-sub
 236             res = transform_radix2(av, coe)
 237
 238             for i, expected in enumerate(res):
 239                 print("i", i, float(sim.fpr(i)), "expected", expected)
 240             for i, expected in enumerate(res):
 241                 # convert to Power single
 242                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 243                 expected = float(expected)
 244                 actual = float(sim.fpr(i))
 245                 # approximate error calculation, good enough test
 246                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 247                 # and the rounding is different
 248                 err = abs(actual - expected) / expected
 249                 self.assertTrue(err < 1e-7)
 250
 251     def test_sv_remap_fpmadds_fft_svstep(self):
 252         """>>> lst = SVP64Asm( [
 253                             "svshape 8, 1, 1, 1, 1",
 254                              "svremap 31, 1, 0, 2, 0, 1, 0",
 255                             "sv.ffmadds *0, *0, *0, *8",
 256                             "svstep. 12, 1, 0",
 257                             "bc 6, 3, -16"
 258                             ])
 259             runs a full in-place O(N log2 N) butterfly schedule for
 260             Discrete Fourier Transform.  this version however uses
 261             SVP64 "Vertical-First" Mode and so needs an explicit
 262             branch, testing CR0.
 263
 264             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 265             (3 inputs, 2 outputs)
 266         """
 267         lst = SVP64Asm([
 268             "svshape 8, 1, 1, 1, 1",
 269             "svremap 31, 1, 0, 2, 0, 1, 0",
 270             "sv.ffmadds *0, *0, *0, *8",
 271             "svstep. 27, 1, 0",
 272             "bc 6, 3, -16"
 273         ])
 274         lst = list(lst)
 275
 276         # array and coefficients to test
 277         av = [7.0, -9.8, 3.0, -32.3,
 278               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 279         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 280
 281         # store in regfile
 282         fprs = [0] * 32
 283         for i, c in enumerate(coe):
 284             fprs[i+8] = fp64toselectable(c)
 285         for i, a in enumerate(av):
 286             fprs[i+0] = fp64toselectable(a)
 287
 288         # set total. err don't know how to calculate how many there are...
 289         # do it manually for now
 290         VL = 0
 291         size = 2
 292         n = len(av)
 293         while size <= n:
 294             halfsize = size // 2
 295             tablestep = n // size
 296             for i in range(0, n, size):
 297                 for j in range(i, i + halfsize):
 298                     VL += 1
 299             size *= 2
 300
 301         # SVSTATE (calculated VL)
 302         svstate = SVP64State()
 303         svstate.vl = VL  # VL
 304         svstate.maxvl = VL  # MAXVL
 305         print("SVSTATE", bin(svstate.asint()))
 306
 307         with Program(lst, bigendian=False) as program:
 308             sim = self.run_tst_program(program, svstate=svstate,
 309                                        initial_fprs=fprs)
 310             print("spr svshape0", sim.spr['SVSHAPE0'])
 311             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 312             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 313             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 314             print("spr svshape1", sim.spr['SVSHAPE1'])
 315             print("spr svshape2", sim.spr['SVSHAPE2'])
 316             print("spr svshape3", sim.spr['SVSHAPE3'])
 317
 318             # work out the results with the twin mul/add-sub
 319             res = transform_radix2(av, coe)
 320
 321             for i, expected in enumerate(res):
 322                 print("i", i, float(sim.fpr(i)), "expected", expected)
 323             for i, expected in enumerate(res):
 324                 # convert to Power single
 325                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 326                 expected = float(expected)
 327                 actual = float(sim.fpr(i))
 328                 # approximate error calculation, good enough test
 329                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 330                 # and the rounding is different
 331                 err = abs(actual - expected) / expected
 332                 self.assertTrue(err < 1e-7)
 333
 334     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 335         """>>> lst = SVP64Asm( [
 336                         "svshape 8, 1, 1, 1, 1",
 337                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 338                          "svremap 5, 1, 0, 2, 0, 0, 1",
 339                          "sv.fmuls 24, *0, *8",
 340                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 341                          "svremap 26, 0, 0, 0, 0, 1, 1",
 342                         "sv.ffadds *0, 24, *0",
 343                         "svstep. 27, 1, 0",
 344                         "bc 6, 3, -28"
 345                             ])
 346
 347             runs a full in-place O(N log2 N) butterfly schedule for
 348             Discrete Fourier Transform.  also uses "Vertical First"
 349             but also uses temporary scalars and ffadds rather than
 350             sv.ffmadds.
 351
 352             this represents an incremental step towards complex FFT
 353
 354             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 355
 356             * single fmuls FRT, FRA, FRC
 357             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 358               (FRS is implicit / hidden in ff* operations)
 359
 360             multiply:                         # sv.fmuls FRT, FRA, FRC
 361                 temp1 = vec[jh] * exptable[k]
 362                 temp2 = vec[jl]
 363             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 364                 vec[jh] = temp2 - temp1
 365                 vec[jl] = temp2 + temp1
 366
 367             also see notes in complex fft test: here svremap is done in
 368             "non-persistent" mode (as a demo) whereas in the complex fft
 369             svremap is used in "persistent" mode, where by a complete
 370             coincidence the REMAP arguments all happen to line up and
 371             only one persistent svremap is needed.  the exact same trick
 372             *could* be applied here but for illustrative purposes it is not.
 373         """
 374         lst = SVP64Asm([
 375             "svshape 8, 1, 1, 1, 1",
 376             # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 377             "svremap 5, 1, 0, 2, 0, 0, 0",
 378             "sv.fmuls 24, *0, *8",
 379             # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 380             "svremap 26, 0, 0, 0, 0, 1, 0",
 381             "sv.ffadds *0, 24, *0",
 382             "svstep. 27, 1, 0",
 383             "bc 6, 3, -28"
 384         ])
 385         lst = list(lst)
 386
 387         # array and coefficients to test
 388         av = [7.0, -9.8, 3.0, -32.3,
 389               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 390         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 391
 392         # store in regfile
 393         fprs = [0] * 32
 394         for i, c in enumerate(coe):
 395             fprs[i+8] = fp64toselectable(c)
 396         for i, a in enumerate(av):
 397             fprs[i+0] = fp64toselectable(a)
 398
 399         # set total. err don't know how to calculate how many there are...
 400         # do it manually for now
 401         VL = 0
 402         size = 2
 403         n = len(av)
 404         while size <= n:
 405             halfsize = size // 2
 406             tablestep = n // size
 407             for i in range(0, n, size):
 408                 for j in range(i, i + halfsize):
 409                     VL += 1
 410             size *= 2
 411
 412         # SVSTATE (calculated VL)
 413         svstate = SVP64State()
 414         svstate.vl = VL  # VL
 415         svstate.maxvl = VL  # MAXVL
 416         print("SVSTATE", bin(svstate.asint()))
 417
 418         with Program(lst, bigendian=False) as program:
 419             sim = self.run_tst_program(program, svstate=svstate,
 420                                        initial_fprs=fprs)
 421             print("spr svshape0", sim.spr['SVSHAPE0'])
 422             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 423             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 424             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 425             print("spr svshape1", sim.spr['SVSHAPE1'])
 426             print("spr svshape2", sim.spr['SVSHAPE2'])
 427             print("spr svshape3", sim.spr['SVSHAPE3'])
 428
 429             # work out the results with the twin mul/add-sub
 430             res = transform_radix2(av, coe)
 431
 432             for i, expected in enumerate(res):
 433                 print("i", i, float(sim.fpr(i)), "expected", expected)
 434             for i, expected in enumerate(res):
 435                 # convert to Power single
 436                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 437                 expected = float(expected)
 438                 actual = float(sim.fpr(i))
 439                 # approximate error calculation, good enough test
 440                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 441                 # and the rounding is different
 442                 err = abs(actual - expected) / expected
 443                 self.assertTrue(err < 1e-7)
 444
 445     def test_sv_fpmadds_fft(self):
 446         """>>> lst = ["sv.ffmadds *2, *2, *2, *10"
 447                         ]
 448             four in-place vector mul-adds, four in-place vector mul-subs
 449
 450             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 451             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 452
 453             there is the *option* to target a different location (non-in-place)
 454             just in case.
 455
 456             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 457             FRS to perform the two multiplies.  one add, one subtract.
 458
 459             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 460                 fmadds  FRT   , FRA, FRC, FRA
 461                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 462
 463         """
 464         lst = SVP64Asm(["sv.ffmadds *2, *2, *2, *10"
 465                         ])
 466         lst = list(lst)
 467
 468         fprs = [0] * 32
 469         av = [7.0, -9.8, 2.0, -32.3]  # first half of array 0..3
 470         bv = [-2.0, 2.0, -9.8, 32.3]  # second half of array 4..7
 471         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 472         res = []
 473         # work out the results with the twin mul/add-sub
 474         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 475             fprs[i+2] = fp64toselectable(a)
 476             fprs[i+6] = fp64toselectable(b)
 477             fprs[i+10] = fp64toselectable(c)
 478             mul = a * c
 479             t = b + mul
 480             u = b - mul
 481             # convert to Power single
 482             t = fph.DOUBLE2SINGLE(fp64toselectable(t))
 483             u = fph.DOUBLE2SINGLE(fp64toselectable(u))  # from double
 484             res.append((t, u))
 485             print("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 486
 487         # SVSTATE (in this case, VL=2)
 488         svstate = SVP64State()
 489         svstate.vl = 4  # VL
 490         svstate.maxvl = 4  # MAXVL
 491         print("SVSTATE", bin(svstate.asint()))
 492
 493         with Program(lst, bigendian=False) as program:
 494             sim = self.run_tst_program(program, svstate=svstate,
 495                                        initial_fprs=fprs)
 496             # confirm that the results are as expected
 497             for i, (t, u) in enumerate(res):
 498                 self.assertEqual(sim.fpr(i+2), t)
 499                 self.assertEqual(sim.fpr(i+6), u)
 500
 501     def test_sv_ffadds_fft(self):
 502         """>>> lst = ["sv.ffadds *2, *2, *2"
 503                         ]
 504             four in-place vector adds, four in-place vector subs
 505
 506             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 507             FRS to perform the two multiplies.  one add, one subtract.
 508
 509             sv.ffadds FRT, FRA, FRB  actually does:
 510                 fadds FRT   , FRB, FRA
 511                 fsubs FRT+vl, FRA, FRB+vl
 512         """
 513         lst = SVP64Asm(["sv.ffadds *2, *2, *2"
 514                         ])
 515         lst = list(lst)
 516
 517         fprs = [0] * 32
 518         av = [7.0, -9.8, 2.0, -32.3]  # first half of array 0..3
 519         bv = [-2.0, 2.0, -9.8, 32.3]  # second half of array 4..7
 520         res = []
 521         # work out the results with the twin add-sub
 522         for i, (a, b) in enumerate(zip(av, bv)):
 523             fprs[i+2] = fp64toselectable(a)
 524             fprs[i+6] = fp64toselectable(b)
 525             t = b + a
 526             u = b - a
 527             # convert to Power single
 528             t = fph.DOUBLE2SINGLE(fp64toselectable(t))
 529             u = fph.DOUBLE2SINGLE(fp64toselectable(u))  # from double
 530             res.append((t, u))
 531             print("FFT", i, "in", a, b, "res", t, u)
 532
 533         # SVSTATE (in this case, VL=2)
 534         svstate = SVP64State()
 535         svstate.vl = 4  # VL
 536         svstate.maxvl = 4  # MAXVL
 537         print("SVSTATE", bin(svstate.asint()))
 538
 539         with Program(lst, bigendian=False) as program:
 540             sim = self.run_tst_program(program, svstate=svstate,
 541                                        initial_fprs=fprs)
 542             # confirm that the results are as expected
 543             for i, (t, u) in enumerate(res):
 544                 a = float(sim.fpr(i+2))
 545                 b = float(sim.fpr(i+6))
 546                 t = float(t)
 547                 u = float(u)
 548                 print("FFT", i, "in", a, b, "res", t, u)
 549             for i, (t, u) in enumerate(res):
 550                 self.assertEqual(sim.fpr(i+2), t)
 551                 self.assertEqual(sim.fpr(i+6), u)
 552
 553     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 554         """
 555             runs a full in-place O(N log2 N) butterfly schedule for
 556             Discrete Fourier Transform.  this version however uses
 557             SVP64 "Vertical-First" Mode and so needs an explicit
 558             branch, testing CR0.
 559
 560             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 561             (3 inputs, 2 outputs)
 562
 563             complex calculation (FFT):
 564
 565                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 566                 vec_r[jh] = vec_r[jl] - tpre
 567                 vec_r[jl] += tpre
 568
 569                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 570                 vec_i[jh] = vec_i[jl] - tpim
 571                 vec_i[jl] += tpim
 572
 573             real-only calculation (DFT):
 574
 575                 temp1 = vec[jh] * exptable[k]
 576                 temp2 = vec[jl]
 577                 vec[jh] = temp2 - temp1
 578                 vec[jl] = temp2 + temp1
 579
 580             note: a rather nice convenience / coincidence. the meaning of
 581             these two instructions is:
 582                 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 583                 "svremap 5, 1, 0, 2, 0, 0, 1",
 584                 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 585                 "svremap 26, 0, 0, 0, 0, 1, 1",
 586
 587             however it turns out that they can be *merged*, and for
 588             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
 589             *ignore* their REMAPs (by definition, because you can't REMAP
 590             scalar operands), and for the second one (sv.ffads) exactly the
 591             right REMAPs are also ignored!
 592
 593             therefore we can merge:
 594                 "svremap 5, 1, 0, 2, 0, 0, 1",
 595                 "svremap 26, 0, 0, 0, 0, 1, 1",
 596             into:
 597                 "svremap 31, 1, 0, 2, 0, 1, 1",
 598             and save one instruction.
 599         """
 600         lst = SVP64Asm([
 601             # set triple butterfly mode with persistent "REMAP"
 602             "svshape 8, 1, 1, 1, 1",
 603             "svremap 31, 1, 0, 2, 0, 1, 1",
 604             # tpre
 605             "sv.fmuls 24, *0, *16",    # mul1_r = r*cos_r
 606             "sv.fmadds 24, *8, *20, 24",  # mul2_r = i*sin_i
 607             # tpre = mul1_r + mul2_r
 608             # tpim
 609             "sv.fmuls 26, *0, *20",    # mul1_i = r*sin_i
 610             "sv.fmsubs 26, *8, *16, 26",  # mul2_i = i*cos_r
 611             # tpim = mul2_i - mul1_i
 612             # vec_r jh/jl
 613             "sv.ffadds *0, 24, *0",    # vh/vl +/- tpre
 614             # vec_i jh/jl
 615             "sv.ffadds *8, 26, *8",    # vh/vl +- tpim
 616
 617             # svstep loop
 618             "svstep. 27, 1, 0",
 619             "bc 6, 3, -56"
 620         ])
 621         lst = list(lst)
 622
 623         # array and coefficients to test
 624         ar = [7.0, -9.8, 3.0, -32.3,
 625               -2.0, 5.0, -9.8, 31.3]  # array 0..7 real
 626         ai = [1.0, -1.8, 3.0, 19.3,
 627               4.0, -2.0, -0.8, 1.3]  # array 0..7 imaginary
 628         coer = [-0.25, 0.5, 3.1, 6.2]  # coefficients real
 629         coei = [0.21, -0.1, 1.1, -4.0]  # coefficients imaginary
 630
 631         # store in regfile
 632         fprs = [0] * 64
 633         for i, a in enumerate(ar):
 634             fprs[i+0] = fp64toselectable(a)
 635         for i, a in enumerate(ai):
 636             fprs[i+8] = fp64toselectable(a)
 637         for i, cr in enumerate(coer):
 638             fprs[i+16] = fp64toselectable(cr)
 639         for i, ci in enumerate(coei):
 640             fprs[i+20] = fp64toselectable(ci)
 641
 642         # set total. err don't know how to calculate how many there are...
 643         # do it manually for now
 644         VL = 0
 645         size = 2
 646         n = len(ar)
 647         while size <= n:
 648             halfsize = size // 2
 649             tablestep = n // size
 650             for i in range(0, n, size):
 651                 for j in range(i, i + halfsize):
 652                     VL += 1
 653             size *= 2
 654
 655         # SVSTATE (calculated VL)
 656         svstate = SVP64State()
 657         svstate.vl = VL  # VL
 658         svstate.maxvl = VL  # MAXVL
 659         print("SVSTATE", bin(svstate.asint()))
 660
 661         with Program(lst, bigendian=False) as program:
 662             sim = self.run_tst_program(program, svstate=svstate,
 663                                        initial_fprs=fprs)
 664             print("spr svshape0", sim.spr['SVSHAPE0'])
 665             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 666             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 667             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 668             print("spr svshape1", sim.spr['SVSHAPE1'])
 669             print("spr svshape2", sim.spr['SVSHAPE2'])
 670             print("spr svshape3", sim.spr['SVSHAPE3'])
 671
 672             # work out the results with the twin mul/add-sub, explicit
 673             # complex numbers
 674             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 675
 676             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 677                 print("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 678                       "expected_r", expected_r,
 679                       "expected_i", expected_i)
 680             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 681                 # convert to Power single
 682                 expected_r = fph.DOUBLE2SINGLE(fp64toselectable(expected_r))
 683                 expected_r = float(expected_r)
 684                 actual_r = float(sim.fpr(i))
 685                 # approximate error calculation, good enough test
 686                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 687                 # and the rounding is different
 688                 err = abs(actual_r - expected_r) / expected_r
 689                 self.assertTrue(err < 1e-6)
 690                 # convert to Power single
 691                 expected_i = fph.DOUBLE2SINGLE(fp64toselectable(expected_i))
 692                 expected_i = float(expected_i)
 693                 actual_i = float(sim.fpr(i+8))
 694                 # approximate error calculation, good enough test
 695                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 696                 # and the rounding is different
 697                 err = abs(actual_i - expected_i) / expected_i
 698                 self.assertTrue(err < 1e-6)
 699
 700     def test_sv_ffadds_fft_scalar(self):
 701         """>>> lst = ["sv.ffadds *2, 12, 13"
 702                         ]
 703             four in-place vector adds and subs, but done with a scalar
 704             pair (fp12, fp13)
 705         """
 706         lst = SVP64Asm(["sv.ffadds *2, 12, 13"
 707                         ])
 708         lst = list(lst)
 709
 710         fprs = [0] * 32
 711         scalar_a = 1.3
 712         scalar_b = -2.0
 713         fprs[12] = fp64toselectable(scalar_a)
 714         fprs[13] = fp64toselectable(scalar_b)
 715         res = []
 716         # work out the results with the twin add-sub
 717         for i in range(4):
 718             t = scalar_b + scalar_a
 719             u = scalar_b - scalar_a
 720             # convert to Power single
 721             t = fph.DOUBLE2SINGLE(fp64toselectable(t))
 722             u = fph.DOUBLE2SINGLE(fp64toselectable(u))  # from double
 723             res.append((t, u))
 724             print("FFT", i, "res", t, u)
 725
 726         # SVSTATE (in this case, VL=2)
 727         svstate = SVP64State()
 728         svstate.vl = 4  # VL
 729         svstate.maxvl = 4  # MAXVL
 730         print("SVSTATE", bin(svstate.asint()))
 731
 732         with Program(lst, bigendian=False) as program:
 733             sim = self.run_tst_program(program, svstate=svstate,
 734                                        initial_fprs=fprs)
 735             # confirm that the results are as expected
 736             for i, (t, u) in enumerate(res):
 737                 a = float(sim.fpr(i+2))
 738                 b = float(sim.fpr(i+6))
 739                 t = float(t)
 740                 u = float(u)
 741                 print("FFT", i, "in", a, b, "res", t, u)
 742             for i, (t, u) in enumerate(res):
 743                 self.assertEqual(sim.fpr(i+2), t)
 744                 self.assertEqual(sim.fpr(i+6), u)
 745
 746     def test_sv_remap_fpmadds_fft_ldst(self):
 747         """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
 748                          "sv.lfs/els *0, 4(0)",
 749                          "svshape 8, 1, 1, 1, 0",
 750                          "svremap 31, 1, 0, 2, 0, 1, 0",
 751                          "sv.ffmadds *0, *0, *0, *8"
 752
 753             runs a full in-place O(N log2 N) butterfly schedule for
 754             Discrete Fourier Transform, using bit-reversed LD/ST
 755         """
 756         lst = SVP64Asm(["svshape 8, 1, 1, 15, 0",
 757                         "svremap 1, 0, 0, 0, 0, 0, 0",
 758                         "sv.lfs/els *0, 4(0)",
 759                         "svshape 8, 1, 1, 1, 0",
 760                         "svremap 31, 1, 0, 2, 0, 1, 0",
 761                         "sv.ffmadds *0, *0, *0, *8"
 762                         ])
 763         lst = list(lst)
 764
 765         # array and coefficients to test
 766         av = [7.0, -9.8, 3.0, -32.3,
 767               -2.0, 5.0, -9.8, 31.3]  # array 0..7
 768         coe = [-0.25, 0.5, 3.1, 6.2]  # coefficients
 769
 770         # store in regfile
 771         fprs = [0] * 32
 772         for i, c in enumerate(coe):
 773             fprs[i+8] = fp64toselectable(c)
 774         # store in memory
 775         mem = {}
 776         val = 0
 777         for i, a in enumerate(av):
 778             a = SINGLE(fp64toselectable(a)).value
 779             shift = (i % 2) == 1
 780             if shift == 0:
 781                 val = a
 782             else:
 783                 mem[(i//2)*8] = val | (a << 32)
 784
 785         with Program(lst, bigendian=False) as program:
 786             sim = self.run_tst_program(program, initial_mem=mem,
 787                                        initial_fprs=fprs)
 788             print("spr svshape0", sim.spr['SVSHAPE0'])
 789             print("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 790             print("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 791             print("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 792             print("spr svshape1", sim.spr['SVSHAPE1'])
 793             print("spr svshape2", sim.spr['SVSHAPE2'])
 794             print("spr svshape3", sim.spr['SVSHAPE3'])
 795
 796             print("mem dump")
 797             print(sim.mem.dump())
 798
 799             # work out the results with the twin mul/add-sub,
 800             # note bit-reverse mode requested
 801             res = transform_radix2(av, coe, reverse=True)
 802
 803             for i, expected in enumerate(res):
 804                 print("i", i, float(sim.fpr(i)), "expected", expected)
 805             for i, expected in enumerate(res):
 806                 # convert to Power single
 807                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 808                 expected = float(expected)
 809                 actual = float(sim.fpr(i))
 810                 # approximate error calculation, good enough test
 811                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 812                 # and the rounding is different
 813                 err = abs(actual - expected) / expected
 814                 self.assertTrue(err < 1e-6)
 815
 816     def run_tst_program(self, prog, initial_regs=None,
 817                         svstate=None,
 818                         initial_mem=None,
 819                         initial_fprs=None):
 820         if initial_regs is None:
 821             initial_regs = [0] * 32
 822         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 823                             initial_fprs=initial_fprs,
 824                             svstate=svstate)
 825
 826         print("GPRs")
 827         simulator.gpr.dump()
 828         print("FPRs")
 829         simulator.fpr.dump()
 830
 831         return simulator
 832
 833
# When executed as a script (rather than imported), hand control to
# unittest's command-line runner.
if __name__ == "__main__":
    unittest.main()