src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 from nmigen import Module, Signal
   2 from nmigen.back.pysim import Simulator, Delay, Settle
   3 from nmutil.formaltest import FHDLTestCase
   4 import unittest
   5 from openpower.decoder.power_decoder import (create_pdecode)
   6 from openpower.simulator.program import Program
   7 from openpower.decoder.isa.caller import SVP64State
   8 from openpower.decoder.selectable_int import SelectableInt
   9 from openpower.decoder.isa.test_caller import run_tst
  10 from openpower.sv.trans.svp64 import SVP64Asm
  11 from copy import deepcopy
  12 from openpower.decoder.helpers import fp64toselectable, SINGLE
  13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
  14
  15
  16 def transform_radix2(vec, exptable, reverse=False):
  17     """
  18     # FFT and convolution test (Python), based on Project Nayuki
  19     #
  20     # Copyright (c) 2020 Project Nayuki. (MIT License)
  21     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  22
  23     """
  24     # bits of the integer 'val'.
  25     def reverse_bits(val, width):
  26         result = 0
  27         for _ in range(width):
  28             result = (result << 1) | (val & 1)
  29             val >>= 1
  30         return result
  31
  32     # Initialization
  33     n = len(vec)
  34     levels = n.bit_length() - 1
  35
  36     # Copy with bit-reversed permutation
  37     if reverse:
  38         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  39
  40     size = 2
  41     while size <= n:
  42         halfsize = size // 2
  43         tablestep = n // size
  44         for i in range(0, n, size):
  45             k = 0
  46             for j in range(i, i + halfsize):
  47                 # exact same actual computation, just embedded in
  48                 # triple-nested for-loops
  49                 jl, jh = j, j+halfsize
  50                 vjh = vec[jh]
  51                 temp1 = vec[jh] * exptable[k]
  52                 temp2 = vec[jl]
  53                 vec[jh] = temp2 - temp1
  54                 vec[jl] = temp2 + temp1
  55                 print ("xform jl jh k", jl, jh, k,
  56                        "vj vjh ek", temp2, vjh, exptable[k],
  57                        "t1, t2", temp1, temp2,
  58                        "v[jh] v[jl]", vec[jh], vec[jl])
  59                 k += tablestep
  60         size *= 2
  61
  62     return vec
  63
  64
  65 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
  66     """
  67     # FFT and convolution test (Python), based on Project Nayuki
  68     #
  69     # Copyright (c) 2020 Project Nayuki. (MIT License)
  70     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  71
  72     """
  73     # bits of the integer 'val'.
  74     def reverse_bits(val, width):
  75         result = 0
  76         for _ in range(width):
  77             result = (result << 1) | (val & 1)
  78             val >>= 1
  79         return result
  80
  81     # Initialization
  82     n = len(vec_r)
  83     levels = n.bit_length() - 1
  84
  85     # Copy with bit-reversed permutation
  86     if reverse:
  87         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  88
  89     size = 2
  90     while size <= n:
  91         halfsize = size // 2
  92         tablestep = n // size
  93         for i in range(0, n, size):
  94             k = 0
  95             for j in range(i, i + halfsize):
  96                 # exact same actual computation, just embedded in
  97                 # triple-nested for-loops
  98                 jl, jh = j, j+halfsize
  99
 100                 print ("xform jl jh k", jl, jh, k,
 101                         "vr h l", vec_r[jh], vec_r[jl],
 102                         "vi h l", vec_i[jh], vec_i[jl])
 103                 print ("    cr k", cos_r[k], "si k", sin_i[k])
 104                 mul1_r =  vec_r[jh] * cos_r[k]
 105                 mul2_r = vec_i[jh] * sin_i[k]
 106                 tpre =  mul1_r + mul2_r
 107                 print ("        vec_r[jh] * cos_r[k]", mul1_r)
 108                 print ("        vec_i[jh] * sin_i[k]", mul2_r)
 109                 print ("    tpre", tpre)
 110                 mul1_i = vec_r[jh] * sin_i[k]
 111                 mul2_i = vec_i[jh] * cos_r[k]
 112                 tpim = -mul1_i + mul2_i
 113                 print ("        vec_r[jh] * sin_i[k]", mul1_i)
 114                 print ("        vec_i[jh] * cos_r[k]", mul2_i)
 115                 print ("    tpim", tpim)
 116                 vec_r[jh] = vec_r[jl] - tpre
 117                 vec_i[jh] = vec_i[jl] - tpim
 118                 vec_r[jl] += tpre
 119                 vec_i[jl] += tpim
 120
 121                 print ("    xform jl jh k", jl, jh, k,
 122                         "\n       vr h l", vec_r[jh], vec_r[jl],
 123                         "\n       vi h l", vec_i[jh], vec_i[jl])
 124                 k += tablestep
 125         size *= 2
 126
 127     return vec_r, vec_i
 128
 129
 130 class FFTTestCase(FHDLTestCase):
 131
 132     def _check_regs(self, sim, expected):
 133         for i in range(32):
 134             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 135
 136     def test_sv_remap_fpmadds_fft(self):
 137         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 138                      "svremap 31, 1, 0, 2, 0, 1, 0",
 139                       "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 140                      ]
 141             runs a full in-place O(N log2 N) butterfly schedule for
 142             Discrete Fourier Transform.
 143
 144             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 145             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 146
 147             there is the *option* to target a different location (non-in-place)
 148             just in case.
 149
 150             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 151             (3 inputs, 2 outputs)
 152         """
 153         lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
 154                          "svremap 31, 1, 0, 2, 0, 1, 0",
 155                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 156                         ])
 157         lst = list(lst)
 158
 159         # array and coefficients to test
 160         av = [7.0, -9.8, 3.0, -32.3,
 161               -2.0, 5.0, -9.8, 31.3] # array 0..7
 162         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 163
 164         # store in regfile
 165         fprs = [0] * 32
 166         for i, c in enumerate(coe):
 167             fprs[i+8] = fp64toselectable(c)
 168         for i, a in enumerate(av):
 169             fprs[i+0] = fp64toselectable(a)
 170
 171         with Program(lst, bigendian=False) as program:
 172             sim = self.run_tst_program(program, initial_fprs=fprs)
 173             print ("spr svshape0", sim.spr['SVSHAPE0'])
 174             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 175             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 176             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 177             print ("spr svshape1", sim.spr['SVSHAPE1'])
 178             print ("spr svshape2", sim.spr['SVSHAPE2'])
 179             print ("spr svshape3", sim.spr['SVSHAPE3'])
 180
 181             # work out the results with the twin mul/add-sub
 182             res = transform_radix2(av, coe)
 183
 184             for i, expected in enumerate(res):
 185                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 186             for i, expected in enumerate(res):
 187                 # convert to Power single
 188                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 189                 expected = float(expected)
 190                 actual = float(sim.fpr(i))
 191                 # approximate error calculation, good enough test
 192                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 193                 # and the rounding is different
 194                 err = abs(actual - expected) / expected
 195                 self.assertTrue(err < 1e-7)
 196
 197     def test_sv_remap_fpmadds_fft_svstep(self):
 198         """>>> lst = SVP64Asm( [
 199                             "svshape 8, 1, 1, 1, 1",
 200                              "svremap 31, 1, 0, 2, 0, 1, 0",
 201                             "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 202                             "setvl. 0, 0, 1, 1, 0, 0",
 203                             "bc 4, 2, -16"
 204                             ])
 205             runs a full in-place O(N log2 N) butterfly schedule for
 206             Discrete Fourier Transform.  this version however uses
 207             SVP64 "Vertical-First" Mode and so needs an explicit
 208             branch, testing CR0.
 209
 210             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 211             (3 inputs, 2 outputs)
 212         """
 213         lst = SVP64Asm( [
 214                         "svshape 8, 1, 1, 1, 1",
 215                          "svremap 31, 1, 0, 2, 0, 1, 0",
 216                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 217                         "setvl. 0, 0, 1, 1, 0, 0",
 218                         "bc 4, 2, -16"
 219                         ])
 220         lst = list(lst)
 221
 222         # array and coefficients to test
 223         av = [7.0, -9.8, 3.0, -32.3,
 224               -2.0, 5.0, -9.8, 31.3] # array 0..7
 225         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 226
 227         # store in regfile
 228         fprs = [0] * 32
 229         for i, c in enumerate(coe):
 230             fprs[i+8] = fp64toselectable(c)
 231         for i, a in enumerate(av):
 232             fprs[i+0] = fp64toselectable(a)
 233
 234         # set total. err don't know how to calculate how many there are...
 235         # do it manually for now
 236         VL = 0
 237         size = 2
 238         n = len(av)
 239         while size <= n:
 240             halfsize = size // 2
 241             tablestep = n // size
 242             for i in range(0, n, size):
 243                 for j in range(i, i + halfsize):
 244                     VL += 1
 245             size *= 2
 246
 247         # SVSTATE (calculated VL)
 248         svstate = SVP64State()
 249         svstate.vl = VL # VL
 250         svstate.maxvl = VL # MAXVL
 251         print ("SVSTATE", bin(svstate.asint()))
 252
 253         with Program(lst, bigendian=False) as program:
 254             sim = self.run_tst_program(program, svstate=svstate,
 255                                        initial_fprs=fprs)
 256             print ("spr svshape0", sim.spr['SVSHAPE0'])
 257             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 258             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 259             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 260             print ("spr svshape1", sim.spr['SVSHAPE1'])
 261             print ("spr svshape2", sim.spr['SVSHAPE2'])
 262             print ("spr svshape3", sim.spr['SVSHAPE3'])
 263
 264             # work out the results with the twin mul/add-sub
 265             res = transform_radix2(av, coe)
 266
 267             for i, expected in enumerate(res):
 268                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 269             for i, expected in enumerate(res):
 270                 # convert to Power single
 271                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 272                 expected = float(expected)
 273                 actual = float(sim.fpr(i))
 274                 # approximate error calculation, good enough test
 275                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 276                 # and the rounding is different
 277                 err = abs(actual - expected) / expected
 278                 self.assertTrue(err < 1e-7)
 279
 280     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 281         """>>> lst = SVP64Asm( [
 282                         "svshape 8, 1, 1, 1, 1",
 283                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 284                          "svremap 5, 1, 0, 2, 0, 0, 1",
 285                          "sv.fmuls 24, 0.v, 8.v",
 286                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 287                          "svremap 26, 0, 0, 0, 0, 1, 1",
 288                         "sv.ffadds 0.v, 24, 0.v",
 289                         "setvl. 0, 0, 1, 1, 0, 0",
 290                         "bc 4, 2, -28"
 291                             ])
 292
 293             runs a full in-place O(N log2 N) butterfly schedule for
 294             Discrete Fourier Transform.  also uses "Vertical First"
 295             but also uses temporary scalars and ffadds rather than
 296             sv.ffmadds.
 297
 298             this represents an incremental step towards complex FFT
 299
 300             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 301
 302             * single fmuls FRT, FRA, FRC
 303             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 304               (FRS is implicit / hidden in ff* operations)
 305
 306             multiply:                         # sv.fmuls FRT, FRA, FRC
 307                 temp1 = vec[jh] * exptable[k]
 308                 temp2 = vec[jl]
 309             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 310                 vec[jh] = temp2 - temp1
 311                 vec[jl] = temp2 + temp1
 312
 313             also see notes in complex fft test: here svremap is done in
 314             "non-persistent" mode (as a demo) whereas in the complex fft
 315             svremap is used in "persistent" mode, where by a complete
 316             coincidence the REMAP arguments all happen to line up and
 317             only one persistent svremap is needed.  the exact same trick
 318             *could* be applied here but for illustrative purposes it is not.
 319         """
 320         lst = SVP64Asm( [
 321                         "svshape 8, 1, 1, 1, 1",
 322                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 323                          "svremap 5, 1, 0, 2, 0, 0, 0",
 324                          "sv.fmuls 24, 0.v, 8.v",
 325                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 326                          "svremap 26, 0, 0, 0, 0, 1, 0",
 327                         "sv.ffadds 0.v, 24, 0.v",
 328                         "setvl. 0, 0, 1, 1, 0, 0",
 329                         "bc 4, 2, -28"
 330                         ])
 331         lst = list(lst)
 332
 333         # array and coefficients to test
 334         av = [7.0, -9.8, 3.0, -32.3,
 335               -2.0, 5.0, -9.8, 31.3] # array 0..7
 336         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 337
 338         # store in regfile
 339         fprs = [0] * 32
 340         for i, c in enumerate(coe):
 341             fprs[i+8] = fp64toselectable(c)
 342         for i, a in enumerate(av):
 343             fprs[i+0] = fp64toselectable(a)
 344
 345         # set total. err don't know how to calculate how many there are...
 346         # do it manually for now
 347         VL = 0
 348         size = 2
 349         n = len(av)
 350         while size <= n:
 351             halfsize = size // 2
 352             tablestep = n // size
 353             for i in range(0, n, size):
 354                 for j in range(i, i + halfsize):
 355                     VL += 1
 356             size *= 2
 357
 358         # SVSTATE (calculated VL)
 359         svstate = SVP64State()
 360         svstate.vl = VL # VL
 361         svstate.maxvl = VL # MAXVL
 362         print ("SVSTATE", bin(svstate.asint()))
 363
 364         with Program(lst, bigendian=False) as program:
 365             sim = self.run_tst_program(program, svstate=svstate,
 366                                        initial_fprs=fprs)
 367             print ("spr svshape0", sim.spr['SVSHAPE0'])
 368             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 369             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 370             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 371             print ("spr svshape1", sim.spr['SVSHAPE1'])
 372             print ("spr svshape2", sim.spr['SVSHAPE2'])
 373             print ("spr svshape3", sim.spr['SVSHAPE3'])
 374
 375             # work out the results with the twin mul/add-sub
 376             res = transform_radix2(av, coe)
 377
 378             for i, expected in enumerate(res):
 379                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 380             for i, expected in enumerate(res):
 381                 # convert to Power single
 382                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 383                 expected = float(expected)
 384                 actual = float(sim.fpr(i))
 385                 # approximate error calculation, good enough test
 386                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 387                 # and the rounding is different
 388                 err = abs(actual - expected) / expected
 389                 self.assertTrue(err < 1e-7)
 390
 391     def test_sv_fpmadds_fft(self):
 392         """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 393                         ]
 394             four in-place vector mul-adds, four in-place vector mul-subs
 395
 396             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 397             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 398
 399             there is the *option* to target a different location (non-in-place)
 400             just in case.
 401
 402             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 403             FRS to perform the two multiplies.  one add, one subtract.
 404
 405             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 406                 fmadds  FRT   , FRA, FRC, FRA
 407                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 408
 409         """
 410         lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 411                         ])
 412         lst = list(lst)
 413
 414         fprs = [0] * 32
 415         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 416         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 417         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 418         res = []
 419         # work out the results with the twin mul/add-sub
 420         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 421             fprs[i+2] = fp64toselectable(a)
 422             fprs[i+6] = fp64toselectable(b)
 423             fprs[i+10] = fp64toselectable(c)
 424             mul = a * c
 425             t = b + mul
 426             u = b - mul
 427             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 428             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 429             res.append((t, u))
 430             print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 431
 432         # SVSTATE (in this case, VL=2)
 433         svstate = SVP64State()
 434         svstate.vl = 4 # VL
 435         svstate.maxvl = 4 # MAXVL
 436         print ("SVSTATE", bin(svstate.asint()))
 437
 438         with Program(lst, bigendian=False) as program:
 439             sim = self.run_tst_program(program, svstate=svstate,
 440                                        initial_fprs=fprs)
 441             # confirm that the results are as expected
 442             for i, (t, u) in enumerate(res):
 443                 self.assertEqual(sim.fpr(i+2), t)
 444                 self.assertEqual(sim.fpr(i+6), u)
 445
 446     def test_sv_ffadds_fft(self):
 447         """>>> lst = ["sv.ffadds 2.v, 2.v, 2.v"
 448                         ]
 449             four in-place vector adds, four in-place vector subs
 450
 451             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 452             FRS to perform the two multiplies.  one add, one subtract.
 453
 454             sv.ffadds FRT, FRA, FRB  actually does:
 455                 fadds FRT   , FRB, FRA
 456                 fsubs FRT+vl, FRA, FRB+vl
 457         """
 458         lst = SVP64Asm(["sv.ffadds 2.v, 2.v, 2.v"
 459                         ])
 460         lst = list(lst)
 461
 462         fprs = [0] * 32
 463         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 464         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 465         res = []
 466         # work out the results with the twin add-sub
 467         for i, (a, b) in enumerate(zip(av, bv)):
 468             fprs[i+2] = fp64toselectable(a)
 469             fprs[i+6] = fp64toselectable(b)
 470             t = b + a
 471             u = b - a
 472             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 473             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 474             res.append((t, u))
 475             print ("FFT", i, "in", a, b, "res", t, u)
 476
 477         # SVSTATE (in this case, VL=2)
 478         svstate = SVP64State()
 479         svstate.vl = 4 # VL
 480         svstate.maxvl = 4 # MAXVL
 481         print ("SVSTATE", bin(svstate.asint()))
 482
 483         with Program(lst, bigendian=False) as program:
 484             sim = self.run_tst_program(program, svstate=svstate,
 485                                        initial_fprs=fprs)
 486             # confirm that the results are as expected
 487             for i, (t, u) in enumerate(res):
 488                 a = float(sim.fpr(i+2))
 489                 b = float(sim.fpr(i+6))
 490                 t = float(t)
 491                 u = float(u)
 492                 print ("FFT", i, "in", a, b, "res", t, u)
 493             for i, (t, u) in enumerate(res):
 494                 self.assertEqual(sim.fpr(i+2), t)
 495                 self.assertEqual(sim.fpr(i+6), u)
 496
 497     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 498         """
 499             runs a full in-place O(N log2 N) butterfly schedule for
 500             Discrete Fourier Transform.  this version however uses
 501             SVP64 "Vertical-First" Mode and so needs an explicit
 502             branch, testing CR0.
 503
 504             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 505             (3 inputs, 2 outputs)
 506
 507             complex calculation (FFT):
 508
 509                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 510                 vec_r[jh] = vec_r[jl] - tpre
 511                 vec_r[jl] += tpre
 512
 513                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 514                 vec_i[jh] = vec_i[jl] - tpim
 515                 vec_i[jl] += tpim
 516
 517             real-only calculation (DFT):
 518
 519                 temp1 = vec[jh] * exptable[k]
 520                 temp2 = vec[jl]
 521                 vec[jh] = temp2 - temp1
 522                 vec[jl] = temp2 + temp1
 523
 524             note: a rather nice convenience / coincidence. the meaning of
 525             these two instructions is:
 526                 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 527                 "svremap 5, 1, 0, 2, 0, 0, 1",
 528                 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 529                 "svremap 26, 0, 0, 0, 0, 1, 1",
 530
 531             however it turns out that they can be *merged*, and for
 532             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
 533             *ignore* their REMAPs (by definition), and for the second
 534             one (sv.ffads) exactly the right REMAPs are also ignored!
 535
 536                 "svremap 31, 1, 0, 2, 0, 1, 1",
 537         """
 538         lst = SVP64Asm( [
 539                         # set triple butterfly mode with persistent "REMAP"
 540                         "svshape 8, 1, 1, 1, 1",
 541                         "svremap 31, 1, 0, 2, 0, 1, 1",
 542                         # tpre
 543                         "sv.fmuls 24, 0.v, 16.v",    # mul1_r = r*cos_r
 544                         "sv.fmadds 24, 8.v, 20.v, 24", # mul2_r = i*sin_i
 545                                                      # tpre = mul1_r + mul2_r
 546                         # tpim
 547                         "sv.fmuls 26, 0.v, 20.v",    # mul1_i = r*sin_i
 548                         "sv.fmsubs 26, 8.v, 16.v, 26", # mul2_i = i*cos_r
 549                                                      # tpim = mul2_i - mul1_i
 550                         # vec_r jh/jl
 551                         "sv.ffadds 0.v, 24, 0.v",    # vh/vl +/- tpre
 552                         # vec_i jh/jl
 553                         "sv.ffadds 8.v, 26, 8.v",    # vh/vl +- tpim
 554
 555                         # svstep loop
 556                         "setvl. 0, 0, 1, 1, 0, 0",
 557                         "bc 4, 2, -56"
 558                         ])
 559         lst = list(lst)
 560
 561         # array and coefficients to test
 562         ar = [7.0, -9.8, 3.0, -32.3,
 563               -2.0, 5.0, -9.8, 31.3] # array 0..7 real
 564         ai = [1.0, -1.8, 3.0, 19.3,
 565               4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
 566         coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
 567         coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
 568
 569         # store in regfile
 570         fprs = [0] * 64
 571         for i, a in enumerate(ar):
 572             fprs[i+0] = fp64toselectable(a)
 573         for i, a in enumerate(ai):
 574             fprs[i+8] = fp64toselectable(a)
 575         for i, cr in enumerate(coer):
 576             fprs[i+16] = fp64toselectable(cr)
 577         for i, ci in enumerate(coei):
 578             fprs[i+20] = fp64toselectable(ci)
 579
 580         # set total. err don't know how to calculate how many there are...
 581         # do it manually for now
 582         VL = 0
 583         size = 2
 584         n = len(ar)
 585         while size <= n:
 586             halfsize = size // 2
 587             tablestep = n // size
 588             for i in range(0, n, size):
 589                 for j in range(i, i + halfsize):
 590                     VL += 1
 591             size *= 2
 592
 593         # SVSTATE (calculated VL)
 594         svstate = SVP64State()
 595         svstate.vl = VL # VL
 596         svstate.maxvl = VL # MAXVL
 597         print ("SVSTATE", bin(svstate.asint()))
 598
 599         with Program(lst, bigendian=False) as program:
 600             sim = self.run_tst_program(program, svstate=svstate,
 601                                        initial_fprs=fprs)
 602             print ("spr svshape0", sim.spr['SVSHAPE0'])
 603             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 604             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 605             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 606             print ("spr svshape1", sim.spr['SVSHAPE1'])
 607             print ("spr svshape2", sim.spr['SVSHAPE2'])
 608             print ("spr svshape3", sim.spr['SVSHAPE3'])
 609
 610             # work out the results with the twin mul/add-sub, explicit
 611             # complex numbers
 612             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 613
 614             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 615                 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 616                        "expected_r", expected_r,
 617                        "expected_i", expected_i)
 618             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 619                 # convert to Power single
 620                 expected_r = DOUBLE2SINGLE(fp64toselectable(expected_r ))
 621                 expected_r = float(expected_r)
 622                 actual_r = float(sim.fpr(i))
 623                 # approximate error calculation, good enough test
 624                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 625                 # and the rounding is different
 626                 err = abs(actual_r - expected_r ) / expected_r
 627                 self.assertTrue(err < 1e-6)
 628                 # convert to Power single
 629                 expected_i = DOUBLE2SINGLE(fp64toselectable(expected_i ))
 630                 expected_i = float(expected_i)
 631                 actual_i = float(sim.fpr(i+8))
 632                 # approximate error calculation, good enough test
 633                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 634                 # and the rounding is different
 635                 err = abs(actual_i - expected_i ) / expected_i
 636                 self.assertTrue(err < 1e-6)
 637
 638     def test_sv_ffadds_fft_scalar(self):
 639         """>>> lst = ["sv.ffadds 2.v, 12, 13"
 640                         ]
 641             four in-place vector adds and subs, but done with a scalar
 642             pair (fp12, fp13)
 643         """
 644         lst = SVP64Asm(["sv.ffadds 2.v, 12, 13"
 645                         ])
 646         lst = list(lst)
 647
 648         fprs = [0] * 32
 649         scalar_a = 1.3
 650         scalar_b = -2.0
 651         fprs[12] = fp64toselectable(scalar_a)
 652         fprs[13] = fp64toselectable(scalar_b)
 653         res = []
 654         # work out the results with the twin add-sub
 655         for i in range(4):
 656             t = scalar_b + scalar_a
 657             u = scalar_b - scalar_a
 658             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 659             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 660             res.append((t, u))
 661             print ("FFT", i, "res", t, u)
 662
 663         # SVSTATE (in this case, VL=2)
 664         svstate = SVP64State()
 665         svstate.vl = 4 # VL
 666         svstate.maxvl = 4 # MAXVL
 667         print ("SVSTATE", bin(svstate.asint()))
 668
 669         with Program(lst, bigendian=False) as program:
 670             sim = self.run_tst_program(program, svstate=svstate,
 671                                        initial_fprs=fprs)
 672             # confirm that the results are as expected
 673             for i, (t, u) in enumerate(res):
 674                 a = float(sim.fpr(i+2))
 675                 b = float(sim.fpr(i+6))
 676                 t = float(t)
 677                 u = float(u)
 678                 print ("FFT", i, "in", a, b, "res", t, u)
 679             for i, (t, u) in enumerate(res):
 680                 self.assertEqual(sim.fpr(i+2), t)
 681                 self.assertEqual(sim.fpr(i+6), u)
 682
 683     def test_sv_remap_fpmadds_fft_ldst(self):
 684         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 685                      "svremap 31, 1, 0, 2, 0, 1, 0",
 686                       "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 687                      ]
 688             runs a full in-place O(N log2 N) butterfly schedule for
 689             Discrete Fourier Transform, using bit-reversed LD/ST
 690         """
 691         lst = SVP64Asm( ["setvl 0, 0, 8, 0, 1, 1",
 692                          "sv.lfsbr 0.v, 4(0), 20", # bit-reversed
 693                          "svshape 8, 1, 1, 1, 0",
 694                          "svremap 31, 1, 0, 2, 0, 1, 0",
 695                          "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 696                         ])
 697         lst = list(lst)
 698
 699         # array and coefficients to test
 700         av = [7.0, -9.8, 3.0, -32.3,
 701               -2.0, 5.0, -9.8, 31.3] # array 0..7
 702         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 703
 704         # store in regfile
 705         fprs = [0] * 32
 706         for i, c in enumerate(coe):
 707             fprs[i+8] = fp64toselectable(c)
 708         # store in memory
 709         mem = {}
 710         val = 0
 711         for i, a in enumerate(av):
 712             a = SINGLE(fp64toselectable(a)).value
 713             shift = (i % 2) == 1
 714             if shift == 0:
 715                 val = a
 716             else:
 717                 mem[(i//2)*8] = val | (a << 32)
 718
 719         with Program(lst, bigendian=False) as program:
 720             sim = self.run_tst_program(program, initial_mem=mem,
 721                                                 initial_fprs=fprs)
 722             print ("spr svshape0", sim.spr['SVSHAPE0'])
 723             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 724             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 725             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 726             print ("spr svshape1", sim.spr['SVSHAPE1'])
 727             print ("spr svshape2", sim.spr['SVSHAPE2'])
 728             print ("spr svshape3", sim.spr['SVSHAPE3'])
 729
 730             print ("mem dump")
 731             print (sim.mem.dump())
 732
 733             # work out the results with the twin mul/add-sub
 734             res = transform_radix2(av, coe, reverse=True)
 735
 736             for i, expected in enumerate(res):
 737                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 738             for i, expected in enumerate(res):
 739                 # convert to Power single
 740                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 741                 expected = float(expected)
 742                 actual = float(sim.fpr(i))
 743                 # approximate error calculation, good enough test
 744                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 745                 # and the rounding is different
 746                 err = abs(actual - expected) / expected
 747                 self.assertTrue(err < 1e-6)
 748
 749     def run_tst_program(self, prog, initial_regs=None,
 750                               svstate=None,
 751                               initial_mem=None,
 752                               initial_fprs=None):
 753         if initial_regs is None:
 754             initial_regs = [0] * 32
 755         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 756                                                 initial_fprs=initial_fprs,
 757                                                 svstate=svstate)
 758
 759         print ("GPRs")
 760         simulator.gpr.dump()
 761         print ("FPRs")
 762         simulator.fpr.dump()
 763
 764         return simulator
 765
 766
 767 if __name__ == "__main__":
 768     unittest.main()