src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 from nmigen import Module, Signal
   2 from nmigen.back.pysim import Simulator, Delay, Settle
   3 from nmutil.formaltest import FHDLTestCase
   4 import unittest
   5 from openpower.decoder.power_decoder import (create_pdecode)
   6 from openpower.simulator.program import Program
   7 from openpower.decoder.isa.caller import SVP64State
   8 from openpower.decoder.selectable_int import SelectableInt
   9 from openpower.decoder.isa.test_caller import run_tst
  10 from openpower.sv.trans.svp64 import SVP64Asm
  11 from copy import deepcopy
  12 from openpower.decoder.helpers import fp64toselectable, SINGLE
  13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
  14
  15
  16 def transform_radix2(vec, exptable, reverse=False):
  17     """
  18     # FFT and convolution test (Python), based on Project Nayuki
  19     #
  20     # Copyright (c) 2020 Project Nayuki. (MIT License)
  21     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  22
  23     """
  24     # bits of the integer 'val'.
  25     def reverse_bits(val, width):
  26         result = 0
  27         for _ in range(width):
  28             result = (result << 1) | (val & 1)
  29             val >>= 1
  30         return result
  31
  32     # Initialization
  33     n = len(vec)
  34     levels = n.bit_length() - 1
  35
  36     # Copy with bit-reversed permutation
  37     if reverse:
  38         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  39
  40     size = 2
  41     while size <= n:
  42         halfsize = size // 2
  43         tablestep = n // size
  44         for i in range(0, n, size):
  45             k = 0
  46             for j in range(i, i + halfsize):
  47                 # exact same actual computation, just embedded in
  48                 # triple-nested for-loops
  49                 jl, jh = j, j+halfsize
  50                 vjh = vec[jh]
  51                 temp1 = vec[jh] * exptable[k]
  52                 temp2 = vec[jl]
  53                 vec[jh] = temp2 - temp1
  54                 vec[jl] = temp2 + temp1
  55                 print ("xform jl jh k", jl, jh, k,
  56                        "vj vjh ek", temp2, vjh, exptable[k],
  57                        "t1, t2", temp1, temp2,
  58                        "v[jh] v[jl]", vec[jh], vec[jl])
  59                 k += tablestep
  60         size *= 2
  61
  62     return vec
  63
  64
  65 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
  66     """
  67     # FFT and convolution test (Python), based on Project Nayuki
  68     #
  69     # Copyright (c) 2020 Project Nayuki. (MIT License)
  70     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  71
  72     """
  73     # bits of the integer 'val'.
  74     def reverse_bits(val, width):
  75         result = 0
  76         for _ in range(width):
  77             result = (result << 1) | (val & 1)
  78             val >>= 1
  79         return result
  80
  81     # Initialization
  82     n = len(vec_r)
  83     levels = n.bit_length() - 1
  84
  85     # Copy with bit-reversed permutation
  86     if reverse:
  87         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  88
  89     size = 2
  90     while size <= n:
  91         halfsize = size // 2
  92         tablestep = n // size
  93         for i in range(0, n, size):
  94             k = 0
  95             for j in range(i, i + halfsize):
  96                 # exact same actual computation, just embedded in
  97                 # triple-nested for-loops
  98                 jl, jh = j, j+halfsize
  99
 100                 print ("xform jl jh k", jl, jh, k,
 101                         "vr h l", vec_r[jh], vec_r[jl],
 102                         "vi h l", vec_i[jh], vec_i[jl])
 103                 print ("    cr k", cos_r[k], "si k", sin_i[k])
 104                 mul1_r =  vec_r[jh] * cos_r[k]
 105                 mul2_r = vec_i[jh] * sin_i[k]
 106                 tpre =  mul1_r + mul2_r
 107                 print ("        vec_r[jh] * cos_r[k]", mul1_r)
 108                 print ("        vec_i[jh] * sin_i[k]", mul2_r)
 109                 print ("    tpre", tpre)
 110                 mul1_i = vec_r[jh] * sin_i[k]
 111                 mul2_i = vec_i[jh] * cos_r[k]
 112                 tpim = -mul1_i + mul2_i
 113                 print ("        vec_r[jh] * sin_i[k]", mul1_i)
 114                 print ("        vec_i[jh] * cos_r[k]", mul2_i)
 115                 print ("    tpim", tpim)
 116                 vec_r[jh] = vec_r[jl] - tpre
 117                 vec_i[jh] = vec_i[jl] - tpim
 118                 vec_r[jl] += tpre
 119                 vec_i[jl] += tpim
 120
 121                 print ("    xform jl jh k", jl, jh, k,
 122                         "\n       vr h l", vec_r[jh], vec_r[jl],
 123                         "\n       vi h l", vec_i[jh], vec_i[jl])
 124                 k += tablestep
 125         size *= 2
 126
 127     return vec_r, vec_i
 128
 129
 130 class FFTTestCase(FHDLTestCase):
 131
 132     def _check_regs(self, sim, expected):
 133         for i in range(32):
 134             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 135
 136     def test_sv_remap_fpmadds_fft(self):
 137         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 138                      "svremap 31, 1, 0, 2, 0, 1, 0",
 139                       "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 140                      ]
 141             runs a full in-place O(N log2 N) butterfly schedule for
 142             Discrete Fourier Transform.
 143
 144             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 145             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 146
 147             there is the *option* to target a different location (non-in-place)
 148             just in case.
 149
 150             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 151             (3 inputs, 2 outputs)
 152         """
 153         lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
 154                          "svremap 31, 1, 0, 2, 0, 1, 0",
 155                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 156                         ])
 157         lst = list(lst)
 158
 159         # array and coefficients to test
 160         av = [7.0, -9.8, 3.0, -32.3,
 161               -2.0, 5.0, -9.8, 31.3] # array 0..7
 162         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 163
 164         # store in regfile
 165         fprs = [0] * 32
 166         for i, c in enumerate(coe):
 167             fprs[i+8] = fp64toselectable(c)
 168         for i, a in enumerate(av):
 169             fprs[i+0] = fp64toselectable(a)
 170
 171         with Program(lst, bigendian=False) as program:
 172             sim = self.run_tst_program(program, initial_fprs=fprs)
 173             print ("spr svshape0", sim.spr['SVSHAPE0'])
 174             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 175             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 176             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 177             print ("spr svshape1", sim.spr['SVSHAPE1'])
 178             print ("spr svshape2", sim.spr['SVSHAPE2'])
 179             print ("spr svshape3", sim.spr['SVSHAPE3'])
 180
 181             # work out the results with the twin mul/add-sub
 182             res = transform_radix2(av, coe)
 183
 184             for i, expected in enumerate(res):
 185                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 186             for i, expected in enumerate(res):
 187                 # convert to Power single
 188                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 189                 expected = float(expected)
 190                 actual = float(sim.fpr(i))
 191                 # approximate error calculation, good enough test
 192                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 193                 # and the rounding is different
 194                 err = abs(actual - expected) / expected
 195                 self.assertTrue(err < 1e-7)
 196
 197     def test_sv_remap_fpmadds_fft_svstep(self):
 198         """>>> lst = SVP64Asm( [
 199                             "svshape 8, 1, 1, 1, 1",
 200                              "svremap 31, 1, 0, 2, 0, 1, 0",
 201                             "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 202                             "setvl. 0, 0, 1, 1, 0, 0",
 203                             "bc 6, 3, -16"
 204                             ])
 205             runs a full in-place O(N log2 N) butterfly schedule for
 206             Discrete Fourier Transform.  this version however uses
 207             SVP64 "Vertical-First" Mode and so needs an explicit
 208             branch, testing CR0.
 209
 210             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 211             (3 inputs, 2 outputs)
 212         """
 213         lst = SVP64Asm( [
 214                         "svshape 8, 1, 1, 1, 1",
 215                          "svremap 31, 1, 0, 2, 0, 1, 0",
 216                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 217                         "setvl. 0, 0, 1, 1, 0, 0",
 218                         "bc 6, 3, -16"
 219                         ])
 220         lst = list(lst)
 221
 222         # array and coefficients to test
 223         av = [7.0, -9.8, 3.0, -32.3,
 224               -2.0, 5.0, -9.8, 31.3] # array 0..7
 225         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 226
 227         # store in regfile
 228         fprs = [0] * 32
 229         for i, c in enumerate(coe):
 230             fprs[i+8] = fp64toselectable(c)
 231         for i, a in enumerate(av):
 232             fprs[i+0] = fp64toselectable(a)
 233
 234         # set total. err don't know how to calculate how many there are...
 235         # do it manually for now
 236         VL = 0
 237         size = 2
 238         n = len(av)
 239         while size <= n:
 240             halfsize = size // 2
 241             tablestep = n // size
 242             for i in range(0, n, size):
 243                 for j in range(i, i + halfsize):
 244                     VL += 1
 245             size *= 2
 246
 247         # SVSTATE (calculated VL)
 248         svstate = SVP64State()
 249         svstate.vl = VL # VL
 250         svstate.maxvl = VL # MAXVL
 251         print ("SVSTATE", bin(svstate.asint()))
 252
 253         with Program(lst, bigendian=False) as program:
 254             sim = self.run_tst_program(program, svstate=svstate,
 255                                        initial_fprs=fprs)
 256             print ("spr svshape0", sim.spr['SVSHAPE0'])
 257             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 258             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 259             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 260             print ("spr svshape1", sim.spr['SVSHAPE1'])
 261             print ("spr svshape2", sim.spr['SVSHAPE2'])
 262             print ("spr svshape3", sim.spr['SVSHAPE3'])
 263
 264             # work out the results with the twin mul/add-sub
 265             res = transform_radix2(av, coe)
 266
 267             for i, expected in enumerate(res):
 268                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 269             for i, expected in enumerate(res):
 270                 # convert to Power single
 271                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 272                 expected = float(expected)
 273                 actual = float(sim.fpr(i))
 274                 # approximate error calculation, good enough test
 275                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 276                 # and the rounding is different
 277                 err = abs(actual - expected) / expected
 278                 self.assertTrue(err < 1e-7)
 279
 280     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 281         """>>> lst = SVP64Asm( [
 282                         "svshape 8, 1, 1, 1, 1",
 283                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 284                          "svremap 5, 1, 0, 2, 0, 0, 1",
 285                          "sv.fmuls 24, 0.v, 8.v",
 286                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 287                          "svremap 26, 0, 0, 0, 0, 1, 1",
 288                         "sv.ffadds 0.v, 24, 0.v",
 289                         "setvl. 0, 0, 1, 1, 0, 0",
 290                         "bc 6, 3, -28"
 291                             ])
 292
 293             runs a full in-place O(N log2 N) butterfly schedule for
 294             Discrete Fourier Transform.  also uses "Vertical First"
 295             but also uses temporary scalars and ffadds rather than
 296             sv.ffmadds.
 297
 298             this represents an incremental step towards complex FFT
 299
 300             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 301
 302             * single fmuls FRT, FRA, FRC
 303             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 304               (FRS is implicit / hidden in ff* operations)
 305
 306             multiply:                         # sv.fmuls FRT, FRA, FRC
 307                 temp1 = vec[jh] * exptable[k]
 308                 temp2 = vec[jl]
 309             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 310                 vec[jh] = temp2 - temp1
 311                 vec[jl] = temp2 + temp1
 312
 313             also see notes in complex fft test: here svremap is done in
 314             "non-persistent" mode (as a demo) whereas in the complex fft
 315             svremap is used in "persistent" mode, where by a complete
 316             coincidence the REMAP arguments all happen to line up and
 317             only one persistent svremap is needed.  the exact same trick
 318             *could* be applied here but for illustrative purposes it is not.
 319         """
 320         lst = SVP64Asm( [
 321                         "svshape 8, 1, 1, 1, 1",
 322                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 323                          "svremap 5, 1, 0, 2, 0, 0, 0",
 324                          "sv.fmuls 24, 0.v, 8.v",
 325                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 326                          "svremap 26, 0, 0, 0, 0, 1, 0",
 327                         "sv.ffadds 0.v, 24, 0.v",
 328                         "setvl. 0, 0, 1, 1, 0, 0",
 329                         "bc 6, 3, -28"
 330                         ])
 331         lst = list(lst)
 332
 333         # array and coefficients to test
 334         av = [7.0, -9.8, 3.0, -32.3,
 335               -2.0, 5.0, -9.8, 31.3] # array 0..7
 336         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 337
 338         # store in regfile
 339         fprs = [0] * 32
 340         for i, c in enumerate(coe):
 341             fprs[i+8] = fp64toselectable(c)
 342         for i, a in enumerate(av):
 343             fprs[i+0] = fp64toselectable(a)
 344
 345         # set total. err don't know how to calculate how many there are...
 346         # do it manually for now
 347         VL = 0
 348         size = 2
 349         n = len(av)
 350         while size <= n:
 351             halfsize = size // 2
 352             tablestep = n // size
 353             for i in range(0, n, size):
 354                 for j in range(i, i + halfsize):
 355                     VL += 1
 356             size *= 2
 357
 358         # SVSTATE (calculated VL)
 359         svstate = SVP64State()
 360         svstate.vl = VL # VL
 361         svstate.maxvl = VL # MAXVL
 362         print ("SVSTATE", bin(svstate.asint()))
 363
 364         with Program(lst, bigendian=False) as program:
 365             sim = self.run_tst_program(program, svstate=svstate,
 366                                        initial_fprs=fprs)
 367             print ("spr svshape0", sim.spr['SVSHAPE0'])
 368             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 369             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 370             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 371             print ("spr svshape1", sim.spr['SVSHAPE1'])
 372             print ("spr svshape2", sim.spr['SVSHAPE2'])
 373             print ("spr svshape3", sim.spr['SVSHAPE3'])
 374
 375             # work out the results with the twin mul/add-sub
 376             res = transform_radix2(av, coe)
 377
 378             for i, expected in enumerate(res):
 379                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 380             for i, expected in enumerate(res):
 381                 # convert to Power single
 382                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 383                 expected = float(expected)
 384                 actual = float(sim.fpr(i))
 385                 # approximate error calculation, good enough test
 386                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 387                 # and the rounding is different
 388                 err = abs(actual - expected) / expected
 389                 self.assertTrue(err < 1e-7)
 390
 391     def test_sv_fpmadds_fft(self):
 392         """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 393                         ]
 394             four in-place vector mul-adds, four in-place vector mul-subs
 395
 396             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 397             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 398
 399             there is the *option* to target a different location (non-in-place)
 400             just in case.
 401
 402             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 403             FRS to perform the two multiplies.  one add, one subtract.
 404
 405             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 406                 fmadds  FRT   , FRA, FRC, FRA
 407                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 408
 409         """
 410         lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 411                         ])
 412         lst = list(lst)
 413
 414         fprs = [0] * 32
 415         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 416         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 417         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 418         res = []
 419         # work out the results with the twin mul/add-sub
 420         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 421             fprs[i+2] = fp64toselectable(a)
 422             fprs[i+6] = fp64toselectable(b)
 423             fprs[i+10] = fp64toselectable(c)
 424             mul = a * c
 425             t = b + mul
 426             u = b - mul
 427             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 428             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 429             res.append((t, u))
 430             print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 431
 432         # SVSTATE (in this case, VL=2)
 433         svstate = SVP64State()
 434         svstate.vl = 4 # VL
 435         svstate.maxvl = 4 # MAXVL
 436         print ("SVSTATE", bin(svstate.asint()))
 437
 438         with Program(lst, bigendian=False) as program:
 439             sim = self.run_tst_program(program, svstate=svstate,
 440                                        initial_fprs=fprs)
 441             # confirm that the results are as expected
 442             for i, (t, u) in enumerate(res):
 443                 self.assertEqual(sim.fpr(i+2), t)
 444                 self.assertEqual(sim.fpr(i+6), u)
 445
 446     def test_sv_ffadds_fft(self):
 447         """>>> lst = ["sv.ffadds 2.v, 2.v, 2.v"
 448                         ]
 449             four in-place vector adds, four in-place vector subs
 450
 451             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 452             FRS to perform the two multiplies.  one add, one subtract.
 453
 454             sv.ffadds FRT, FRA, FRB  actually does:
 455                 fadds FRT   , FRB, FRA
 456                 fsubs FRT+vl, FRA, FRB+vl
 457         """
 458         lst = SVP64Asm(["sv.ffadds 2.v, 2.v, 2.v"
 459                         ])
 460         lst = list(lst)
 461
 462         fprs = [0] * 32
 463         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 464         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 465         res = []
 466         # work out the results with the twin add-sub
 467         for i, (a, b) in enumerate(zip(av, bv)):
 468             fprs[i+2] = fp64toselectable(a)
 469             fprs[i+6] = fp64toselectable(b)
 470             t = b + a
 471             u = b - a
 472             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 473             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 474             res.append((t, u))
 475             print ("FFT", i, "in", a, b, "res", t, u)
 476
 477         # SVSTATE (in this case, VL=2)
 478         svstate = SVP64State()
 479         svstate.vl = 4 # VL
 480         svstate.maxvl = 4 # MAXVL
 481         print ("SVSTATE", bin(svstate.asint()))
 482
 483         with Program(lst, bigendian=False) as program:
 484             sim = self.run_tst_program(program, svstate=svstate,
 485                                        initial_fprs=fprs)
 486             # confirm that the results are as expected
 487             for i, (t, u) in enumerate(res):
 488                 a = float(sim.fpr(i+2))
 489                 b = float(sim.fpr(i+6))
 490                 t = float(t)
 491                 u = float(u)
 492                 print ("FFT", i, "in", a, b, "res", t, u)
 493             for i, (t, u) in enumerate(res):
 494                 self.assertEqual(sim.fpr(i+2), t)
 495                 self.assertEqual(sim.fpr(i+6), u)
 496
 497     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 498         """
 499             runs a full in-place O(N log2 N) butterfly schedule for
 500             Discrete Fourier Transform.  this version however uses
 501             SVP64 "Vertical-First" Mode and so needs an explicit
 502             branch, testing CR0.
 503
 504             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 505             (3 inputs, 2 outputs)
 506
 507             complex calculation (FFT):
 508
 509                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 510                 vec_r[jh] = vec_r[jl] - tpre
 511                 vec_r[jl] += tpre
 512
 513                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 514                 vec_i[jh] = vec_i[jl] - tpim
 515                 vec_i[jl] += tpim
 516
 517             real-only calculation (DFT):
 518
 519                 temp1 = vec[jh] * exptable[k]
 520                 temp2 = vec[jl]
 521                 vec[jh] = temp2 - temp1
 522                 vec[jl] = temp2 + temp1
 523
 524             note: a rather nice convenience / coincidence. the meaning of
 525             these two instructions is:
 526                 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 527                 "svremap 5, 1, 0, 2, 0, 0, 1",
 528                 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 529                 "svremap 26, 0, 0, 0, 0, 1, 1",
 530
 531             however it turns out that they can be *merged*, and for
 532             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
 533             *ignore* their REMAPs (by definition), and for the second
 534             one (sv.ffads) exactly the right REMAPs are also ignored!
 535
 536                 "svremap 31, 1, 0, 2, 0, 1, 1",
 537         """
 538         lst = SVP64Asm( [
 539                         # set triple butterfly mode with persistent "REMAP"
 540                         "svshape 8, 1, 1, 1, 1",
 541                         "svremap 31, 1, 0, 2, 0, 1, 1",
 542                         # tpre
 543                         "sv.fmuls 24, 0.v, 16.v",    # mul1_r = r*cos_r
 544                         "sv.fmadds 24, 8.v, 20.v, 24", # mul2_r = i*sin_i
 545                                                      # tpre = mul1_r + mul2_r
 546                         # tpim
 547                         "sv.fmuls 26, 0.v, 20.v",    # mul1_i = r*sin_i
 548                         "sv.fmsubs 26, 8.v, 16.v, 26", # mul2_i = i*cos_r
 549                                                      # tpim = mul2_i - mul1_i
 550                         # vec_r jh/jl
 551                         "sv.ffadds 0.v, 24, 0.v",    # vh/vl +/- tpre
 552                         # vec_i jh/jl
 553                         "sv.ffadds 8.v, 26, 8.v",    # vh/vl +- tpim
 554
 555                         # svstep loop
 556                         "setvl. 0, 0, 1, 1, 0, 0",
 557                         "bc 6, 3, -56"
 558                         ])
 559         lst = list(lst)
 560
 561         # array and coefficients to test
 562         ar = [7.0, -9.8, 3.0, -32.3,
 563               -2.0, 5.0, -9.8, 31.3] # array 0..7 real
 564         ai = [1.0, -1.8, 3.0, 19.3,
 565               4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
 566         coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
 567         coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
 568
 569         # store in regfile
 570         fprs = [0] * 64
 571         for i, a in enumerate(ar):
 572             fprs[i+0] = fp64toselectable(a)
 573         for i, a in enumerate(ai):
 574             fprs[i+8] = fp64toselectable(a)
 575         for i, cr in enumerate(coer):
 576             fprs[i+16] = fp64toselectable(cr)
 577         for i, ci in enumerate(coei):
 578             fprs[i+20] = fp64toselectable(ci)
 579
 580         # set total. err don't know how to calculate how many there are...
 581         # do it manually for now
 582         VL = 0
 583         size = 2
 584         n = len(ar)
 585         while size <= n:
 586             halfsize = size // 2
 587             tablestep = n // size
 588             for i in range(0, n, size):
 589                 for j in range(i, i + halfsize):
 590                     VL += 1
 591             size *= 2
 592
 593         # SVSTATE (calculated VL)
 594         svstate = SVP64State()
 595         svstate.vl = VL # VL
 596         svstate.maxvl = VL # MAXVL
 597         print ("SVSTATE", bin(svstate.asint()))
 598
 599         with Program(lst, bigendian=False) as program:
 600             sim = self.run_tst_program(program, svstate=svstate,
 601                                        initial_fprs=fprs)
 602             print ("spr svshape0", sim.spr['SVSHAPE0'])
 603             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 604             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 605             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 606             print ("spr svshape1", sim.spr['SVSHAPE1'])
 607             print ("spr svshape2", sim.spr['SVSHAPE2'])
 608             print ("spr svshape3", sim.spr['SVSHAPE3'])
 609
 610             # work out the results with the twin mul/add-sub, explicit
 611             # complex numbers
 612             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 613
 614             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 615                 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 616                        "expected_r", expected_r,
 617                        "expected_i", expected_i)
 618             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 619                 # convert to Power single
 620                 expected_r = DOUBLE2SINGLE(fp64toselectable(expected_r ))
 621                 expected_r = float(expected_r)
 622                 actual_r = float(sim.fpr(i))
 623                 # approximate error calculation, good enough test
 624                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 625                 # and the rounding is different
 626                 err = abs(actual_r - expected_r ) / expected_r
 627                 self.assertTrue(err < 1e-6)
 628                 # convert to Power single
 629                 expected_i = DOUBLE2SINGLE(fp64toselectable(expected_i ))
 630                 expected_i = float(expected_i)
 631                 actual_i = float(sim.fpr(i+8))
 632                 # approximate error calculation, good enough test
 633                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 634                 # and the rounding is different
 635                 err = abs(actual_i - expected_i ) / expected_i
 636                 self.assertTrue(err < 1e-6)
 637
 638     def test_sv_ffadds_fft_scalar(self):
 639         """>>> lst = ["sv.ffadds 2.v, 12, 13"
 640                         ]
 641             four in-place vector adds and subs, but done with a scalar
 642             pair (fp12, fp13)
 643         """
 644         lst = SVP64Asm(["sv.ffadds 2.v, 12, 13"
 645                         ])
 646         lst = list(lst)
 647
 648         fprs = [0] * 32
 649         scalar_a = 1.3
 650         scalar_b = -2.0
 651         fprs[12] = fp64toselectable(scalar_a)
 652         fprs[13] = fp64toselectable(scalar_b)
 653         res = []
 654         # work out the results with the twin add-sub
 655         for i in range(4):
 656             t = scalar_b + scalar_a
 657             u = scalar_b - scalar_a
 658             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 659             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 660             res.append((t, u))
 661             print ("FFT", i, "res", t, u)
 662
 663         # SVSTATE (in this case, VL=2)
 664         svstate = SVP64State()
 665         svstate.vl = 4 # VL
 666         svstate.maxvl = 4 # MAXVL
 667         print ("SVSTATE", bin(svstate.asint()))
 668
 669         with Program(lst, bigendian=False) as program:
 670             sim = self.run_tst_program(program, svstate=svstate,
 671                                        initial_fprs=fprs)
 672             # confirm that the results are as expected
 673             for i, (t, u) in enumerate(res):
 674                 a = float(sim.fpr(i+2))
 675                 b = float(sim.fpr(i+6))
 676                 t = float(t)
 677                 u = float(u)
 678                 print ("FFT", i, "in", a, b, "res", t, u)
 679             for i, (t, u) in enumerate(res):
 680                 self.assertEqual(sim.fpr(i+2), t)
 681                 self.assertEqual(sim.fpr(i+6), u)
 682
 683     def test_sv_remap_fpmadds_fft_ldst(self):
 684         """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
 685                          "sv.lfsbr 0.v, 4(0), 20", # bit-reversed
 686                          "svshape 8, 1, 1, 1, 0",
 687                          "svremap 31, 1, 0, 2, 0, 1, 0",
 688                          "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 689
 690             runs a full in-place O(N log2 N) butterfly schedule for
 691             Discrete Fourier Transform, using bit-reversed LD/ST
 692         """
 693         lst = SVP64Asm( ["setvl 0, 0, 8, 0, 1, 1",
 694                          "sv.lfsbr 0.v, 4(0), 20", # bit-reversed
 695                          "svshape 8, 1, 1, 1, 0",
 696                          "svremap 31, 1, 0, 2, 0, 1, 0",
 697                          "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 698                         ])
 699         lst = list(lst)
 700
 701         # array and coefficients to test
 702         av = [7.0, -9.8, 3.0, -32.3,
 703               -2.0, 5.0, -9.8, 31.3] # array 0..7
 704         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 705
 706         # store in regfile
 707         fprs = [0] * 32
 708         for i, c in enumerate(coe):
 709             fprs[i+8] = fp64toselectable(c)
 710         # store in memory
 711         mem = {}
 712         val = 0
 713         for i, a in enumerate(av):
 714             a = SINGLE(fp64toselectable(a)).value
 715             shift = (i % 2) == 1
 716             if shift == 0:
 717                 val = a
 718             else:
 719                 mem[(i//2)*8] = val | (a << 32)
 720
 721         with Program(lst, bigendian=False) as program:
 722             sim = self.run_tst_program(program, initial_mem=mem,
 723                                                 initial_fprs=fprs)
 724             print ("spr svshape0", sim.spr['SVSHAPE0'])
 725             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 726             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 727             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 728             print ("spr svshape1", sim.spr['SVSHAPE1'])
 729             print ("spr svshape2", sim.spr['SVSHAPE2'])
 730             print ("spr svshape3", sim.spr['SVSHAPE3'])
 731
 732             print ("mem dump")
 733             print (sim.mem.dump())
 734
 735             # work out the results with the twin mul/add-sub,
 736             # note bit-reverse mode requested
 737             res = transform_radix2(av, coe, reverse=True)
 738
 739             for i, expected in enumerate(res):
 740                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 741             for i, expected in enumerate(res):
 742                 # convert to Power single
 743                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 744                 expected = float(expected)
 745                 actual = float(sim.fpr(i))
 746                 # approximate error calculation, good enough test
 747                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 748                 # and the rounding is different
 749                 err = abs(actual - expected) / expected
 750                 self.assertTrue(err < 1e-6)
 751
 752     def run_tst_program(self, prog, initial_regs=None,
 753                               svstate=None,
 754                               initial_mem=None,
 755                               initial_fprs=None):
 756         if initial_regs is None:
 757             initial_regs = [0] * 32
 758         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 759                                                 initial_fprs=initial_fprs,
 760                                                 svstate=svstate)
 761
 762         print ("GPRs")
 763         simulator.gpr.dump()
 764         print ("FPRs")
 765         simulator.fpr.dump()
 766
 767         return simulator
 768
 769
 770 if __name__ == "__main__":
 771     unittest.main()