src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 from nmigen import Module, Signal
   2 from nmigen.sim import Simulator, Delay, Settle
   3 from nmutil.formaltest import FHDLTestCase
   4 import unittest
   5 from openpower.decoder.power_decoder import (create_pdecode)
   6 from openpower.simulator.program import Program
   7 from openpower.decoder.isa.caller import SVP64State
   8 from openpower.decoder.selectable_int import SelectableInt
   9 from openpower.decoder.isa.test_caller import run_tst
  10 from openpower.sv.trans.svp64 import SVP64Asm
  11 from copy import deepcopy
  12 from openpower.decoder.helpers import fp64toselectable, SINGLE
  13 from openpower.decoder.isafunctions.double2single import ISACallerFnHelper
  14
  15 # really bad hack.  need to access the DOUBLE2SINGLE function auto-generated
  16 # from pseudo-code.
  17 fph = ISACallerFnHelper(XLEN=64)
  18
  19
  20 def transform_radix2(vec, exptable, reverse=False):
  21     """
  22     # FFT and convolution test (Python), based on Project Nayuki
  23     #
  24     # Copyright (c) 2020 Project Nayuki. (MIT License)
  25     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  26
  27     """
  28     # bits of the integer 'val'.
  29     def reverse_bits(val, width):
  30         result = 0
  31         for _ in range(width):
  32             result = (result << 1) | (val & 1)
  33             val >>= 1
  34         return result
  35
  36     # Initialization
  37     n = len(vec)
  38     levels = n.bit_length() - 1
  39
  40     # Copy with bit-reversed permutation
  41     if reverse:
  42         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  43
  44     size = 2
  45     while size <= n:
  46         halfsize = size // 2
  47         tablestep = n // size
  48         for i in range(0, n, size):
  49             k = 0
  50             for j in range(i, i + halfsize):
  51                 # exact same actual computation, just embedded in
  52                 # triple-nested for-loops
  53                 jl, jh = j, j+halfsize
  54                 vjh = vec[jh]
  55                 temp1 = vec[jh] * exptable[k]
  56                 temp2 = vec[jl]
  57                 vec[jh] = temp2 - temp1
  58                 vec[jl] = temp2 + temp1
  59                 print ("xform jl jh k", jl, jh, k,
  60                        "vj vjh ek", temp2, vjh, exptable[k],
  61                        "t1, t2", temp1, temp2,
  62                        "v[jh] v[jl]", vec[jh], vec[jl])
  63                 k += tablestep
  64         size *= 2
  65
  66     return vec
  67
  68
  69 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
  70     """
  71     # FFT and convolution test (Python), based on Project Nayuki
  72     #
  73     # Copyright (c) 2020 Project Nayuki. (MIT License)
  74     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  75
  76     """
  77     # bits of the integer 'val'.
  78     def reverse_bits(val, width):
  79         result = 0
  80         for _ in range(width):
  81             result = (result << 1) | (val & 1)
  82             val >>= 1
  83         return result
  84
  85     # Initialization
  86     n = len(vec_r)
  87     levels = n.bit_length() - 1
  88
  89     # Copy with bit-reversed permutation
  90     if reverse:
  91         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  92
  93     size = 2
  94     while size <= n:
  95         halfsize = size // 2
  96         tablestep = n // size
  97         for i in range(0, n, size):
  98             k = 0
  99             for j in range(i, i + halfsize):
 100                 # exact same actual computation, just embedded in
 101                 # triple-nested for-loops
 102                 jl, jh = j, j+halfsize
 103
 104                 print ("xform jl jh k", jl, jh, k,
 105                         "vr h l", vec_r[jh], vec_r[jl],
 106                         "vi h l", vec_i[jh], vec_i[jl])
 107                 print ("    cr k", cos_r[k], "si k", sin_i[k])
 108                 mul1_r =  vec_r[jh] * cos_r[k]
 109                 mul2_r = vec_i[jh] * sin_i[k]
 110                 tpre =  mul1_r + mul2_r
 111                 print ("        vec_r[jh] * cos_r[k]", mul1_r)
 112                 print ("        vec_i[jh] * sin_i[k]", mul2_r)
 113                 print ("    tpre", tpre)
 114                 mul1_i = vec_r[jh] * sin_i[k]
 115                 mul2_i = vec_i[jh] * cos_r[k]
 116                 tpim = -mul1_i + mul2_i
 117                 print ("        vec_r[jh] * sin_i[k]", mul1_i)
 118                 print ("        vec_i[jh] * cos_r[k]", mul2_i)
 119                 print ("    tpim", tpim)
 120                 vec_r[jh] = vec_r[jl] - tpre
 121                 vec_i[jh] = vec_i[jl] - tpim
 122                 vec_r[jl] += tpre
 123                 vec_i[jl] += tpim
 124
 125                 print ("    xform jl jh k", jl, jh, k,
 126                         "\n       vr h l", vec_r[jh], vec_r[jl],
 127                         "\n       vi h l", vec_i[jh], vec_i[jl])
 128                 k += tablestep
 129         size *= 2
 130
 131     return vec_r, vec_i
 132
 133
 134 class FFTTestCase(FHDLTestCase):
 135
 136     def _check_regs(self, sim, expected):
 137         for i in range(32):
 138             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 139
 140     def test_sv_remap_fpmadds_fft_4(self):
 141         """>>> lst = ["svshape 2, 1, 1, 1, 0",
 142                      "svremap 31, 1, 0, 2, 0, 1, 0",
 143                       "sv.ffmadds. *2, *2, *2, *10"
 144                      ]
 145         this is a cheap (cheating) way to run a single "ffmadds." to
 146         get at least Rc=1 on sv.ffmadds to be activated. the results
 147         are not actually tested because there's no checking yet on
 148         FP Rc=1
 149         """
 150         lst = SVP64Asm( ["svshape 2, 1, 1, 1, 0",
 151                          "svremap 31, 1, 0, 2, 0, 1, 0",
 152                         "sv.ffmadds *0, *0, *0, *8"
 153                         ])
 154         lst = list(lst)
 155
 156         # array and coefficients to test
 157         av = [7.0, -9.8 ] # array 0..1
 158         coe = [3.1] # coefficients
 159
 160         # store in regfile
 161         fprs = [0] * 32
 162         for i, c in enumerate(coe):
 163             fprs[i+8] = fp64toselectable(c)
 164         for i, a in enumerate(av):
 165             fprs[i+0] = fp64toselectable(a)
 166
 167         with Program(lst, bigendian=False) as program:
 168             sim = self.run_tst_program(program, initial_fprs=fprs)
 169             print ("spr svshape0", sim.spr['SVSHAPE0'])
 170             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 171             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 172             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 173             print ("spr svshape1", sim.spr['SVSHAPE1'])
 174             print ("spr svshape2", sim.spr['SVSHAPE2'])
 175             print ("spr svshape3", sim.spr['SVSHAPE3'])
 176
 177             # work out the results with the twin mul/add-sub
 178             res = transform_radix2(av, coe)
 179
 180             for i, expected in enumerate(res):
 181                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 182             for i, expected in enumerate(res):
 183                 # convert to Power single
 184                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 185                 expected = float(expected)
 186                 actual = float(sim.fpr(i))
 187                 # approximate error calculation, good enough test
 188                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 189                 # and the rounding is different
 190                 err = abs(actual - expected) / expected
 191                 self.assertTrue(err < 1e-7)
 192
 193     def test_sv_remap_fpmadds_fft(self):
 194         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 195                      "svremap 31, 1, 0, 2, 0, 1, 0",
 196                       "sv.ffmadds *2, *2, *2, *10"
 197                      ]
 198             runs a full in-place O(N log2 N) butterfly schedule for
 199             Discrete Fourier Transform.
 200
 201             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 202             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 203
 204             there is the *option* to target a different location (non-in-place)
 205             just in case.
 206
 207             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 208             (3 inputs, 2 outputs)
 209         """
 210         lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
 211                          "svremap 31, 1, 0, 2, 0, 1, 0",
 212                         "sv.ffmadds *0, *0, *0, *8"
 213                         ])
 214         lst = list(lst)
 215
 216         # array and coefficients to test
 217         av = [7.0, -9.8, 3.0, -32.3,
 218               -2.0, 5.0, -9.8, 31.3] # array 0..7
 219         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 220
 221         # store in regfile
 222         fprs = [0] * 32
 223         for i, c in enumerate(coe):
 224             fprs[i+8] = fp64toselectable(c)
 225         for i, a in enumerate(av):
 226             fprs[i+0] = fp64toselectable(a)
 227
 228         with Program(lst, bigendian=False) as program:
 229             sim = self.run_tst_program(program, initial_fprs=fprs)
 230             print ("spr svshape0", sim.spr['SVSHAPE0'])
 231             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 232             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 233             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 234             print ("spr svshape1", sim.spr['SVSHAPE1'])
 235             print ("spr svshape2", sim.spr['SVSHAPE2'])
 236             print ("spr svshape3", sim.spr['SVSHAPE3'])
 237
 238             # work out the results with the twin mul/add-sub
 239             res = transform_radix2(av, coe)
 240
 241             for i, expected in enumerate(res):
 242                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 243             for i, expected in enumerate(res):
 244                 # convert to Power single
 245                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 246                 expected = float(expected)
 247                 actual = float(sim.fpr(i))
 248                 # approximate error calculation, good enough test
 249                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 250                 # and the rounding is different
 251                 err = abs(actual - expected) / expected
 252                 self.assertTrue(err < 1e-7)
 253
 254     def test_sv_remap_fpmadds_fft_svstep(self):
 255         """>>> lst = SVP64Asm( [
 256                             "svshape 8, 1, 1, 1, 1",
 257                              "svremap 31, 1, 0, 2, 0, 1, 0",
 258                             "sv.ffmadds *0, *0, *0, *8",
 259                             "setvl. 0, 0, 1, 1, 0, 0",
 260                             "bc 6, 3, -16"
 261                             ])
 262             runs a full in-place O(N log2 N) butterfly schedule for
 263             Discrete Fourier Transform.  this version however uses
 264             SVP64 "Vertical-First" Mode and so needs an explicit
 265             branch, testing CR0.
 266
 267             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 268             (3 inputs, 2 outputs)
 269         """
 270         lst = SVP64Asm( [
 271                         "svshape 8, 1, 1, 1, 1",
 272                          "svremap 31, 1, 0, 2, 0, 1, 0",
 273                         "sv.ffmadds *0, *0, *0, *8",
 274                         "setvl. 0, 0, 1, 1, 0, 0",
 275                         "bc 6, 3, -16"
 276                         ])
 277         lst = list(lst)
 278
 279         # array and coefficients to test
 280         av = [7.0, -9.8, 3.0, -32.3,
 281               -2.0, 5.0, -9.8, 31.3] # array 0..7
 282         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 283
 284         # store in regfile
 285         fprs = [0] * 32
 286         for i, c in enumerate(coe):
 287             fprs[i+8] = fp64toselectable(c)
 288         for i, a in enumerate(av):
 289             fprs[i+0] = fp64toselectable(a)
 290
 291         # set total. err don't know how to calculate how many there are...
 292         # do it manually for now
 293         VL = 0
 294         size = 2
 295         n = len(av)
 296         while size <= n:
 297             halfsize = size // 2
 298             tablestep = n // size
 299             for i in range(0, n, size):
 300                 for j in range(i, i + halfsize):
 301                     VL += 1
 302             size *= 2
 303
 304         # SVSTATE (calculated VL)
 305         svstate = SVP64State()
 306         svstate.vl = VL # VL
 307         svstate.maxvl = VL # MAXVL
 308         print ("SVSTATE", bin(svstate.asint()))
 309
 310         with Program(lst, bigendian=False) as program:
 311             sim = self.run_tst_program(program, svstate=svstate,
 312                                        initial_fprs=fprs)
 313             print ("spr svshape0", sim.spr['SVSHAPE0'])
 314             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 315             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 316             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 317             print ("spr svshape1", sim.spr['SVSHAPE1'])
 318             print ("spr svshape2", sim.spr['SVSHAPE2'])
 319             print ("spr svshape3", sim.spr['SVSHAPE3'])
 320
 321             # work out the results with the twin mul/add-sub
 322             res = transform_radix2(av, coe)
 323
 324             for i, expected in enumerate(res):
 325                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 326             for i, expected in enumerate(res):
 327                 # convert to Power single
 328                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 329                 expected = float(expected)
 330                 actual = float(sim.fpr(i))
 331                 # approximate error calculation, good enough test
 332                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 333                 # and the rounding is different
 334                 err = abs(actual - expected) / expected
 335                 self.assertTrue(err < 1e-7)
 336
 337     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 338         """>>> lst = SVP64Asm( [
 339                         "svshape 8, 1, 1, 1, 1",
 340                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 341                          "svremap 5, 1, 0, 2, 0, 0, 1",
 342                          "sv.fmuls 24, *0, *8",
 343                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 344                          "svremap 26, 0, 0, 0, 0, 1, 1",
 345                         "sv.ffadds *0, 24, *0",
 346                         "setvl. 0, 0, 1, 1, 0, 0",
 347                         "bc 6, 3, -28"
 348                             ])
 349
 350             runs a full in-place O(N log2 N) butterfly schedule for
 351             Discrete Fourier Transform.  also uses "Vertical First"
 352             but also uses temporary scalars and ffadds rather than
 353             sv.ffmadds.
 354
 355             this represents an incremental step towards complex FFT
 356
 357             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 358
 359             * single fmuls FRT, FRA, FRC
 360             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 361               (FRS is implicit / hidden in ff* operations)
 362
 363             multiply:                         # sv.fmuls FRT, FRA, FRC
 364                 temp1 = vec[jh] * exptable[k]
 365                 temp2 = vec[jl]
 366             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 367                 vec[jh] = temp2 - temp1
 368                 vec[jl] = temp2 + temp1
 369
 370             also see notes in complex fft test: here svremap is done in
 371             "non-persistent" mode (as a demo) whereas in the complex fft
 372             svremap is used in "persistent" mode, where by a complete
 373             coincidence the REMAP arguments all happen to line up and
 374             only one persistent svremap is needed.  the exact same trick
 375             *could* be applied here but for illustrative purposes it is not.
 376         """
 377         lst = SVP64Asm( [
 378                         "svshape 8, 1, 1, 1, 1",
 379                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 380                          "svremap 5, 1, 0, 2, 0, 0, 0",
 381                          "sv.fmuls 24, *0, *8",
 382                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 383                          "svremap 26, 0, 0, 0, 0, 1, 0",
 384                         "sv.ffadds *0, 24, *0",
 385                         "setvl. 0, 0, 1, 1, 0, 0",
 386                         "bc 6, 3, -28"
 387                         ])
 388         lst = list(lst)
 389
 390         # array and coefficients to test
 391         av = [7.0, -9.8, 3.0, -32.3,
 392               -2.0, 5.0, -9.8, 31.3] # array 0..7
 393         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 394
 395         # store in regfile
 396         fprs = [0] * 32
 397         for i, c in enumerate(coe):
 398             fprs[i+8] = fp64toselectable(c)
 399         for i, a in enumerate(av):
 400             fprs[i+0] = fp64toselectable(a)
 401
 402         # set total. err don't know how to calculate how many there are...
 403         # do it manually for now
 404         VL = 0
 405         size = 2
 406         n = len(av)
 407         while size <= n:
 408             halfsize = size // 2
 409             tablestep = n // size
 410             for i in range(0, n, size):
 411                 for j in range(i, i + halfsize):
 412                     VL += 1
 413             size *= 2
 414
 415         # SVSTATE (calculated VL)
 416         svstate = SVP64State()
 417         svstate.vl = VL # VL
 418         svstate.maxvl = VL # MAXVL
 419         print ("SVSTATE", bin(svstate.asint()))
 420
 421         with Program(lst, bigendian=False) as program:
 422             sim = self.run_tst_program(program, svstate=svstate,
 423                                        initial_fprs=fprs)
 424             print ("spr svshape0", sim.spr['SVSHAPE0'])
 425             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 426             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 427             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 428             print ("spr svshape1", sim.spr['SVSHAPE1'])
 429             print ("spr svshape2", sim.spr['SVSHAPE2'])
 430             print ("spr svshape3", sim.spr['SVSHAPE3'])
 431
 432             # work out the results with the twin mul/add-sub
 433             res = transform_radix2(av, coe)
 434
 435             for i, expected in enumerate(res):
 436                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 437             for i, expected in enumerate(res):
 438                 # convert to Power single
 439                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 440                 expected = float(expected)
 441                 actual = float(sim.fpr(i))
 442                 # approximate error calculation, good enough test
 443                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 444                 # and the rounding is different
 445                 err = abs(actual - expected) / expected
 446                 self.assertTrue(err < 1e-7)
 447
 448     def test_sv_fpmadds_fft(self):
 449         """>>> lst = ["sv.ffmadds *2, *2, *2, *10"
 450                         ]
 451             four in-place vector mul-adds, four in-place vector mul-subs
 452
 453             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 454             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 455
 456             there is the *option* to target a different location (non-in-place)
 457             just in case.
 458
 459             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 460             FRS to perform the two multiplies.  one add, one subtract.
 461
 462             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 463                 fmadds  FRT   , FRA, FRC, FRA
 464                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 465
 466         """
 467         lst = SVP64Asm(["sv.ffmadds *2, *2, *2, *10"
 468                         ])
 469         lst = list(lst)
 470
 471         fprs = [0] * 32
 472         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 473         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 474         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 475         res = []
 476         # work out the results with the twin mul/add-sub
 477         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 478             fprs[i+2] = fp64toselectable(a)
 479             fprs[i+6] = fp64toselectable(b)
 480             fprs[i+10] = fp64toselectable(c)
 481             mul = a * c
 482             t = b + mul
 483             u = b - mul
 484             t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 485             u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
 486             res.append((t, u))
 487             print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 488
 489         # SVSTATE (in this case, VL=2)
 490         svstate = SVP64State()
 491         svstate.vl = 4 # VL
 492         svstate.maxvl = 4 # MAXVL
 493         print ("SVSTATE", bin(svstate.asint()))
 494
 495         with Program(lst, bigendian=False) as program:
 496             sim = self.run_tst_program(program, svstate=svstate,
 497                                        initial_fprs=fprs)
 498             # confirm that the results are as expected
 499             for i, (t, u) in enumerate(res):
 500                 self.assertEqual(sim.fpr(i+2), t)
 501                 self.assertEqual(sim.fpr(i+6), u)
 502
 503     def test_sv_ffadds_fft(self):
 504         """>>> lst = ["sv.ffadds *2, *2, *2"
 505                         ]
 506             four in-place vector adds, four in-place vector subs
 507
 508             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 509             FRS to perform the two multiplies.  one add, one subtract.
 510
 511             sv.ffadds FRT, FRA, FRB  actually does:
 512                 fadds FRT   , FRB, FRA
 513                 fsubs FRT+vl, FRA, FRB+vl
 514         """
 515         lst = SVP64Asm(["sv.ffadds *2, *2, *2"
 516                         ])
 517         lst = list(lst)
 518
 519         fprs = [0] * 32
 520         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 521         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 522         res = []
 523         # work out the results with the twin add-sub
 524         for i, (a, b) in enumerate(zip(av, bv)):
 525             fprs[i+2] = fp64toselectable(a)
 526             fprs[i+6] = fp64toselectable(b)
 527             t = b + a
 528             u = b - a
 529             t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 530             u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
 531             res.append((t, u))
 532             print ("FFT", i, "in", a, b, "res", t, u)
 533
 534         # SVSTATE (in this case, VL=2)
 535         svstate = SVP64State()
 536         svstate.vl = 4 # VL
 537         svstate.maxvl = 4 # MAXVL
 538         print ("SVSTATE", bin(svstate.asint()))
 539
 540         with Program(lst, bigendian=False) as program:
 541             sim = self.run_tst_program(program, svstate=svstate,
 542                                        initial_fprs=fprs)
 543             # confirm that the results are as expected
 544             for i, (t, u) in enumerate(res):
 545                 a = float(sim.fpr(i+2))
 546                 b = float(sim.fpr(i+6))
 547                 t = float(t)
 548                 u = float(u)
 549                 print ("FFT", i, "in", a, b, "res", t, u)
 550             for i, (t, u) in enumerate(res):
 551                 self.assertEqual(sim.fpr(i+2), t)
 552                 self.assertEqual(sim.fpr(i+6), u)
 553
 554     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 555         """
 556             runs a full in-place O(N log2 N) butterfly schedule for
 557             Discrete Fourier Transform.  this version however uses
 558             SVP64 "Vertical-First" Mode and so needs an explicit
 559             branch, testing CR0.
 560
 561             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 562             (3 inputs, 2 outputs)
 563
 564             complex calculation (FFT):
 565
 566                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 567                 vec_r[jh] = vec_r[jl] - tpre
 568                 vec_r[jl] += tpre
 569
 570                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 571                 vec_i[jh] = vec_i[jl] - tpim
 572                 vec_i[jl] += tpim
 573
 574             real-only calculation (DFT):
 575
 576                 temp1 = vec[jh] * exptable[k]
 577                 temp2 = vec[jl]
 578                 vec[jh] = temp2 - temp1
 579                 vec[jl] = temp2 + temp1
 580
 581             note: a rather nice convenience / coincidence. the meaning of
 582             these two instructions is:
 583                 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 584                 "svremap 5, 1, 0, 2, 0, 0, 1",
 585                 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 586                 "svremap 26, 0, 0, 0, 0, 1, 1",
 587
 588             however it turns out that they can be *merged*, and for
 589             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
 590             *ignore* their REMAPs (by definition, because you can't REMAP
 591             scalar operands), and for the second one (sv.ffads) exactly the
 592             right REMAPs are also ignored!
 593
 594             therefore we can merge:
 595                 "svremap 5, 1, 0, 2, 0, 0, 1",
 596                 "svremap 26, 0, 0, 0, 0, 1, 1",
 597             into:
 598                 "svremap 31, 1, 0, 2, 0, 1, 1",
 599             and save one instruction.
 600         """
 601         lst = SVP64Asm( [
 602                         # set triple butterfly mode with persistent "REMAP"
 603                         "svshape 8, 1, 1, 1, 1",
 604                         "svremap 31, 1, 0, 2, 0, 1, 1",
 605                         # tpre
 606                         "sv.fmuls 24, *0, *16",    # mul1_r = r*cos_r
 607                         "sv.fmadds 24, *8, *20, 24", # mul2_r = i*sin_i
 608                                                      # tpre = mul1_r + mul2_r
 609                         # tpim
 610                         "sv.fmuls 26, *0, *20",    # mul1_i = r*sin_i
 611                         "sv.fmsubs 26, *8, *16, 26", # mul2_i = i*cos_r
 612                                                      # tpim = mul2_i - mul1_i
 613                         # vec_r jh/jl
 614                         "sv.ffadds *0, 24, *0",    # vh/vl +/- tpre
 615                         # vec_i jh/jl
 616                         "sv.ffadds *8, 26, *8",    # vh/vl +- tpim
 617
 618                         # svstep loop
 619                         "setvl. 0, 0, 1, 1, 0, 0",
 620                         "bc 6, 3, -56"
 621                         ])
 622         lst = list(lst)
 623
 624         # array and coefficients to test
 625         ar = [7.0, -9.8, 3.0, -32.3,
 626               -2.0, 5.0, -9.8, 31.3] # array 0..7 real
 627         ai = [1.0, -1.8, 3.0, 19.3,
 628               4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
 629         coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
 630         coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
 631
 632         # store in regfile
 633         fprs = [0] * 64
 634         for i, a in enumerate(ar):
 635             fprs[i+0] = fp64toselectable(a)
 636         for i, a in enumerate(ai):
 637             fprs[i+8] = fp64toselectable(a)
 638         for i, cr in enumerate(coer):
 639             fprs[i+16] = fp64toselectable(cr)
 640         for i, ci in enumerate(coei):
 641             fprs[i+20] = fp64toselectable(ci)
 642
 643         # set total. err don't know how to calculate how many there are...
 644         # do it manually for now
 645         VL = 0
 646         size = 2
 647         n = len(ar)
 648         while size <= n:
 649             halfsize = size // 2
 650             tablestep = n // size
 651             for i in range(0, n, size):
 652                 for j in range(i, i + halfsize):
 653                     VL += 1
 654             size *= 2
 655
 656         # SVSTATE (calculated VL)
 657         svstate = SVP64State()
 658         svstate.vl = VL # VL
 659         svstate.maxvl = VL # MAXVL
 660         print ("SVSTATE", bin(svstate.asint()))
 661
 662         with Program(lst, bigendian=False) as program:
 663             sim = self.run_tst_program(program, svstate=svstate,
 664                                        initial_fprs=fprs)
 665             print ("spr svshape0", sim.spr['SVSHAPE0'])
 666             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 667             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 668             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 669             print ("spr svshape1", sim.spr['SVSHAPE1'])
 670             print ("spr svshape2", sim.spr['SVSHAPE2'])
 671             print ("spr svshape3", sim.spr['SVSHAPE3'])
 672
 673             # work out the results with the twin mul/add-sub, explicit
 674             # complex numbers
 675             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 676
 677             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 678                 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 679                        "expected_r", expected_r,
 680                        "expected_i", expected_i)
 681             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 682                 # convert to Power single
 683                 expected_r = fph.DOUBLE2SINGLE(fp64toselectable(expected_r ))
 684                 expected_r = float(expected_r)
 685                 actual_r = float(sim.fpr(i))
 686                 # approximate error calculation, good enough test
 687                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 688                 # and the rounding is different
 689                 err = abs(actual_r - expected_r ) / expected_r
 690                 self.assertTrue(err < 1e-6)
 691                 # convert to Power single
 692                 expected_i = fph.DOUBLE2SINGLE(fp64toselectable(expected_i ))
 693                 expected_i = float(expected_i)
 694                 actual_i = float(sim.fpr(i+8))
 695                 # approximate error calculation, good enough test
 696                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 697                 # and the rounding is different
 698                 err = abs(actual_i - expected_i ) / expected_i
 699                 self.assertTrue(err < 1e-6)
 700
 701     def test_sv_ffadds_fft_scalar(self):
 702         """>>> lst = ["sv.ffadds *2, 12, 13"
 703                         ]
 704             four in-place vector adds and subs, but done with a scalar
 705             pair (fp12, fp13)
 706         """
 707         lst = SVP64Asm(["sv.ffadds *2, 12, 13"
 708                         ])
 709         lst = list(lst)
 710
 711         fprs = [0] * 32
 712         scalar_a = 1.3
 713         scalar_b = -2.0
 714         fprs[12] = fp64toselectable(scalar_a)
 715         fprs[13] = fp64toselectable(scalar_b)
 716         res = []
 717         # work out the results with the twin add-sub
 718         for i in range(4):
 719             t = scalar_b + scalar_a
 720             u = scalar_b - scalar_a
 721             t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 722             u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
 723             res.append((t, u))
 724             print ("FFT", i, "res", t, u)
 725
 726         # SVSTATE (in this case, VL=2)
 727         svstate = SVP64State()
 728         svstate.vl = 4 # VL
 729         svstate.maxvl = 4 # MAXVL
 730         print ("SVSTATE", bin(svstate.asint()))
 731
 732         with Program(lst, bigendian=False) as program:
 733             sim = self.run_tst_program(program, svstate=svstate,
 734                                        initial_fprs=fprs)
 735             # confirm that the results are as expected
 736             for i, (t, u) in enumerate(res):
 737                 a = float(sim.fpr(i+2))
 738                 b = float(sim.fpr(i+6))
 739                 t = float(t)
 740                 u = float(u)
 741                 print ("FFT", i, "in", a, b, "res", t, u)
 742             for i, (t, u) in enumerate(res):
 743                 self.assertEqual(sim.fpr(i+2), t)
 744                 self.assertEqual(sim.fpr(i+6), u)
 745
 746     def test_sv_remap_fpmadds_fft_ldst(self):
 747         """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
 748                          "sv.lfs/els *0, 4(0)",
 749                          "svshape 8, 1, 1, 1, 0",
 750                          "svremap 31, 1, 0, 2, 0, 1, 0",
 751                          "sv.ffmadds *0, *0, *0, *8"
 752
 753             runs a full in-place O(N log2 N) butterfly schedule for
 754             Discrete Fourier Transform, using bit-reversed LD/ST
 755         """
 756         lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0",
 757                          "svremap 1, 0, 0, 0, 0, 0, 0",
 758                          "sv.lfs/els *0, 4(0)",
 759                          "svshape 8, 1, 1, 1, 0",
 760                          "svremap 31, 1, 0, 2, 0, 1, 0",
 761                          "sv.ffmadds *0, *0, *0, *8"
 762                         ])
 763         lst = list(lst)
 764
 765         # array and coefficients to test
 766         av = [7.0, -9.8, 3.0, -32.3,
 767               -2.0, 5.0, -9.8, 31.3] # array 0..7
 768         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 769
 770         # store in regfile
 771         fprs = [0] * 32
 772         for i, c in enumerate(coe):
 773             fprs[i+8] = fp64toselectable(c)
 774         # store in memory
 775         mem = {}
 776         val = 0
 777         for i, a in enumerate(av):
 778             a = SINGLE(fp64toselectable(a)).value
 779             shift = (i % 2) == 1
 780             if shift == 0:
 781                 val = a
 782             else:
 783                 mem[(i//2)*8] = val | (a << 32)
 784
 785         with Program(lst, bigendian=False) as program:
 786             sim = self.run_tst_program(program, initial_mem=mem,
 787                                                 initial_fprs=fprs)
 788             print ("spr svshape0", sim.spr['SVSHAPE0'])
 789             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 790             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 791             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 792             print ("spr svshape1", sim.spr['SVSHAPE1'])
 793             print ("spr svshape2", sim.spr['SVSHAPE2'])
 794             print ("spr svshape3", sim.spr['SVSHAPE3'])
 795
 796             print ("mem dump")
 797             print (sim.mem.dump())
 798
 799             # work out the results with the twin mul/add-sub,
 800             # note bit-reverse mode requested
 801             res = transform_radix2(av, coe, reverse=True)
 802
 803             for i, expected in enumerate(res):
 804                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 805             for i, expected in enumerate(res):
 806                 # convert to Power single
 807                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 808                 expected = float(expected)
 809                 actual = float(sim.fpr(i))
 810                 # approximate error calculation, good enough test
 811                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 812                 # and the rounding is different
 813                 err = abs(actual - expected) / expected
 814                 self.assertTrue(err < 1e-6)
 815
 816     def run_tst_program(self, prog, initial_regs=None,
 817                               svstate=None,
 818                               initial_mem=None,
 819                               initial_fprs=None):
 820         if initial_regs is None:
 821             initial_regs = [0] * 32
 822         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 823                                                 initial_fprs=initial_fprs,
 824                                                 svstate=svstate)
 825
 826         print ("GPRs")
 827         simulator.gpr.dump()
 828         print ("FPRs")
 829         simulator.fpr.dump()
 830
 831         return simulator
 832
 833
 834 if __name__ == "__main__":
 835     unittest.main()