src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 from nmigen import Module, Signal
   2 from nmigen.back.pysim import Simulator, Delay, Settle
   3 from nmutil.formaltest import FHDLTestCase
   4 import unittest
   5 from openpower.decoder.power_decoder import (create_pdecode)
   6 from openpower.simulator.program import Program
   7 from openpower.decoder.isa.caller import SVP64State
   8 from openpower.decoder.selectable_int import SelectableInt
   9 from openpower.decoder.isa.test_caller import run_tst
  10 from openpower.sv.trans.svp64 import SVP64Asm
  11 from copy import deepcopy
  12 from openpower.decoder.helpers import fp64toselectable, SINGLE
  13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
  14
  15
  16 def transform_radix2(vec, exptable, reverse=False):
  17     """
  18     # FFT and convolution test (Python), based on Project Nayuki
  19     #
  20     # Copyright (c) 2020 Project Nayuki. (MIT License)
  21     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  22
  23     """
  24     # bits of the integer 'val'.
  25     def reverse_bits(val, width):
  26         result = 0
  27         for _ in range(width):
  28             result = (result << 1) | (val & 1)
  29             val >>= 1
  30         return result
  31
  32     # Initialization
  33     n = len(vec)
  34     levels = n.bit_length() - 1
  35
  36     # Copy with bit-reversed permutation
  37     if reverse:
  38         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  39
  40     size = 2
  41     while size <= n:
  42         halfsize = size // 2
  43         tablestep = n // size
  44         for i in range(0, n, size):
  45             k = 0
  46             for j in range(i, i + halfsize):
  47                 # exact same actual computation, just embedded in
  48                 # triple-nested for-loops
  49                 jl, jh = j, j+halfsize
  50                 vjh = vec[jh]
  51                 temp1 = vec[jh] * exptable[k]
  52                 temp2 = vec[jl]
  53                 vec[jh] = temp2 - temp1
  54                 vec[jl] = temp2 + temp1
  55                 print ("xform jl jh k", jl, jh, k,
  56                        "vj vjh ek", temp2, vjh, exptable[k],
  57                        "t1, t2", temp1, temp2,
  58                        "v[jh] v[jl]", vec[jh], vec[jl])
  59                 k += tablestep
  60         size *= 2
  61
  62     return vec
  63
  64
  65 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
  66     """
  67     # FFT and convolution test (Python), based on Project Nayuki
  68     #
  69     # Copyright (c) 2020 Project Nayuki. (MIT License)
  70     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  71
  72     """
  73     # bits of the integer 'val'.
  74     def reverse_bits(val, width):
  75         result = 0
  76         for _ in range(width):
  77             result = (result << 1) | (val & 1)
  78             val >>= 1
  79         return result
  80
  81     # Initialization
  82     n = len(vec_r)
  83     levels = n.bit_length() - 1
  84
  85     # Copy with bit-reversed permutation
  86     if reverse:
  87         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  88
  89     size = 2
  90     while size <= n:
  91         halfsize = size // 2
  92         tablestep = n // size
  93         for i in range(0, n, size):
  94             k = 0
  95             for j in range(i, i + halfsize):
  96                 # exact same actual computation, just embedded in
  97                 # triple-nested for-loops
  98                 jl, jh = j, j+halfsize
  99
 100                 print ("xform jl jh k", jl, jh, k,
 101                         "vr h l", vec_r[jh], vec_r[jl],
 102                         "vi h l", vec_i[jh], vec_i[jl])
 103                 print ("    cr k", cos_r[k], "si k", sin_i[k])
 104                 mul1_r =  vec_r[jh] * cos_r[k]
 105                 mul2_r = vec_i[jh] * sin_i[k]
 106                 tpre =  mul1_r + mul2_r
 107                 print ("        vec_r[jh] * cos_r[k]", mul1_r)
 108                 print ("        vec_i[jh] * sin_i[k]", mul2_r)
 109                 print ("    tpre", tpre)
 110                 mul1_i = vec_r[jh] * sin_i[k]
 111                 mul2_i = vec_i[jh] * cos_r[k]
 112                 tpim = -mul1_i + mul2_i
 113                 print ("        vec_r[jh] * sin_i[k]", mul1_i)
 114                 print ("        vec_i[jh] * cos_r[k]", mul2_i)
 115                 print ("    tpim", tpim)
 116                 vec_r[jh] = vec_r[jl] - tpre
 117                 vec_i[jh] = vec_i[jl] - tpim
 118                 vec_r[jl] += tpre
 119                 vec_i[jl] += tpim
 120
 121                 print ("    xform jl jh k", jl, jh, k,
 122                         "\n       vr h l", vec_r[jh], vec_r[jl],
 123                         "\n       vi h l", vec_i[jh], vec_i[jl])
 124                 k += tablestep
 125         size *= 2
 126
 127     return vec_r, vec_i
 128
 129
 130 class FFTTestCase(FHDLTestCase):
 131
 132     def _check_regs(self, sim, expected):
 133         for i in range(32):
 134             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 135
 136     def test_sv_remap_fpmadds_fft(self):
 137         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 138                      "svremap 31, 1, 0, 2, 0, 1, 0",
 139                       "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 140                      ]
 141             runs a full in-place O(N log2 N) butterfly schedule for
 142             Discrete Fourier Transform.
 143
 144             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 145             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 146
 147             there is the *option* to target a different location (non-in-place)
 148             just in case.
 149
 150             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 151             (3 inputs, 2 outputs)
 152         """
 153         lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
 154                          "svremap 31, 1, 0, 2, 0, 1, 0",
 155                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 156                         ])
 157         lst = list(lst)
 158
 159         # array and coefficients to test
 160         av = [7.0, -9.8, 3.0, -32.3,
 161               -2.0, 5.0, -9.8, 31.3] # array 0..7
 162         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 163
 164         # store in regfile
 165         fprs = [0] * 32
 166         for i, c in enumerate(coe):
 167             fprs[i+8] = fp64toselectable(c)
 168         for i, a in enumerate(av):
 169             fprs[i+0] = fp64toselectable(a)
 170
 171         with Program(lst, bigendian=False) as program:
 172             sim = self.run_tst_program(program, initial_fprs=fprs)
 173             print ("spr svshape0", sim.spr['SVSHAPE0'])
 174             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 175             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 176             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 177             print ("spr svshape1", sim.spr['SVSHAPE1'])
 178             print ("spr svshape2", sim.spr['SVSHAPE2'])
 179             print ("spr svshape3", sim.spr['SVSHAPE3'])
 180
 181             # work out the results with the twin mul/add-sub
 182             res = transform_radix2(av, coe)
 183
 184             for i, expected in enumerate(res):
 185                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 186             for i, expected in enumerate(res):
 187                 # convert to Power single
 188                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 189                 expected = float(expected)
 190                 actual = float(sim.fpr(i))
 191                 # approximate error calculation, good enough test
 192                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 193                 # and the rounding is different
 194                 err = abs(actual - expected) / expected
 195                 self.assertTrue(err < 1e-7)
 196
 197     def test_sv_remap_fpmadds_fft_svstep(self):
 198         """>>> lst = SVP64Asm( [
 199                             "svshape 8, 1, 1, 1, 1",
 200                              "svremap 31, 1, 0, 2, 0, 1, 0",
 201                             "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 202                             "setvl. 0, 0, 1, 1, 0, 0",
 203                             "bc 6, 3, -16"
 204                             ])
 205             runs a full in-place O(N log2 N) butterfly schedule for
 206             Discrete Fourier Transform.  this version however uses
 207             SVP64 "Vertical-First" Mode and so needs an explicit
 208             branch, testing CR0.
 209
 210             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 211             (3 inputs, 2 outputs)
 212         """
 213         lst = SVP64Asm( [
 214                         "svshape 8, 1, 1, 1, 1",
 215                          "svremap 31, 1, 0, 2, 0, 1, 0",
 216                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 217                         "setvl. 0, 0, 1, 1, 0, 0",
 218                         "bc 6, 3, -16"
 219                         ])
 220         lst = list(lst)
 221
 222         # array and coefficients to test
 223         av = [7.0, -9.8, 3.0, -32.3,
 224               -2.0, 5.0, -9.8, 31.3] # array 0..7
 225         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 226
 227         # store in regfile
 228         fprs = [0] * 32
 229         for i, c in enumerate(coe):
 230             fprs[i+8] = fp64toselectable(c)
 231         for i, a in enumerate(av):
 232             fprs[i+0] = fp64toselectable(a)
 233
 234         # set total. err don't know how to calculate how many there are...
 235         # do it manually for now
 236         VL = 0
 237         size = 2
 238         n = len(av)
 239         while size <= n:
 240             halfsize = size // 2
 241             tablestep = n // size
 242             for i in range(0, n, size):
 243                 for j in range(i, i + halfsize):
 244                     VL += 1
 245             size *= 2
 246
 247         # SVSTATE (calculated VL)
 248         svstate = SVP64State()
 249         svstate.vl = VL # VL
 250         svstate.maxvl = VL # MAXVL
 251         print ("SVSTATE", bin(svstate.asint()))
 252
 253         with Program(lst, bigendian=False) as program:
 254             sim = self.run_tst_program(program, svstate=svstate,
 255                                        initial_fprs=fprs)
 256             print ("spr svshape0", sim.spr['SVSHAPE0'])
 257             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 258             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 259             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 260             print ("spr svshape1", sim.spr['SVSHAPE1'])
 261             print ("spr svshape2", sim.spr['SVSHAPE2'])
 262             print ("spr svshape3", sim.spr['SVSHAPE3'])
 263
 264             # work out the results with the twin mul/add-sub
 265             res = transform_radix2(av, coe)
 266
 267             for i, expected in enumerate(res):
 268                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 269             for i, expected in enumerate(res):
 270                 # convert to Power single
 271                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 272                 expected = float(expected)
 273                 actual = float(sim.fpr(i))
 274                 # approximate error calculation, good enough test
 275                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 276                 # and the rounding is different
 277                 err = abs(actual - expected) / expected
 278                 self.assertTrue(err < 1e-7)
 279
 280     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 281         """>>> lst = SVP64Asm( [
 282                         "svshape 8, 1, 1, 1, 1",
 283                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 284                          "svremap 5, 1, 0, 2, 0, 0, 1",
 285                          "sv.fmuls 24, 0.v, 8.v",
 286                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 287                          "svremap 26, 0, 0, 0, 0, 1, 1",
 288                         "sv.ffadds 0.v, 24, 0.v",
 289                         "setvl. 0, 0, 1, 1, 0, 0",
 290                         "bc 6, 3, -28"
 291                             ])
 292
 293             runs a full in-place O(N log2 N) butterfly schedule for
 294             Discrete Fourier Transform.  also uses "Vertical First"
 295             but also uses temporary scalars and ffadds rather than
 296             sv.ffmadds.
 297
 298             this represents an incremental step towards complex FFT
 299
 300             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 301
 302             * single fmuls FRT, FRA, FRC
 303             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 304               (FRS is implicit / hidden in ff* operations)
 305
 306             multiply:                         # sv.fmuls FRT, FRA, FRC
 307                 temp1 = vec[jh] * exptable[k]
 308                 temp2 = vec[jl]
 309             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 310                 vec[jh] = temp2 - temp1
 311                 vec[jl] = temp2 + temp1
 312
 313             also see notes in complex fft test: here svremap is done in
 314             "non-persistent" mode (as a demo) whereas in the complex fft
 315             svremap is used in "persistent" mode, where by a complete
 316             coincidence the REMAP arguments all happen to line up and
 317             only one persistent svremap is needed.  the exact same trick
 318             *could* be applied here but for illustrative purposes it is not.
 319         """
 320         lst = SVP64Asm( [
 321                         "svshape 8, 1, 1, 1, 1",
 322                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 323                          "svremap 5, 1, 0, 2, 0, 0, 0",
 324                          "sv.fmuls 24, 0.v, 8.v",
 325                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 326                          "svremap 26, 0, 0, 0, 0, 1, 0",
 327                         "sv.ffadds 0.v, 24, 0.v",
 328                         "setvl. 0, 0, 1, 1, 0, 0",
 329                         "bc 6, 3, -28"
 330                         ])
 331         lst = list(lst)
 332
 333         # array and coefficients to test
 334         av = [7.0, -9.8, 3.0, -32.3,
 335               -2.0, 5.0, -9.8, 31.3] # array 0..7
 336         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 337
 338         # store in regfile
 339         fprs = [0] * 32
 340         for i, c in enumerate(coe):
 341             fprs[i+8] = fp64toselectable(c)
 342         for i, a in enumerate(av):
 343             fprs[i+0] = fp64toselectable(a)
 344
 345         # set total. err don't know how to calculate how many there are...
 346         # do it manually for now
 347         VL = 0
 348         size = 2
 349         n = len(av)
 350         while size <= n:
 351             halfsize = size // 2
 352             tablestep = n // size
 353             for i in range(0, n, size):
 354                 for j in range(i, i + halfsize):
 355                     VL += 1
 356             size *= 2
 357
 358         # SVSTATE (calculated VL)
 359         svstate = SVP64State()
 360         svstate.vl = VL # VL
 361         svstate.maxvl = VL # MAXVL
 362         print ("SVSTATE", bin(svstate.asint()))
 363
 364         with Program(lst, bigendian=False) as program:
 365             sim = self.run_tst_program(program, svstate=svstate,
 366                                        initial_fprs=fprs)
 367             print ("spr svshape0", sim.spr['SVSHAPE0'])
 368             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 369             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 370             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 371             print ("spr svshape1", sim.spr['SVSHAPE1'])
 372             print ("spr svshape2", sim.spr['SVSHAPE2'])
 373             print ("spr svshape3", sim.spr['SVSHAPE3'])
 374
 375             # work out the results with the twin mul/add-sub
 376             res = transform_radix2(av, coe)
 377
 378             for i, expected in enumerate(res):
 379                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 380             for i, expected in enumerate(res):
 381                 # convert to Power single
 382                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 383                 expected = float(expected)
 384                 actual = float(sim.fpr(i))
 385                 # approximate error calculation, good enough test
 386                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 387                 # and the rounding is different
 388                 err = abs(actual - expected) / expected
 389                 self.assertTrue(err < 1e-7)
 390
 391     def test_sv_fpmadds_fft(self):
 392         """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 393                         ]
 394             four in-place vector mul-adds, four in-place vector mul-subs
 395
 396             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 397             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 398
 399             there is the *option* to target a different location (non-in-place)
 400             just in case.
 401
 402             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 403             FRS to perform the two multiplies.  one add, one subtract.
 404
 405             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 406                 fmadds  FRT   , FRA, FRC, FRA
 407                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 408
 409         """
 410         lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 411                         ])
 412         lst = list(lst)
 413
 414         fprs = [0] * 32
 415         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 416         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 417         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 418         res = []
 419         # work out the results with the twin mul/add-sub
 420         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 421             fprs[i+2] = fp64toselectable(a)
 422             fprs[i+6] = fp64toselectable(b)
 423             fprs[i+10] = fp64toselectable(c)
 424             mul = a * c
 425             t = b + mul
 426             u = b - mul
 427             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 428             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 429             res.append((t, u))
 430             print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 431
 432         # SVSTATE (in this case, VL=2)
 433         svstate = SVP64State()
 434         svstate.vl = 4 # VL
 435         svstate.maxvl = 4 # MAXVL
 436         print ("SVSTATE", bin(svstate.asint()))
 437
 438         with Program(lst, bigendian=False) as program:
 439             sim = self.run_tst_program(program, svstate=svstate,
 440                                        initial_fprs=fprs)
 441             # confirm that the results are as expected
 442             for i, (t, u) in enumerate(res):
 443                 self.assertEqual(sim.fpr(i+2), t)
 444                 self.assertEqual(sim.fpr(i+6), u)
 445
 446     def test_sv_ffadds_fft(self):
 447         """>>> lst = ["sv.ffadds 2.v, 2.v, 2.v"
 448                         ]
 449             four in-place vector adds, four in-place vector subs
 450
 451             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 452             FRS to perform the two multiplies.  one add, one subtract.
 453
 454             sv.ffadds FRT, FRA, FRB  actually does:
 455                 fadds FRT   , FRB, FRA
 456                 fsubs FRT+vl, FRA, FRB+vl
 457         """
 458         lst = SVP64Asm(["sv.ffadds 2.v, 2.v, 2.v"
 459                         ])
 460         lst = list(lst)
 461
 462         fprs = [0] * 32
 463         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 464         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 465         res = []
 466         # work out the results with the twin add-sub
 467         for i, (a, b) in enumerate(zip(av, bv)):
 468             fprs[i+2] = fp64toselectable(a)
 469             fprs[i+6] = fp64toselectable(b)
 470             t = b + a
 471             u = b - a
 472             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 473             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 474             res.append((t, u))
 475             print ("FFT", i, "in", a, b, "res", t, u)
 476
 477         # SVSTATE (in this case, VL=2)
 478         svstate = SVP64State()
 479         svstate.vl = 4 # VL
 480         svstate.maxvl = 4 # MAXVL
 481         print ("SVSTATE", bin(svstate.asint()))
 482
 483         with Program(lst, bigendian=False) as program:
 484             sim = self.run_tst_program(program, svstate=svstate,
 485                                        initial_fprs=fprs)
 486             # confirm that the results are as expected
 487             for i, (t, u) in enumerate(res):
 488                 a = float(sim.fpr(i+2))
 489                 b = float(sim.fpr(i+6))
 490                 t = float(t)
 491                 u = float(u)
 492                 print ("FFT", i, "in", a, b, "res", t, u)
 493             for i, (t, u) in enumerate(res):
 494                 self.assertEqual(sim.fpr(i+2), t)
 495                 self.assertEqual(sim.fpr(i+6), u)
 496
 497     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 498         """
 499             runs a full in-place O(N log2 N) butterfly schedule for
 500             Discrete Fourier Transform.  this version however uses
 501             SVP64 "Vertical-First" Mode and so needs an explicit
 502             branch, testing CR0.
 503
 504             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 505             (3 inputs, 2 outputs)
 506
 507             complex calculation (FFT):
 508
 509                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 510                 vec_r[jh] = vec_r[jl] - tpre
 511                 vec_r[jl] += tpre
 512
 513                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 514                 vec_i[jh] = vec_i[jl] - tpim
 515                 vec_i[jl] += tpim
 516
 517             real-only calculation (DFT):
 518
 519                 temp1 = vec[jh] * exptable[k]
 520                 temp2 = vec[jl]
 521                 vec[jh] = temp2 - temp1
 522                 vec[jl] = temp2 + temp1
 523
 524             note: a rather nice convenience / coincidence. the meaning of
 525             these two instructions is:
 526                 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 527                 "svremap 5, 1, 0, 2, 0, 0, 1",
 528                 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 529                 "svremap 26, 0, 0, 0, 0, 1, 1",
 530
 531             however it turns out that they can be *merged*, and for
 532             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
 533             *ignore* their REMAPs (by definition, because you can't REMAP
 534             scalar operands), and for the second one (sv.ffads) exactly the
 535             right REMAPs are also ignored!
 536
 537             therefore we can merge:
 538                 "svremap 5, 1, 0, 2, 0, 0, 1",
 539                 "svremap 26, 0, 0, 0, 0, 1, 1",
 540             into:
 541                 "svremap 31, 1, 0, 2, 0, 1, 1",
 542             and save one instruction.
 543         """
 544         lst = SVP64Asm( [
 545                         # set triple butterfly mode with persistent "REMAP"
 546                         "svshape 8, 1, 1, 1, 1",
 547                         "svremap 31, 1, 0, 2, 0, 1, 1",
 548                         # tpre
 549                         "sv.fmuls 24, 0.v, 16.v",    # mul1_r = r*cos_r
 550                         "sv.fmadds 24, 8.v, 20.v, 24", # mul2_r = i*sin_i
 551                                                      # tpre = mul1_r + mul2_r
 552                         # tpim
 553                         "sv.fmuls 26, 0.v, 20.v",    # mul1_i = r*sin_i
 554                         "sv.fmsubs 26, 8.v, 16.v, 26", # mul2_i = i*cos_r
 555                                                      # tpim = mul2_i - mul1_i
 556                         # vec_r jh/jl
 557                         "sv.ffadds 0.v, 24, 0.v",    # vh/vl +/- tpre
 558                         # vec_i jh/jl
 559                         "sv.ffadds 8.v, 26, 8.v",    # vh/vl +- tpim
 560
 561                         # svstep loop
 562                         "setvl. 0, 0, 1, 1, 0, 0",
 563                         "bc 6, 3, -56"
 564                         ])
 565         lst = list(lst)
 566
 567         # array and coefficients to test
 568         ar = [7.0, -9.8, 3.0, -32.3,
 569               -2.0, 5.0, -9.8, 31.3] # array 0..7 real
 570         ai = [1.0, -1.8, 3.0, 19.3,
 571               4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
 572         coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
 573         coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
 574
 575         # store in regfile
 576         fprs = [0] * 64
 577         for i, a in enumerate(ar):
 578             fprs[i+0] = fp64toselectable(a)
 579         for i, a in enumerate(ai):
 580             fprs[i+8] = fp64toselectable(a)
 581         for i, cr in enumerate(coer):
 582             fprs[i+16] = fp64toselectable(cr)
 583         for i, ci in enumerate(coei):
 584             fprs[i+20] = fp64toselectable(ci)
 585
 586         # set total. err don't know how to calculate how many there are...
 587         # do it manually for now
 588         VL = 0
 589         size = 2
 590         n = len(ar)
 591         while size <= n:
 592             halfsize = size // 2
 593             tablestep = n // size
 594             for i in range(0, n, size):
 595                 for j in range(i, i + halfsize):
 596                     VL += 1
 597             size *= 2
 598
 599         # SVSTATE (calculated VL)
 600         svstate = SVP64State()
 601         svstate.vl = VL # VL
 602         svstate.maxvl = VL # MAXVL
 603         print ("SVSTATE", bin(svstate.asint()))
 604
 605         with Program(lst, bigendian=False) as program:
 606             sim = self.run_tst_program(program, svstate=svstate,
 607                                        initial_fprs=fprs)
 608             print ("spr svshape0", sim.spr['SVSHAPE0'])
 609             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 610             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 611             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 612             print ("spr svshape1", sim.spr['SVSHAPE1'])
 613             print ("spr svshape2", sim.spr['SVSHAPE2'])
 614             print ("spr svshape3", sim.spr['SVSHAPE3'])
 615
 616             # work out the results with the twin mul/add-sub, explicit
 617             # complex numbers
 618             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 619
 620             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 621                 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 622                        "expected_r", expected_r,
 623                        "expected_i", expected_i)
 624             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 625                 # convert to Power single
 626                 expected_r = DOUBLE2SINGLE(fp64toselectable(expected_r ))
 627                 expected_r = float(expected_r)
 628                 actual_r = float(sim.fpr(i))
 629                 # approximate error calculation, good enough test
 630                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 631                 # and the rounding is different
 632                 err = abs(actual_r - expected_r ) / expected_r
 633                 self.assertTrue(err < 1e-6)
 634                 # convert to Power single
 635                 expected_i = DOUBLE2SINGLE(fp64toselectable(expected_i ))
 636                 expected_i = float(expected_i)
 637                 actual_i = float(sim.fpr(i+8))
 638                 # approximate error calculation, good enough test
 639                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 640                 # and the rounding is different
 641                 err = abs(actual_i - expected_i ) / expected_i
 642                 self.assertTrue(err < 1e-6)
 643
 644     def test_sv_ffadds_fft_scalar(self):
 645         """>>> lst = ["sv.ffadds 2.v, 12, 13"
 646                         ]
 647             four in-place vector adds and subs, but done with a scalar
 648             pair (fp12, fp13)
 649         """
 650         lst = SVP64Asm(["sv.ffadds 2.v, 12, 13"
 651                         ])
 652         lst = list(lst)
 653
 654         fprs = [0] * 32
 655         scalar_a = 1.3
 656         scalar_b = -2.0
 657         fprs[12] = fp64toselectable(scalar_a)
 658         fprs[13] = fp64toselectable(scalar_b)
 659         res = []
 660         # work out the results with the twin add-sub
 661         for i in range(4):
 662             t = scalar_b + scalar_a
 663             u = scalar_b - scalar_a
 664             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 665             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 666             res.append((t, u))
 667             print ("FFT", i, "res", t, u)
 668
 669         # SVSTATE (in this case, VL=2)
 670         svstate = SVP64State()
 671         svstate.vl = 4 # VL
 672         svstate.maxvl = 4 # MAXVL
 673         print ("SVSTATE", bin(svstate.asint()))
 674
 675         with Program(lst, bigendian=False) as program:
 676             sim = self.run_tst_program(program, svstate=svstate,
 677                                        initial_fprs=fprs)
 678             # confirm that the results are as expected
 679             for i, (t, u) in enumerate(res):
 680                 a = float(sim.fpr(i+2))
 681                 b = float(sim.fpr(i+6))
 682                 t = float(t)
 683                 u = float(u)
 684                 print ("FFT", i, "in", a, b, "res", t, u)
 685             for i, (t, u) in enumerate(res):
 686                 self.assertEqual(sim.fpr(i+2), t)
 687                 self.assertEqual(sim.fpr(i+6), u)
 688
 689     def test_sv_remap_fpmadds_fft_ldst(self):
 690         """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
 691                          "sv.lfssh 0.v, 4(0), 20", # bit-reversed
 692                          "svshape 8, 1, 1, 1, 0",
 693                          "svremap 31, 1, 0, 2, 0, 1, 0",
 694                          "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 695
 696             runs a full in-place O(N log2 N) butterfly schedule for
 697             Discrete Fourier Transform, using bit-reversed LD/ST
 698         """
 699         lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0",
 700                          "svremap 1, 0, 0, 0, 0, 0, 0, 0",
 701                          "sv.lfssh 0.v, 4(0), 20", # shifted
 702                          "svshape 8, 1, 1, 1, 0",
 703                          "svremap 31, 1, 0, 2, 0, 1, 0",
 704                          "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 705                         ])
 706         lst = list(lst)
 707
 708         # array and coefficients to test
 709         av = [7.0, -9.8, 3.0, -32.3,
 710               -2.0, 5.0, -9.8, 31.3] # array 0..7
 711         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 712
 713         # store in regfile
 714         fprs = [0] * 32
 715         for i, c in enumerate(coe):
 716             fprs[i+8] = fp64toselectable(c)
 717         # store in memory
 718         mem = {}
 719         val = 0
 720         for i, a in enumerate(av):
 721             a = SINGLE(fp64toselectable(a)).value
 722             shift = (i % 2) == 1
 723             if shift == 0:
 724                 val = a
 725             else:
 726                 mem[(i//2)*8] = val | (a << 32)
 727
 728         with Program(lst, bigendian=False) as program:
 729             sim = self.run_tst_program(program, initial_mem=mem,
 730                                                 initial_fprs=fprs)
 731             print ("spr svshape0", sim.spr['SVSHAPE0'])
 732             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 733             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 734             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 735             print ("spr svshape1", sim.spr['SVSHAPE1'])
 736             print ("spr svshape2", sim.spr['SVSHAPE2'])
 737             print ("spr svshape3", sim.spr['SVSHAPE3'])
 738
 739             print ("mem dump")
 740             print (sim.mem.dump())
 741
 742             # work out the results with the twin mul/add-sub,
 743             # note bit-reverse mode requested
 744             res = transform_radix2(av, coe, reverse=True)
 745
 746             for i, expected in enumerate(res):
 747                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 748             for i, expected in enumerate(res):
 749                 # convert to Power single
 750                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 751                 expected = float(expected)
 752                 actual = float(sim.fpr(i))
 753                 # approximate error calculation, good enough test
 754                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 755                 # and the rounding is different
 756                 err = abs(actual - expected) / expected
 757                 self.assertTrue(err < 1e-6)
 758
 759     def run_tst_program(self, prog, initial_regs=None,
 760                               svstate=None,
 761                               initial_mem=None,
 762                               initial_fprs=None):
 763         if initial_regs is None:
 764             initial_regs = [0] * 32
 765         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 766                                                 initial_fprs=initial_fprs,
 767                                                 svstate=svstate)
 768
 769         print ("GPRs")
 770         simulator.gpr.dump()
 771         print ("FPRs")
 772         simulator.fpr.dump()
 773
 774         return simulator
 775
 776
 777 if __name__ == "__main__":
 778     unittest.main()