src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 from nmigen import Module, Signal
   2 from nmigen.back.pysim import Simulator, Delay, Settle
   3 from nmutil.formaltest import FHDLTestCase
   4 import unittest
   5 from openpower.decoder.power_decoder import (create_pdecode)
   6 from openpower.simulator.program import Program
   7 from openpower.decoder.isa.caller import SVP64State
   8 from openpower.decoder.selectable_int import SelectableInt
   9 from openpower.decoder.isa.test_caller import run_tst
  10 from openpower.sv.trans.svp64 import SVP64Asm
  11 from copy import deepcopy
  12 from openpower.decoder.helpers import fp64toselectable
  13 from openpower.decoder.isafunctions.double2single import DOUBLE2SINGLE
  14
  15
  16 def transform_radix2(vec, exptable):
  17     """
  18     # FFT and convolution test (Python), based on Project Nayuki
  19     #
  20     # Copyright (c) 2020 Project Nayuki. (MIT License)
  21     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  22
  23     """
  24     # bits of the integer 'val'.
  25     def reverse_bits(val, width):
  26         result = 0
  27         for _ in range(width):
  28             result = (result << 1) | (val & 1)
  29             val >>= 1
  30         return result
  31
  32     # Initialization
  33     n = len(vec)
  34     levels = n.bit_length() - 1
  35
  36     # Copy with bit-reversed permutation
  37     #vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  38
  39     size = 2
  40     while size <= n:
  41         halfsize = size // 2
  42         tablestep = n // size
  43         for i in range(0, n, size):
  44             k = 0
  45             for j in range(i, i + halfsize):
  46                 # exact same actual computation, just embedded in
  47                 # triple-nested for-loops
  48                 jl, jh = j, j+halfsize
  49                 vjh = vec[jh]
  50                 temp1 = vec[jh] * exptable[k]
  51                 temp2 = vec[jl]
  52                 vec[jh] = temp2 - temp1
  53                 vec[jl] = temp2 + temp1
  54                 print ("xform jl jh k", jl, jh, k,
  55                        "vj vjh ek", temp2, vjh, exptable[k],
  56                        "t1, t2", temp1, temp2,
  57                        "v[jh] v[jl]", vec[jh], vec[jl])
  58                 k += tablestep
  59         size *= 2
  60
  61     return vec
  62
  63
  64 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i):
  65     """
  66     # FFT and convolution test (Python), based on Project Nayuki
  67     #
  68     # Copyright (c) 2020 Project Nayuki. (MIT License)
  69     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  70
  71     """
  72     # bits of the integer 'val'.
  73     def reverse_bits(val, width):
  74         result = 0
  75         for _ in range(width):
  76             result = (result << 1) | (val & 1)
  77             val >>= 1
  78         return result
  79
  80     # Initialization
  81     n = len(vec_r)
  82     levels = n.bit_length() - 1
  83
  84     # Copy with bit-reversed permutation
  85     #vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  86
  87     size = 2
  88     while size <= n:
  89         halfsize = size // 2
  90         tablestep = n // size
  91         for i in range(0, n, size):
  92             k = 0
  93             for j in range(i, i + halfsize):
  94                 # exact same actual computation, just embedded in
  95                 # triple-nested for-loops
  96                 jl, jh = j, j+halfsize
  97
  98                 print ("xform jl jh k", jl, jh, k,
  99                         "vr h l", vec_r[jh], vec_r[jl],
 100                         "vi h l", vec_i[jh], vec_i[jl])
 101                 print ("    cr k", cos_r[k], "si k", sin_i[k])
 102                 mul1_r =  vec_r[jh] * cos_r[k]
 103                 mul2_r = vec_i[jh] * sin_i[k]
 104                 tpre =  mul1_r + mul2_r
 105                 print ("        vec_r[jh] * cos_r[k]", mul1_r)
 106                 print ("        vec_i[jh] * sin_i[k]", mul2_r)
 107                 print ("    tpre", tpre)
 108                 mul1_i = vec_r[jh] * sin_i[k]
 109                 mul2_i = vec_i[jh] * cos_r[k]
 110                 tpim = -mul1_i + mul2_i
 111                 print ("        vec_r[jh] * sin_i[k]", mul1_i)
 112                 print ("        vec_i[jh] * cos_r[k]", mul2_i)
 113                 print ("    tpim", tpim)
 114                 vec_r[jh] = vec_r[jl] - tpre
 115                 vec_i[jh] = vec_i[jl] - tpim
 116                 vec_r[jl] += tpre
 117                 vec_i[jl] += tpim
 118
 119                 print ("    xform jl jh k", jl, jh, k,
 120                         "\n       vr h l", vec_r[jh], vec_r[jl],
 121                         "\n       vi h l", vec_i[jh], vec_i[jl])
 122                 k += tablestep
 123         size *= 2
 124
 125     return vec_r, vec_i
 126
 127
 128 class FFTTestCase(FHDLTestCase):
 129
 130     def _check_regs(self, sim, expected):
 131         for i in range(32):
 132             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 133
 134     def test_sv_remap_fpmadds_fft(self):
 135         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 136                      "svremap 31, 1, 0, 2, 0, 1",
 137                       "sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 138                      ]
 139             runs a full in-place O(N log2 N) butterfly schedule for
 140             Discrete Fourier Transform.
 141
 142             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 143             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 144
 145             there is the *option* to target a different location (non-in-place)
 146             just in case.
 147
 148             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 149             (3 inputs, 2 outputs)
 150         """
 151         lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
 152                          "svremap 31, 1, 0, 2, 0, 1",
 153                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v"
 154                         ])
 155         lst = list(lst)
 156
 157         # array and coefficients to test
 158         av = [7.0, -9.8, 3.0, -32.3,
 159               -2.0, 5.0, -9.8, 31.3] # array 0..7
 160         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 161
 162         # store in regfile
 163         fprs = [0] * 32
 164         for i, c in enumerate(coe):
 165             fprs[i+8] = fp64toselectable(c)
 166         for i, a in enumerate(av):
 167             fprs[i+0] = fp64toselectable(a)
 168
 169         with Program(lst, bigendian=False) as program:
 170             sim = self.run_tst_program(program, initial_fprs=fprs)
 171             print ("spr svshape0", sim.spr['SVSHAPE0'])
 172             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 173             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 174             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 175             print ("spr svshape1", sim.spr['SVSHAPE1'])
 176             print ("spr svshape2", sim.spr['SVSHAPE2'])
 177             print ("spr svshape3", sim.spr['SVSHAPE3'])
 178
 179             # work out the results with the twin mul/add-sub
 180             res = transform_radix2(av, coe)
 181
 182             for i, expected in enumerate(res):
 183                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 184             for i, expected in enumerate(res):
 185                 # convert to Power single
 186                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 187                 expected = float(expected)
 188                 actual = float(sim.fpr(i))
 189                 # approximate error calculation, good enough test
 190                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 191                 # and the rounding is different
 192                 err = abs(actual - expected) / expected
 193                 self.assertTrue(err < 1e-7)
 194
 195     def test_sv_remap_fpmadds_fft_svstep(self):
 196         """>>> lst = SVP64Asm( [
 197                             "svshape 8, 1, 1, 1, 1",
 198                              "svremap 31, 1, 0, 2, 0, 1",
 199                             "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 200                             "setvl. 0, 0, 0, 1, 0, 0",
 201                             "bc 4, 2, -16"
 202                             ])
 203             runs a full in-place O(N log2 N) butterfly schedule for
 204             Discrete Fourier Transform.  this version however uses
 205             SVP64 "Vertical-First" Mode and so needs an explicit
 206             branch, testing CR0.
 207
 208             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 209             (3 inputs, 2 outputs)
 210         """
 211         lst = SVP64Asm( [
 212                         "svshape 8, 1, 1, 1, 1",
 213                          "svremap 31, 1, 0, 2, 0, 1",
 214                         "sv.ffmadds 0.v, 0.v, 0.v, 8.v",
 215                         "setvl. 0, 0, 0, 1, 0, 0",
 216                         "bc 4, 2, -16"
 217                         ])
 218         lst = list(lst)
 219
 220         # array and coefficients to test
 221         av = [7.0, -9.8, 3.0, -32.3,
 222               -2.0, 5.0, -9.8, 31.3] # array 0..7
 223         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 224
 225         # store in regfile
 226         fprs = [0] * 32
 227         for i, c in enumerate(coe):
 228             fprs[i+8] = fp64toselectable(c)
 229         for i, a in enumerate(av):
 230             fprs[i+0] = fp64toselectable(a)
 231
 232         # set total. err don't know how to calculate how many there are...
 233         # do it manually for now
 234         VL = 0
 235         size = 2
 236         n = len(av)
 237         while size <= n:
 238             halfsize = size // 2
 239             tablestep = n // size
 240             for i in range(0, n, size):
 241                 for j in range(i, i + halfsize):
 242                     VL += 1
 243             size *= 2
 244
 245         # SVSTATE (calculated VL)
 246         svstate = SVP64State()
 247         svstate.vl[0:7] = VL # VL
 248         svstate.maxvl[0:7] = VL # MAXVL
 249         print ("SVSTATE", bin(svstate.spr.asint()))
 250
 251         with Program(lst, bigendian=False) as program:
 252             sim = self.run_tst_program(program, svstate=svstate,
 253                                        initial_fprs=fprs)
 254             print ("spr svshape0", sim.spr['SVSHAPE0'])
 255             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 256             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 257             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 258             print ("spr svshape1", sim.spr['SVSHAPE1'])
 259             print ("spr svshape2", sim.spr['SVSHAPE2'])
 260             print ("spr svshape3", sim.spr['SVSHAPE3'])
 261
 262             # work out the results with the twin mul/add-sub
 263             res = transform_radix2(av, coe)
 264
 265             for i, expected in enumerate(res):
 266                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 267             for i, expected in enumerate(res):
 268                 # convert to Power single
 269                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 270                 expected = float(expected)
 271                 actual = float(sim.fpr(i))
 272                 # approximate error calculation, good enough test
 273                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 274                 # and the rounding is different
 275                 err = abs(actual - expected) / expected
 276                 self.assertTrue(err < 1e-7)
 277
 278     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 279         """>>> lst = SVP64Asm( [
 280                         "svshape 8, 1, 1, 1, 1",
 281                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 282                          "svremap 5, 1, 0, 2, 0, 0",
 283                          "sv.fmuls 24, 0.v, 8.v",
 284                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 285                          "svremap 26, 0, 0, 0, 0, 1",
 286                         "sv.ffadds 0.v, 24, 0.v",
 287                         "setvl. 0, 0, 0, 1, 0, 0",
 288                         "bc 4, 2, -28"
 289                             ])
 290
 291             runs a full in-place O(N log2 N) butterfly schedule for
 292             Discrete Fourier Transform.  also uses "Vertical First"
 293             but also uses temporary scalars and ffadds rather than
 294             sv.ffmadds.
 295
 296             this represents an incremental step towards complex FFT
 297
 298             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 299
 300             * single fmuls FRT, FRA, FRC
 301             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 302               (FRS is implicit / hidden in ff* operations)
 303
 304             multiply:                         # sv.fmuls FRT, FRA, FRC
 305                 temp1 = vec[jh] * exptable[k]
 306                 temp2 = vec[jl]
 307             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 308                 vec[jh] = temp2 - temp1
 309                 vec[jl] = temp2 + temp1
 310         """
 311         lst = SVP64Asm( [
 312                         "svshape 8, 1, 1, 1, 1",
 313                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 314                          "svremap 5, 1, 0, 2, 0, 0",
 315                          "sv.fmuls 24, 0.v, 8.v",
 316                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 317                          "svremap 26, 0, 0, 0, 0, 1",
 318                         "sv.ffadds 0.v, 24, 0.v",
 319                         "setvl. 0, 0, 0, 1, 0, 0",
 320                         "bc 4, 2, -28"
 321                         ])
 322         lst = list(lst)
 323
 324         # array and coefficients to test
 325         av = [7.0, -9.8, 3.0, -32.3,
 326               -2.0, 5.0, -9.8, 31.3] # array 0..7
 327         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 328
 329         # store in regfile
 330         fprs = [0] * 32
 331         for i, c in enumerate(coe):
 332             fprs[i+8] = fp64toselectable(c)
 333         for i, a in enumerate(av):
 334             fprs[i+0] = fp64toselectable(a)
 335
 336         # set total. err don't know how to calculate how many there are...
 337         # do it manually for now
 338         VL = 0
 339         size = 2
 340         n = len(av)
 341         while size <= n:
 342             halfsize = size // 2
 343             tablestep = n // size
 344             for i in range(0, n, size):
 345                 for j in range(i, i + halfsize):
 346                     VL += 1
 347             size *= 2
 348
 349         # SVSTATE (calculated VL)
 350         svstate = SVP64State()
 351         svstate.vl[0:7] = VL # VL
 352         svstate.maxvl[0:7] = VL # MAXVL
 353         print ("SVSTATE", bin(svstate.spr.asint()))
 354
 355         with Program(lst, bigendian=False) as program:
 356             sim = self.run_tst_program(program, svstate=svstate,
 357                                        initial_fprs=fprs)
 358             print ("spr svshape0", sim.spr['SVSHAPE0'])
 359             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 360             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 361             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 362             print ("spr svshape1", sim.spr['SVSHAPE1'])
 363             print ("spr svshape2", sim.spr['SVSHAPE2'])
 364             print ("spr svshape3", sim.spr['SVSHAPE3'])
 365
 366             # work out the results with the twin mul/add-sub
 367             res = transform_radix2(av, coe)
 368
 369             for i, expected in enumerate(res):
 370                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 371             for i, expected in enumerate(res):
 372                 # convert to Power single
 373                 expected = DOUBLE2SINGLE(fp64toselectable(expected))
 374                 expected = float(expected)
 375                 actual = float(sim.fpr(i))
 376                 # approximate error calculation, good enough test
 377                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 378                 # and the rounding is different
 379                 err = abs(actual - expected) / expected
 380                 self.assertTrue(err < 1e-7)
 381
 382     def test_sv_fpmadds_fft(self):
 383         """>>> lst = ["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 384                         ]
 385             four in-place vector mul-adds, four in-place vector mul-subs
 386
 387             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 388             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 389
 390             there is the *option* to target a different location (non-in-place)
 391             just in case.
 392
 393             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 394             FRS to perform the two multiplies.  one add, one subtract.
 395
 396             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 397                 fmadds  FRT   , FRA, FRC, FRA
 398                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 399         """
 400         lst = SVP64Asm(["sv.ffmadds 2.v, 2.v, 2.v, 10.v"
 401                         ])
 402         lst = list(lst)
 403
 404         fprs = [0] * 32
 405         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 406         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 407         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 408         res = []
 409         # work out the results with the twin mul/add-sub
 410         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 411             fprs[i+2] = fp64toselectable(a)
 412             fprs[i+6] = fp64toselectable(b)
 413             fprs[i+10] = fp64toselectable(c)
 414             mul = a * c
 415             t = b + mul
 416             u = b - mul
 417             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 418             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 419             res.append((t, u))
 420             print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 421
 422         # SVSTATE (in this case, VL=2)
 423         svstate = SVP64State()
 424         svstate.vl[0:7] = 4 # VL
 425         svstate.maxvl[0:7] = 4 # MAXVL
 426         print ("SVSTATE", bin(svstate.spr.asint()))
 427
 428         with Program(lst, bigendian=False) as program:
 429             sim = self.run_tst_program(program, svstate=svstate,
 430                                        initial_fprs=fprs)
 431             # confirm that the results are as expected
 432             for i, (t, u) in enumerate(res):
 433                 self.assertEqual(sim.fpr(i+2), t)
 434                 self.assertEqual(sim.fpr(i+6), u)
 435
 436     def test_sv_ffadds_fft(self):
 437         """>>> lst = ["sv.ffadds 2.v, 2.v, 2.v"
 438                         ]
 439             four in-place vector adds, four in-place vector subs
 440
 441             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 442             FRS to perform the two multiplies.  one add, one subtract.
 443
 444             sv.ffadds FRT, FRA, FRB  actually does:
 445                 fadds FRT   , FRB, FRA
 446                 fsubs FRT+vl, FRA, FRB+vl
 447         """
 448         lst = SVP64Asm(["sv.ffadds 2.v, 2.v, 2.v"
 449                         ])
 450         lst = list(lst)
 451
 452         fprs = [0] * 32
 453         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 454         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 455         res = []
 456         # work out the results with the twin add-sub
 457         for i, (a, b) in enumerate(zip(av, bv)):
 458             fprs[i+2] = fp64toselectable(a)
 459             fprs[i+6] = fp64toselectable(b)
 460             t = b + a
 461             u = b - a
 462             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 463             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 464             res.append((t, u))
 465             print ("FFT", i, "in", a, b, "res", t, u)
 466
 467         # SVSTATE (in this case, VL=2)
 468         svstate = SVP64State()
 469         svstate.vl[0:7] = 4 # VL
 470         svstate.maxvl[0:7] = 4 # MAXVL
 471         print ("SVSTATE", bin(svstate.spr.asint()))
 472
 473         with Program(lst, bigendian=False) as program:
 474             sim = self.run_tst_program(program, svstate=svstate,
 475                                        initial_fprs=fprs)
 476             # confirm that the results are as expected
 477             for i, (t, u) in enumerate(res):
 478                 a = float(sim.fpr(i+2))
 479                 b = float(sim.fpr(i+6))
 480                 t = float(t)
 481                 u = float(u)
 482                 print ("FFT", i, "in", a, b, "res", t, u)
 483             for i, (t, u) in enumerate(res):
 484                 self.assertEqual(sim.fpr(i+2), t)
 485                 self.assertEqual(sim.fpr(i+6), u)
 486
 487     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 488         """
 489             runs a full in-place O(N log2 N) butterfly schedule for
 490             Discrete Fourier Transform.  this version however uses
 491             SVP64 "Vertical-First" Mode and so needs an explicit
 492             branch, testing CR0.
 493
 494             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 495             (3 inputs, 2 outputs)
 496
 497             complex calculation (FFT):
 498
 499                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 500                 vec_r[jh] = vec_r[jl] - tpre
 501                 vec_r[jl] += tpre
 502
 503                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 504                 vec_i[jh] = vec_i[jl] - tpim
 505                 vec_i[jl] += tpim
 506
 507             real-only calculation (DFT):
 508
 509                 temp1 = vec[jh] * exptable[k]
 510                 temp2 = vec[jl]
 511                 vec[jh] = temp2 - temp1
 512                 vec[jl] = temp2 + temp1
 513         """
 514         lst = SVP64Asm( [
 515                         # set triple butterfly mode
 516                         "svshape 8, 1, 1, 1, 1",
 517                         # tpre
 518                         "svremap 5, 1, 0, 2, 0, 0",
 519                         "sv.fmuls 24, 0.v, 16.v",    # mul1_r = r*cos_r
 520                         "svremap 5, 1, 0, 2, 0, 0",
 521                         "sv.fmuls 25, 8.v, 20.v",    # mul2_r = i*sin_i
 522                         "fadds 24, 24, 25",          # tpre = mul1_r + mul2_r
 523                         # tpim
 524                          "svremap 5, 1, 0, 2, 0, 0",
 525                         "sv.fmuls 26, 0.v, 20.v",    # mul1_i = r*sin_i
 526                          "svremap 5, 1, 0, 2, 0, 0",
 527                         "sv.fmuls 27, 8.v, 16.v",    # mul2_i = i*cos_r
 528                         "fsubs 26, 27, 26",          # tpim = mul2_i - mul1_i
 529                         # vec_r jh/jl
 530                          "svremap 26, 0, 0, 0, 0, 1",
 531                         "sv.ffadds 0.v, 24, 0.v",    # vh/vl +/- tpre
 532                         # vec_i jh/jl
 533                          "svremap 26, 0, 0, 0, 0, 1",
 534                         "sv.ffadds 8.v, 26, 8.v",    # vh/vl +- tpim
 535
 536                         # svstep loop
 537                         "setvl. 0, 0, 0, 1, 0, 0",
 538                         "bc 4, 2, -84"
 539                         ])
 540         lst = list(lst)
 541
 542         # array and coefficients to test
 543         ar = [7.0, -9.8, 3.0, -32.3,
 544               -2.0, 5.0, -9.8, 31.3] # array 0..7 real
 545         ai = [1.0, -1.8, 3.0, 19.3,
 546               4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
 547         coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
 548         coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
 549
 550         # store in regfile
 551         fprs = [0] * 64
 552         for i, a in enumerate(ar):
 553             fprs[i+0] = fp64toselectable(a)
 554         for i, a in enumerate(ai):
 555             fprs[i+8] = fp64toselectable(a)
 556         for i, cr in enumerate(coer):
 557             fprs[i+16] = fp64toselectable(cr)
 558         for i, ci in enumerate(coei):
 559             fprs[i+20] = fp64toselectable(ci)
 560
 561         # set total. err don't know how to calculate how many there are...
 562         # do it manually for now
 563         VL = 0
 564         size = 2
 565         n = len(ar)
 566         while size <= n:
 567             halfsize = size // 2
 568             tablestep = n // size
 569             for i in range(0, n, size):
 570                 for j in range(i, i + halfsize):
 571                     VL += 1
 572             size *= 2
 573
 574         # SVSTATE (calculated VL)
 575         svstate = SVP64State()
 576         svstate.vl[0:7] = VL # VL
 577         svstate.maxvl[0:7] = VL # MAXVL
 578         print ("SVSTATE", bin(svstate.spr.asint()))
 579
 580         with Program(lst, bigendian=False) as program:
 581             sim = self.run_tst_program(program, svstate=svstate,
 582                                        initial_fprs=fprs)
 583             print ("spr svshape0", sim.spr['SVSHAPE0'])
 584             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 585             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 586             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 587             print ("spr svshape1", sim.spr['SVSHAPE1'])
 588             print ("spr svshape2", sim.spr['SVSHAPE2'])
 589             print ("spr svshape3", sim.spr['SVSHAPE3'])
 590
 591             # work out the results with the twin mul/add-sub, explicit
 592             # complex numbers
 593             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 594
 595             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 596                 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 597                        "expected_r", expected_r,
 598                        "expected_i", expected_i)
 599             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 600                 # convert to Power single
 601                 expected_r = DOUBLE2SINGLE(fp64toselectable(expected_r ))
 602                 expected_r = float(expected_r)
 603                 actual_r = float(sim.fpr(i))
 604                 # approximate error calculation, good enough test
 605                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 606                 # and the rounding is different
 607                 err = abs(actual_r - expected_r ) / expected_r
 608                 self.assertTrue(err < 1e-6)
 609                 # convert to Power single
 610                 expected_i = DOUBLE2SINGLE(fp64toselectable(expected_i ))
 611                 expected_i = float(expected_i)
 612                 actual_i = float(sim.fpr(i+8))
 613                 # approximate error calculation, good enough test
 614                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 615                 # and the rounding is different
 616                 err = abs(actual_i - expected_i ) / expected_i
 617                 self.assertTrue(err < 1e-6)
 618
 619     def test_sv_ffadds_fft_scalar(self):
 620         """>>> lst = ["sv.ffadds 2.v, 12, 13"
 621                         ]
 622             four in-place vector adds and subs, but done with a scalar
 623             pair (fp12, fp13)
 624         """
 625         lst = SVP64Asm(["sv.ffadds 2.v, 12, 13"
 626                         ])
 627         lst = list(lst)
 628
 629         fprs = [0] * 32
 630         scalar_a = 1.3
 631         scalar_b = -2.0
 632         fprs[12] = fp64toselectable(scalar_a)
 633         fprs[13] = fp64toselectable(scalar_b)
 634         res = []
 635         # work out the results with the twin add-sub
 636         for i in range(4):
 637             t = scalar_b + scalar_a
 638             u = scalar_b - scalar_a
 639             t = DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 640             u = DOUBLE2SINGLE(fp64toselectable(u)) # from double
 641             res.append((t, u))
 642             print ("FFT", i, "res", t, u)
 643
 644         # SVSTATE (in this case, VL=2)
 645         svstate = SVP64State()
 646         svstate.vl[0:7] = 4 # VL
 647         svstate.maxvl[0:7] = 4 # MAXVL
 648         print ("SVSTATE", bin(svstate.spr.asint()))
 649
 650         with Program(lst, bigendian=False) as program:
 651             sim = self.run_tst_program(program, svstate=svstate,
 652                                        initial_fprs=fprs)
 653             # confirm that the results are as expected
 654             for i, (t, u) in enumerate(res):
 655                 a = float(sim.fpr(i+2))
 656                 b = float(sim.fpr(i+6))
 657                 t = float(t)
 658                 u = float(u)
 659                 print ("FFT", i, "in", a, b, "res", t, u)
 660             for i, (t, u) in enumerate(res):
 661                 self.assertEqual(sim.fpr(i+2), t)
 662                 self.assertEqual(sim.fpr(i+6), u)
 663
 664     def run_tst_program(self, prog, initial_regs=None,
 665                               svstate=None,
 666                               initial_mem=None,
 667                               initial_fprs=None):
 668         if initial_regs is None:
 669             initial_regs = [0] * 32
 670         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 671                                                 initial_fprs=initial_fprs,
 672                                                 svstate=svstate)
 673
 674         print ("GPRs")
 675         simulator.gpr.dump()
 676         print ("FPRs")
 677         simulator.fpr.dump()
 678
 679         return simulator
 680
 681
# script entry point: run every test case in this module via unittest
if __name__ == "__main__":
    unittest.main()