src/openpower/decoder/isa/test_caller_svp64_fft.py

   1 from nmigen import Module, Signal
   2 from nmigen.back.pysim import Simulator, Delay, Settle
   3 from nmutil.formaltest import FHDLTestCase
   4 import unittest
   5 from openpower.decoder.power_decoder import (create_pdecode)
   6 from openpower.simulator.program import Program
   7 from openpower.decoder.isa.caller import SVP64State
   8 from openpower.decoder.selectable_int import SelectableInt
   9 from openpower.decoder.isa.test_caller import run_tst
  10 from openpower.sv.trans.svp64 import SVP64Asm
  11 from copy import deepcopy
  12 from openpower.decoder.helpers import fp64toselectable, SINGLE
  13 from openpower.decoder.isafunctions.double2single import ISACallerFnHelper
  14
  15 # really bad hack.  need to access the DOUBLE2SINGLE function auto-generated
  16 # from pseudo-code.
  17 fph = ISACallerFnHelper(XLEN=64)
  18
  19
  20 def transform_radix2(vec, exptable, reverse=False):
  21     """
  22     # FFT and convolution test (Python), based on Project Nayuki
  23     #
  24     # Copyright (c) 2020 Project Nayuki. (MIT License)
  25     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  26
  27     """
  28     # bits of the integer 'val'.
  29     def reverse_bits(val, width):
  30         result = 0
  31         for _ in range(width):
  32             result = (result << 1) | (val & 1)
  33             val >>= 1
  34         return result
  35
  36     # Initialization
  37     n = len(vec)
  38     levels = n.bit_length() - 1
  39
  40     # Copy with bit-reversed permutation
  41     if reverse:
  42         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  43
  44     size = 2
  45     while size <= n:
  46         halfsize = size // 2
  47         tablestep = n // size
  48         for i in range(0, n, size):
  49             k = 0
  50             for j in range(i, i + halfsize):
  51                 # exact same actual computation, just embedded in
  52                 # triple-nested for-loops
  53                 jl, jh = j, j+halfsize
  54                 vjh = vec[jh]
  55                 temp1 = vec[jh] * exptable[k]
  56                 temp2 = vec[jl]
  57                 vec[jh] = temp2 - temp1
  58                 vec[jl] = temp2 + temp1
  59                 print ("xform jl jh k", jl, jh, k,
  60                        "vj vjh ek", temp2, vjh, exptable[k],
  61                        "t1, t2", temp1, temp2,
  62                        "v[jh] v[jl]", vec[jh], vec[jl])
  63                 k += tablestep
  64         size *= 2
  65
  66     return vec
  67
  68
  69 def transform_radix2_complex(vec_r, vec_i, cos_r, sin_i, reverse=False):
  70     """
  71     # FFT and convolution test (Python), based on Project Nayuki
  72     #
  73     # Copyright (c) 2020 Project Nayuki. (MIT License)
  74     # https://www.nayuki.io/page/free-small-fft-in-multiple-languages
  75
  76     """
  77     # bits of the integer 'val'.
  78     def reverse_bits(val, width):
  79         result = 0
  80         for _ in range(width):
  81             result = (result << 1) | (val & 1)
  82             val >>= 1
  83         return result
  84
  85     # Initialization
  86     n = len(vec_r)
  87     levels = n.bit_length() - 1
  88
  89     # Copy with bit-reversed permutation
  90     if reverse:
  91         vec = [vec[reverse_bits(i, levels)] for i in range(n)]
  92
  93     size = 2
  94     while size <= n:
  95         halfsize = size // 2
  96         tablestep = n // size
  97         for i in range(0, n, size):
  98             k = 0
  99             for j in range(i, i + halfsize):
 100                 # exact same actual computation, just embedded in
 101                 # triple-nested for-loops
 102                 jl, jh = j, j+halfsize
 103
 104                 print ("xform jl jh k", jl, jh, k,
 105                         "vr h l", vec_r[jh], vec_r[jl],
 106                         "vi h l", vec_i[jh], vec_i[jl])
 107                 print ("    cr k", cos_r[k], "si k", sin_i[k])
 108                 mul1_r =  vec_r[jh] * cos_r[k]
 109                 mul2_r = vec_i[jh] * sin_i[k]
 110                 tpre =  mul1_r + mul2_r
 111                 print ("        vec_r[jh] * cos_r[k]", mul1_r)
 112                 print ("        vec_i[jh] * sin_i[k]", mul2_r)
 113                 print ("    tpre", tpre)
 114                 mul1_i = vec_r[jh] * sin_i[k]
 115                 mul2_i = vec_i[jh] * cos_r[k]
 116                 tpim = -mul1_i + mul2_i
 117                 print ("        vec_r[jh] * sin_i[k]", mul1_i)
 118                 print ("        vec_i[jh] * cos_r[k]", mul2_i)
 119                 print ("    tpim", tpim)
 120                 vec_r[jh] = vec_r[jl] - tpre
 121                 vec_i[jh] = vec_i[jl] - tpim
 122                 vec_r[jl] += tpre
 123                 vec_i[jl] += tpim
 124
 125                 print ("    xform jl jh k", jl, jh, k,
 126                         "\n       vr h l", vec_r[jh], vec_r[jl],
 127                         "\n       vi h l", vec_i[jh], vec_i[jl])
 128                 k += tablestep
 129         size *= 2
 130
 131     return vec_r, vec_i
 132
 133
 134 class FFTTestCase(FHDLTestCase):
 135
 136     def _check_regs(self, sim, expected):
 137         for i in range(32):
 138             self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))
 139
 140     def test_sv_remap_fpmadds_fft(self):
 141         """>>> lst = ["svshape 8, 1, 1, 1, 0",
 142                      "svremap 31, 1, 0, 2, 0, 1, 0",
 143                       "sv.ffmadds *2, *2, *2, *10"
 144                      ]
 145             runs a full in-place O(N log2 N) butterfly schedule for
 146             Discrete Fourier Transform.
 147
 148             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 149             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 150
 151             there is the *option* to target a different location (non-in-place)
 152             just in case.
 153
 154             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 155             (3 inputs, 2 outputs)
 156         """
 157         lst = SVP64Asm( ["svshape 8, 1, 1, 1, 0",
 158                          "svremap 31, 1, 0, 2, 0, 1, 0",
 159                         "sv.ffmadds *0, *0, *0, *8"
 160                         ])
 161         lst = list(lst)
 162
 163         # array and coefficients to test
 164         av = [7.0, -9.8, 3.0, -32.3,
 165               -2.0, 5.0, -9.8, 31.3] # array 0..7
 166         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 167
 168         # store in regfile
 169         fprs = [0] * 32
 170         for i, c in enumerate(coe):
 171             fprs[i+8] = fp64toselectable(c)
 172         for i, a in enumerate(av):
 173             fprs[i+0] = fp64toselectable(a)
 174
 175         with Program(lst, bigendian=False) as program:
 176             sim = self.run_tst_program(program, initial_fprs=fprs)
 177             print ("spr svshape0", sim.spr['SVSHAPE0'])
 178             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 179             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 180             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 181             print ("spr svshape1", sim.spr['SVSHAPE1'])
 182             print ("spr svshape2", sim.spr['SVSHAPE2'])
 183             print ("spr svshape3", sim.spr['SVSHAPE3'])
 184
 185             # work out the results with the twin mul/add-sub
 186             res = transform_radix2(av, coe)
 187
 188             for i, expected in enumerate(res):
 189                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 190             for i, expected in enumerate(res):
 191                 # convert to Power single
 192                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 193                 expected = float(expected)
 194                 actual = float(sim.fpr(i))
 195                 # approximate error calculation, good enough test
 196                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 197                 # and the rounding is different
 198                 err = abs(actual - expected) / expected
 199                 self.assertTrue(err < 1e-7)
 200
 201     def test_sv_remap_fpmadds_fft_svstep(self):
 202         """>>> lst = SVP64Asm( [
 203                             "svshape 8, 1, 1, 1, 1",
 204                              "svremap 31, 1, 0, 2, 0, 1, 0",
 205                             "sv.ffmadds *0, *0, *0, *8",
 206                             "setvl. 0, 0, 1, 1, 0, 0",
 207                             "bc 6, 3, -16"
 208                             ])
 209             runs a full in-place O(N log2 N) butterfly schedule for
 210             Discrete Fourier Transform.  this version however uses
 211             SVP64 "Vertical-First" Mode and so needs an explicit
 212             branch, testing CR0.
 213
 214             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 215             (3 inputs, 2 outputs)
 216         """
 217         lst = SVP64Asm( [
 218                         "svshape 8, 1, 1, 1, 1",
 219                          "svremap 31, 1, 0, 2, 0, 1, 0",
 220                         "sv.ffmadds *0, *0, *0, *8",
 221                         "setvl. 0, 0, 1, 1, 0, 0",
 222                         "bc 6, 3, -16"
 223                         ])
 224         lst = list(lst)
 225
 226         # array and coefficients to test
 227         av = [7.0, -9.8, 3.0, -32.3,
 228               -2.0, 5.0, -9.8, 31.3] # array 0..7
 229         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 230
 231         # store in regfile
 232         fprs = [0] * 32
 233         for i, c in enumerate(coe):
 234             fprs[i+8] = fp64toselectable(c)
 235         for i, a in enumerate(av):
 236             fprs[i+0] = fp64toselectable(a)
 237
 238         # set total. err don't know how to calculate how many there are...
 239         # do it manually for now
 240         VL = 0
 241         size = 2
 242         n = len(av)
 243         while size <= n:
 244             halfsize = size // 2
 245             tablestep = n // size
 246             for i in range(0, n, size):
 247                 for j in range(i, i + halfsize):
 248                     VL += 1
 249             size *= 2
 250
 251         # SVSTATE (calculated VL)
 252         svstate = SVP64State()
 253         svstate.vl = VL # VL
 254         svstate.maxvl = VL # MAXVL
 255         print ("SVSTATE", bin(svstate.asint()))
 256
 257         with Program(lst, bigendian=False) as program:
 258             sim = self.run_tst_program(program, svstate=svstate,
 259                                        initial_fprs=fprs)
 260             print ("spr svshape0", sim.spr['SVSHAPE0'])
 261             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 262             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 263             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 264             print ("spr svshape1", sim.spr['SVSHAPE1'])
 265             print ("spr svshape2", sim.spr['SVSHAPE2'])
 266             print ("spr svshape3", sim.spr['SVSHAPE3'])
 267
 268             # work out the results with the twin mul/add-sub
 269             res = transform_radix2(av, coe)
 270
 271             for i, expected in enumerate(res):
 272                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 273             for i, expected in enumerate(res):
 274                 # convert to Power single
 275                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 276                 expected = float(expected)
 277                 actual = float(sim.fpr(i))
 278                 # approximate error calculation, good enough test
 279                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 280                 # and the rounding is different
 281                 err = abs(actual - expected) / expected
 282                 self.assertTrue(err < 1e-7)
 283
 284     def test_sv_remap_fpmadds_fft_svstep_scalar_temp(self):
 285         """>>> lst = SVP64Asm( [
 286                         "svshape 8, 1, 1, 1, 1",
 287                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 288                          "svremap 5, 1, 0, 2, 0, 0, 1",
 289                          "sv.fmuls 24, *0, *8",
 290                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 291                          "svremap 26, 0, 0, 0, 0, 1, 1",
 292                         "sv.ffadds *0, 24, *0",
 293                         "setvl. 0, 0, 1, 1, 0, 0",
 294                         "bc 6, 3, -28"
 295                             ])
 296
 297             runs a full in-place O(N log2 N) butterfly schedule for
 298             Discrete Fourier Transform.  also uses "Vertical First"
 299             but also uses temporary scalars and ffadds rather than
 300             sv.ffmadds.
 301
 302             this represents an incremental step towards complex FFT
 303
 304             SVP64 "REMAP" in Butterfly Mode is applied to two instructions:
 305
 306             * single fmuls FRT, FRA, FRC
 307             * twin in-place ffadds +/- ADD/SUB (2 inputs, 2 outputs)
 308               (FRS is implicit / hidden in ff* operations)
 309
 310             multiply:                         # sv.fmuls FRT, FRA, FRC
 311                 temp1 = vec[jh] * exptable[k]
 312                 temp2 = vec[jl]
 313             twin-add:                         # sv.ffadds FRT(/FRS), FRA, FRB
 314                 vec[jh] = temp2 - temp1
 315                 vec[jl] = temp2 + temp1
 316
 317             also see notes in complex fft test: here svremap is done in
 318             "non-persistent" mode (as a demo) whereas in the complex fft
 319             svremap is used in "persistent" mode, where by a complete
 320             coincidence the REMAP arguments all happen to line up and
 321             only one persistent svremap is needed.  the exact same trick
 322             *could* be applied here but for illustrative purposes it is not.
 323         """
 324         lst = SVP64Asm( [
 325                         "svshape 8, 1, 1, 1, 1",
 326                          # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 327                          "svremap 5, 1, 0, 2, 0, 0, 0",
 328                          "sv.fmuls 24, *0, *8",
 329                          # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 330                          "svremap 26, 0, 0, 0, 0, 1, 0",
 331                         "sv.ffadds *0, 24, *0",
 332                         "setvl. 0, 0, 1, 1, 0, 0",
 333                         "bc 6, 3, -28"
 334                         ])
 335         lst = list(lst)
 336
 337         # array and coefficients to test
 338         av = [7.0, -9.8, 3.0, -32.3,
 339               -2.0, 5.0, -9.8, 31.3] # array 0..7
 340         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 341
 342         # store in regfile
 343         fprs = [0] * 32
 344         for i, c in enumerate(coe):
 345             fprs[i+8] = fp64toselectable(c)
 346         for i, a in enumerate(av):
 347             fprs[i+0] = fp64toselectable(a)
 348
 349         # set total. err don't know how to calculate how many there are...
 350         # do it manually for now
 351         VL = 0
 352         size = 2
 353         n = len(av)
 354         while size <= n:
 355             halfsize = size // 2
 356             tablestep = n // size
 357             for i in range(0, n, size):
 358                 for j in range(i, i + halfsize):
 359                     VL += 1
 360             size *= 2
 361
 362         # SVSTATE (calculated VL)
 363         svstate = SVP64State()
 364         svstate.vl = VL # VL
 365         svstate.maxvl = VL # MAXVL
 366         print ("SVSTATE", bin(svstate.asint()))
 367
 368         with Program(lst, bigendian=False) as program:
 369             sim = self.run_tst_program(program, svstate=svstate,
 370                                        initial_fprs=fprs)
 371             print ("spr svshape0", sim.spr['SVSHAPE0'])
 372             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 373             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 374             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 375             print ("spr svshape1", sim.spr['SVSHAPE1'])
 376             print ("spr svshape2", sim.spr['SVSHAPE2'])
 377             print ("spr svshape3", sim.spr['SVSHAPE3'])
 378
 379             # work out the results with the twin mul/add-sub
 380             res = transform_radix2(av, coe)
 381
 382             for i, expected in enumerate(res):
 383                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 384             for i, expected in enumerate(res):
 385                 # convert to Power single
 386                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 387                 expected = float(expected)
 388                 actual = float(sim.fpr(i))
 389                 # approximate error calculation, good enough test
 390                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 391                 # and the rounding is different
 392                 err = abs(actual - expected) / expected
 393                 self.assertTrue(err < 1e-7)
 394
 395     def test_sv_fpmadds_fft(self):
 396         """>>> lst = ["sv.ffmadds *2, *2, *2, *10"
 397                         ]
 398             four in-place vector mul-adds, four in-place vector mul-subs
 399
 400             this is the twin "butterfly" mul-add-sub from Cooley-Tukey
 401             https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms
 402
 403             there is the *option* to target a different location (non-in-place)
 404             just in case.
 405
 406             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 407             FRS to perform the two multiplies.  one add, one subtract.
 408
 409             sv.ffmadds FRT, FRA, FRC, FRB  actually does:
 410                 fmadds  FRT   , FRA, FRC, FRA
 411                 fnmsubs FRT+vl, FRA, FRC, FRB+vl
 412
 413         """
 414         lst = SVP64Asm(["sv.ffmadds *2, *2, *2, *10"
 415                         ])
 416         lst = list(lst)
 417
 418         fprs = [0] * 32
 419         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 420         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 421         coe = [-1.0, 4.0, 3.1, 6.2]  # coefficients
 422         res = []
 423         # work out the results with the twin mul/add-sub
 424         for i, (a, b, c) in enumerate(zip(av, bv, coe)):
 425             fprs[i+2] = fp64toselectable(a)
 426             fprs[i+6] = fp64toselectable(b)
 427             fprs[i+10] = fp64toselectable(c)
 428             mul = a * c
 429             t = b + mul
 430             u = b - mul
 431             t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 432             u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
 433             res.append((t, u))
 434             print ("FFT", i, "in", a, b, "coeff", c, "mul", mul, "res", t, u)
 435
 436         # SVSTATE (in this case, VL=2)
 437         svstate = SVP64State()
 438         svstate.vl = 4 # VL
 439         svstate.maxvl = 4 # MAXVL
 440         print ("SVSTATE", bin(svstate.asint()))
 441
 442         with Program(lst, bigendian=False) as program:
 443             sim = self.run_tst_program(program, svstate=svstate,
 444                                        initial_fprs=fprs)
 445             # confirm that the results are as expected
 446             for i, (t, u) in enumerate(res):
 447                 self.assertEqual(sim.fpr(i+2), t)
 448                 self.assertEqual(sim.fpr(i+6), u)
 449
 450     def test_sv_ffadds_fft(self):
 451         """>>> lst = ["sv.ffadds *2, *2, *2"
 452                         ]
 453             four in-place vector adds, four in-place vector subs
 454
 455             SVP64 "FFT" mode will *automatically* offset FRB and an implicit
 456             FRS to perform the two multiplies.  one add, one subtract.
 457
 458             sv.ffadds FRT, FRA, FRB  actually does:
 459                 fadds FRT   , FRB, FRA
 460                 fsubs FRT+vl, FRA, FRB+vl
 461         """
 462         lst = SVP64Asm(["sv.ffadds *2, *2, *2"
 463                         ])
 464         lst = list(lst)
 465
 466         fprs = [0] * 32
 467         av = [7.0, -9.8, 2.0, -32.3] # first half of array 0..3
 468         bv = [-2.0, 2.0, -9.8, 32.3] # second half of array 4..7
 469         res = []
 470         # work out the results with the twin add-sub
 471         for i, (a, b) in enumerate(zip(av, bv)):
 472             fprs[i+2] = fp64toselectable(a)
 473             fprs[i+6] = fp64toselectable(b)
 474             t = b + a
 475             u = b - a
 476             t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 477             u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
 478             res.append((t, u))
 479             print ("FFT", i, "in", a, b, "res", t, u)
 480
 481         # SVSTATE (in this case, VL=2)
 482         svstate = SVP64State()
 483         svstate.vl = 4 # VL
 484         svstate.maxvl = 4 # MAXVL
 485         print ("SVSTATE", bin(svstate.asint()))
 486
 487         with Program(lst, bigendian=False) as program:
 488             sim = self.run_tst_program(program, svstate=svstate,
 489                                        initial_fprs=fprs)
 490             # confirm that the results are as expected
 491             for i, (t, u) in enumerate(res):
 492                 a = float(sim.fpr(i+2))
 493                 b = float(sim.fpr(i+6))
 494                 t = float(t)
 495                 u = float(u)
 496                 print ("FFT", i, "in", a, b, "res", t, u)
 497             for i, (t, u) in enumerate(res):
 498                 self.assertEqual(sim.fpr(i+2), t)
 499                 self.assertEqual(sim.fpr(i+6), u)
 500
 501     def test_sv_remap_fpmadds_fft_svstep_complex(self):
 502         """
 503             runs a full in-place O(N log2 N) butterfly schedule for
 504             Discrete Fourier Transform.  this version however uses
 505             SVP64 "Vertical-First" Mode and so needs an explicit
 506             branch, testing CR0.
 507
 508             SVP64 "REMAP" in Butterfly Mode is applied to a twin +/- FMAC
 509             (3 inputs, 2 outputs)
 510
 511             complex calculation (FFT):
 512
 513                 tpre =  vec_r[jh] * cos_r[k] + vec_i[jh] * sin_i[k]
 514                 vec_r[jh] = vec_r[jl] - tpre
 515                 vec_r[jl] += tpre
 516
 517                 tpim = -vec_r[jh] * sin_i[k] + vec_i[jh] * cos_r[k]
 518                 vec_i[jh] = vec_i[jl] - tpim
 519                 vec_i[jl] += tpim
 520
 521             real-only calculation (DFT):
 522
 523                 temp1 = vec[jh] * exptable[k]
 524                 temp2 = vec[jl]
 525                 vec[jh] = temp2 - temp1
 526                 vec[jl] = temp2 + temp1
 527
 528             note: a rather nice convenience / coincidence. the meaning of
 529             these two instructions is:
 530                 # RA: jh (S1) RB: n/a RC: k (S2) RT: scalar EA: n/a
 531                 "svremap 5, 1, 0, 2, 0, 0, 1",
 532                 # RA: scal RB: jl (S0) RC: n/a RT: jl (S0) EA: jh (S1)
 533                 "svremap 26, 0, 0, 0, 0, 1, 1",
 534
 535             however it turns out that they can be *merged*, and for
 536             the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB)
 537             *ignore* their REMAPs (by definition, because you can't REMAP
 538             scalar operands), and for the second one (sv.ffads) exactly the
 539             right REMAPs are also ignored!
 540
 541             therefore we can merge:
 542                 "svremap 5, 1, 0, 2, 0, 0, 1",
 543                 "svremap 26, 0, 0, 0, 0, 1, 1",
 544             into:
 545                 "svremap 31, 1, 0, 2, 0, 1, 1",
 546             and save one instruction.
 547         """
 548         lst = SVP64Asm( [
 549                         # set triple butterfly mode with persistent "REMAP"
 550                         "svshape 8, 1, 1, 1, 1",
 551                         "svremap 31, 1, 0, 2, 0, 1, 1",
 552                         # tpre
 553                         "sv.fmuls 24, *0, *16",    # mul1_r = r*cos_r
 554                         "sv.fmadds 24, *8, *20, 24", # mul2_r = i*sin_i
 555                                                      # tpre = mul1_r + mul2_r
 556                         # tpim
 557                         "sv.fmuls 26, *0, *20",    # mul1_i = r*sin_i
 558                         "sv.fmsubs 26, *8, *16, 26", # mul2_i = i*cos_r
 559                                                      # tpim = mul2_i - mul1_i
 560                         # vec_r jh/jl
 561                         "sv.ffadds *0, 24, *0",    # vh/vl +/- tpre
 562                         # vec_i jh/jl
 563                         "sv.ffadds *8, 26, *8",    # vh/vl +- tpim
 564
 565                         # svstep loop
 566                         "setvl. 0, 0, 1, 1, 0, 0",
 567                         "bc 6, 3, -56"
 568                         ])
 569         lst = list(lst)
 570
 571         # array and coefficients to test
 572         ar = [7.0, -9.8, 3.0, -32.3,
 573               -2.0, 5.0, -9.8, 31.3] # array 0..7 real
 574         ai = [1.0, -1.8, 3.0, 19.3,
 575               4.0, -2.0, -0.8, 1.3] # array 0..7 imaginary
 576         coer = [-0.25, 0.5, 3.1, 6.2] # coefficients real
 577         coei = [0.21, -0.1, 1.1, -4.0] # coefficients imaginary
 578
 579         # store in regfile
 580         fprs = [0] * 64
 581         for i, a in enumerate(ar):
 582             fprs[i+0] = fp64toselectable(a)
 583         for i, a in enumerate(ai):
 584             fprs[i+8] = fp64toselectable(a)
 585         for i, cr in enumerate(coer):
 586             fprs[i+16] = fp64toselectable(cr)
 587         for i, ci in enumerate(coei):
 588             fprs[i+20] = fp64toselectable(ci)
 589
 590         # set total. err don't know how to calculate how many there are...
 591         # do it manually for now
 592         VL = 0
 593         size = 2
 594         n = len(ar)
 595         while size <= n:
 596             halfsize = size // 2
 597             tablestep = n // size
 598             for i in range(0, n, size):
 599                 for j in range(i, i + halfsize):
 600                     VL += 1
 601             size *= 2
 602
 603         # SVSTATE (calculated VL)
 604         svstate = SVP64State()
 605         svstate.vl = VL # VL
 606         svstate.maxvl = VL # MAXVL
 607         print ("SVSTATE", bin(svstate.asint()))
 608
 609         with Program(lst, bigendian=False) as program:
 610             sim = self.run_tst_program(program, svstate=svstate,
 611                                        initial_fprs=fprs)
 612             print ("spr svshape0", sim.spr['SVSHAPE0'])
 613             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 614             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 615             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 616             print ("spr svshape1", sim.spr['SVSHAPE1'])
 617             print ("spr svshape2", sim.spr['SVSHAPE2'])
 618             print ("spr svshape3", sim.spr['SVSHAPE3'])
 619
 620             # work out the results with the twin mul/add-sub, explicit
 621             # complex numbers
 622             res_r, res_i = transform_radix2_complex(ar, ai, coer, coei)
 623
 624             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 625                 print ("i", i, float(sim.fpr(i)), float(sim.fpr(i+8)),
 626                        "expected_r", expected_r,
 627                        "expected_i", expected_i)
 628             for i, (expected_r, expected_i) in enumerate(zip(res_r, res_i)):
 629                 # convert to Power single
 630                 expected_r = fph.DOUBLE2SINGLE(fp64toselectable(expected_r ))
 631                 expected_r = float(expected_r)
 632                 actual_r = float(sim.fpr(i))
 633                 # approximate error calculation, good enough test
 634                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 635                 # and the rounding is different
 636                 err = abs(actual_r - expected_r ) / expected_r
 637                 self.assertTrue(err < 1e-6)
 638                 # convert to Power single
 639                 expected_i = fph.DOUBLE2SINGLE(fp64toselectable(expected_i ))
 640                 expected_i = float(expected_i)
 641                 actual_i = float(sim.fpr(i+8))
 642                 # approximate error calculation, good enough test
 643                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 644                 # and the rounding is different
 645                 err = abs(actual_i - expected_i ) / expected_i
 646                 self.assertTrue(err < 1e-6)
 647
 648     def test_sv_ffadds_fft_scalar(self):
 649         """>>> lst = ["sv.ffadds *2, 12, 13"
 650                         ]
 651             four in-place vector adds and subs, but done with a scalar
 652             pair (fp12, fp13)
 653         """
 654         lst = SVP64Asm(["sv.ffadds *2, 12, 13"
 655                         ])
 656         lst = list(lst)
 657
 658         fprs = [0] * 32
 659         scalar_a = 1.3
 660         scalar_b = -2.0
 661         fprs[12] = fp64toselectable(scalar_a)
 662         fprs[13] = fp64toselectable(scalar_b)
 663         res = []
 664         # work out the results with the twin add-sub
 665         for i in range(4):
 666             t = scalar_b + scalar_a
 667             u = scalar_b - scalar_a
 668             t = fph.DOUBLE2SINGLE(fp64toselectable(t)) # convert to Power single
 669             u = fph.DOUBLE2SINGLE(fp64toselectable(u)) # from double
 670             res.append((t, u))
 671             print ("FFT", i, "res", t, u)
 672
 673         # SVSTATE (in this case, VL=2)
 674         svstate = SVP64State()
 675         svstate.vl = 4 # VL
 676         svstate.maxvl = 4 # MAXVL
 677         print ("SVSTATE", bin(svstate.asint()))
 678
 679         with Program(lst, bigendian=False) as program:
 680             sim = self.run_tst_program(program, svstate=svstate,
 681                                        initial_fprs=fprs)
 682             # confirm that the results are as expected
 683             for i, (t, u) in enumerate(res):
 684                 a = float(sim.fpr(i+2))
 685                 b = float(sim.fpr(i+6))
 686                 t = float(t)
 687                 u = float(u)
 688                 print ("FFT", i, "in", a, b, "res", t, u)
 689             for i, (t, u) in enumerate(res):
 690                 self.assertEqual(sim.fpr(i+2), t)
 691                 self.assertEqual(sim.fpr(i+6), u)
 692
 693     def test_sv_remap_fpmadds_fft_ldst(self):
 694         """>>>lst = ["setvl 0, 0, 8, 0, 1, 1",
 695                          "sv.lfssh *0, 4(0), 20", # bit-reversed
 696                          "svshape 8, 1, 1, 1, 0",
 697                          "svremap 31, 1, 0, 2, 0, 1, 0",
 698                          "sv.ffmadds *0, *0, *0, *8"
 699
 700             runs a full in-place O(N log2 N) butterfly schedule for
 701             Discrete Fourier Transform, using bit-reversed LD/ST
 702         """
 703         lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0",
 704                          "svremap 1, 0, 0, 0, 0, 0, 0",
 705                          "sv.lfssh *0, 4(0), 20", # shifted
 706                          "svshape 8, 1, 1, 1, 0",
 707                          "svremap 31, 1, 0, 2, 0, 1, 0",
 708                          "sv.ffmadds *0, *0, *0, *8"
 709                         ])
 710         lst = list(lst)
 711
 712         # array and coefficients to test
 713         av = [7.0, -9.8, 3.0, -32.3,
 714               -2.0, 5.0, -9.8, 31.3] # array 0..7
 715         coe = [-0.25, 0.5, 3.1, 6.2] # coefficients
 716
 717         # store in regfile
 718         fprs = [0] * 32
 719         for i, c in enumerate(coe):
 720             fprs[i+8] = fp64toselectable(c)
 721         # store in memory
 722         mem = {}
 723         val = 0
 724         for i, a in enumerate(av):
 725             a = SINGLE(fp64toselectable(a)).value
 726             shift = (i % 2) == 1
 727             if shift == 0:
 728                 val = a
 729             else:
 730                 mem[(i//2)*8] = val | (a << 32)
 731
 732         with Program(lst, bigendian=False) as program:
 733             sim = self.run_tst_program(program, initial_mem=mem,
 734                                                 initial_fprs=fprs)
 735             print ("spr svshape0", sim.spr['SVSHAPE0'])
 736             print ("    xdimsz", sim.spr['SVSHAPE0'].xdimsz)
 737             print ("    ydimsz", sim.spr['SVSHAPE0'].ydimsz)
 738             print ("    zdimsz", sim.spr['SVSHAPE0'].zdimsz)
 739             print ("spr svshape1", sim.spr['SVSHAPE1'])
 740             print ("spr svshape2", sim.spr['SVSHAPE2'])
 741             print ("spr svshape3", sim.spr['SVSHAPE3'])
 742
 743             print ("mem dump")
 744             print (sim.mem.dump())
 745
 746             # work out the results with the twin mul/add-sub,
 747             # note bit-reverse mode requested
 748             res = transform_radix2(av, coe, reverse=True)
 749
 750             for i, expected in enumerate(res):
 751                 print ("i", i, float(sim.fpr(i)), "expected", expected)
 752             for i, expected in enumerate(res):
 753                 # convert to Power single
 754                 expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
 755                 expected = float(expected)
 756                 actual = float(sim.fpr(i))
 757                 # approximate error calculation, good enough test
 758                 # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
 759                 # and the rounding is different
 760                 err = abs(actual - expected) / expected
 761                 self.assertTrue(err < 1e-6)
 762
 763     def run_tst_program(self, prog, initial_regs=None,
 764                               svstate=None,
 765                               initial_mem=None,
 766                               initial_fprs=None):
 767         if initial_regs is None:
 768             initial_regs = [0] * 32
 769         simulator = run_tst(prog, initial_regs, mem=initial_mem,
 770                                                 initial_fprs=initial_fprs,
 771                                                 svstate=svstate)
 772
 773         print ("GPRs")
 774         simulator.gpr.dump()
 775         print ("FPRs")
 776         simulator.fpr.dump()
 777
 778         return simulator
 779
 780
 781 if __name__ == "__main__":
 782     unittest.main()