src/openpower/decoder/isa/remap_dct_yield.py

   1 # DCT "REMAP" scheduler
   2 #
   3 # Modifications made to create an in-place iterative DCT:
   4 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
   5 #
   6 # SPDX: LGPLv3+
   7 #
   8 # Original fastdctlee.py by Nayuki:
   9 # Copyright (c) 2020 Project Nayuki. (MIT License)
  10 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
  11
  12 import math
  13
  14 # bits of the integer 'val'.
  15 def reverse_bits(val, width):
  16     result = 0
  17     for _ in range(width):
  18         result = (result << 1) | (val & 1)
  19         val >>= 1
  20     return result
  21
  22
  23 # iterative version of [recursively-applied] half-rev.
  24 # relies on the list lengths being power-of-two and the fact
  25 # that bit-inversion of a list of binary numbers is the same
  26 # as reversing the order of the list
  27 # this version is dead easy to implement in hardware.
  28 # a big surprise is that the half-reversal can be done with
  29 # such a simple XOR. the inverse operation is slightly trickier
  30 def halfrev2(vec, pre_rev=True):
  31     res = []
  32     for i in range(len(vec)):
  33         if pre_rev:
  34             res.append(i ^ (i>>1))
  35         else:
  36             ri = i
  37             bl = i.bit_length()
  38             for ji in range(1, bl):
  39                 ri ^= (i >> ji)
  40             res.append(vec[ri])
  41     return res
  42
  43
  44 # python "yield" can be iterated. use this to make it clear how
  45 # the indices are generated by using natural-looking nested loops
  46 def iterate_dct_inner_halfswap_loadstore(SVSHAPE):
  47     # get indices to iterate over, in the required order
  48     n = SVSHAPE.lims[0]
  49     mode = SVSHAPE.lims[1]
  50     print ("inner halfswap loadstore", n, mode, SVSHAPE.skip)
  51
  52     # reference list for not needing to do data-swaps, just swap what
  53     # *indices* are referenced (two levels of indirection at the moment)
  54     # pre-reverse the data-swap list so that it *ends up* in the order 0123..
  55     ji = list(range(n))
  56
  57     levels = n.bit_length() - 1
  58     ji = halfrev2(ji, False)
  59     if False: # swap: TODO, add extra bit-reverse mode
  60         ri = [reverse_bits(i, levels) for i in range(n)]
  61         ji = [ji[ri[i]] for i in range(n)]
  62
  63
  64     # invert order if requested
  65     if SVSHAPE.invxyz[0]:
  66         ji.reverse()
  67
  68     for i, jl in enumerate(ji):
  69         y_end = jl == ji[-1]
  70         yield jl, (0b111 if y_end else 0b000)
  71
  72
  73 # python "yield" can be iterated. use this to make it clear how
  74 # the indices are generated by using natural-looking nested loops
  75 def iterate_dct_inner_costable_indices(SVSHAPE):
  76     # get indices to iterate over, in the required order
  77     n = SVSHAPE.lims[0]
  78     mode = SVSHAPE.lims[1]
  79     print ("inner costable", mode)
  80     # creating lists of indices to iterate over in each dimension
  81     # has to be done dynamically, because it depends on the size
  82     # first, the size-based loop (which can be done statically)
  83     x_r = []
  84     size = 2
  85     while size <= n:
  86         x_r.append(size)
  87         size *= 2
  88     # invert order if requested
  89     if SVSHAPE.invxyz[0]:
  90         x_r.reverse()
  91
  92     if len(x_r) == 0:
  93         return
  94
  95     #print ("ri", ri)
  96     #print ("ji", ji)
  97
  98     # start an infinite (wrapping) loop
  99     skip = 0
 100     z_end = 1 # doesn't exist in this, only 2 loops
 101     k = 0
 102     while True:
 103         for size in x_r:           # loop over 3rd order dimension (size)
 104             x_end = size == x_r[-1]
 105             # y_r schedule depends on size
 106             halfsize = size // 2
 107             y_r = []
 108             for i in range(0, n, size):
 109                 y_r.append(i)
 110             # invert if requested
 111             if SVSHAPE.invxyz[1]: y_r.reverse()
 112             # two lists of half-range indices, e.g. j 0123, jr 7654
 113             j = list(range(0, halfsize))
 114             # invert if requested
 115             if SVSHAPE.invxyz[2]: j_r.reverse()
 116             #print ("xform jr", jr)
 117             # loop over 1st order dimension
 118             for ci, jl in enumerate(j):
 119                 y_end = jl == j[-1]
 120                 # now depending on MODE return the index.  inner butterfly
 121                 if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
 122                     result = k  # offset into COS table
 123                 elif SVSHAPE.skip == 0b10: #
 124                     result = ci # coefficient helper
 125                 elif SVSHAPE.skip == 0b11: #
 126                     result = size # coefficient helper
 127                 loopends = (z_end |
 128                            ((y_end and z_end)<<1) |
 129                             ((y_end and x_end and z_end)<<2))
 130
 131                 yield result + SVSHAPE.offset, loopends
 132                 k += 1
 133
 134 # python "yield" can be iterated. use this to make it clear how
 135 # the indices are generated by using natural-looking nested loops
 136 def iterate_dct_inner_butterfly_indices(SVSHAPE):
 137     # get indices to iterate over, in the required order
 138     n = SVSHAPE.lims[0]
 139     mode = SVSHAPE.lims[1]
 140     #print ("inner butterfly", mode, SVSHAPE.skip)
 141     # creating lists of indices to iterate over in each dimension
 142     # has to be done dynamically, because it depends on the size
 143     # first, the size-based loop (which can be done statically)
 144     x_r = []
 145     size = 2
 146     while size <= n:
 147         x_r.append(size)
 148         size *= 2
 149     # invert order if requested
 150     if SVSHAPE.invxyz[0]:
 151         x_r.reverse()
 152
 153     if len(x_r) == 0:
 154         return
 155
 156     # reference (read/write) the in-place data in *reverse-bit-order*
 157     ri = list(range(n))
 158     if SVSHAPE.submode2 == 0b01:
 159         levels = n.bit_length() - 1
 160         ri = [ri[reverse_bits(i, levels)] for i in range(n)]
 161
 162     # reference list for not needing to do data-swaps, just swap what
 163     # *indices* are referenced (two levels of indirection at the moment)
 164     # pre-reverse the data-swap list so that it *ends up* in the order 0123..
 165     ji = list(range(n))
 166     inplace_mode = SVSHAPE.submode2 == 0b01
 167     #                     and SVSHAPE.skip not in [0b10, 0b11]
 168     if inplace_mode:
 169         #print ("inplace mode")
 170         ji = halfrev2(ji, True)
 171
 172     #print ("ri", ri)
 173     #print ("ji", ji)
 174
 175     # start an infinite (wrapping) loop
 176     skip = 0
 177     k = 0
 178     k_start = 0
 179     while True:
 180         for size in x_r:           # loop over 3rd order dimension (size)
 181             x_end = size == x_r[-1]
 182             # y_r schedule depends on size
 183             halfsize = size // 2
 184             y_r = []
 185             for i in range(0, n, size):
 186                 y_r.append(i)
 187             # invert if requested
 188             if SVSHAPE.invxyz[1]: y_r.reverse()
 189             for i in y_r:       # loop over 2nd order dimension
 190                 y_end = i == y_r[-1]
 191                 # two lists of half-range indices, e.g. j 0123, jr 7654
 192                 j = list(range(i, i + halfsize))
 193                 jr = list(range(i+halfsize, i + size))
 194                 jr.reverse()
 195                 # invert if requested
 196                 if SVSHAPE.invxyz[2]: j_r.reverse()
 197                 hz2 = halfsize // 2 # zero stops reversing 1-item lists
 198                 # if you *really* want to do the in-place swapping manually,
 199                 # this allows you to do it.  good luck...
 200                 if SVSHAPE.submode2 == 0b01 and not inplace_mode:
 201                     #print ("swap mode")
 202                     jr = j_r[:hz2]
 203                 #print ("xform jr", jr)
 204                 # loop over 1st order dimension
 205                 k = k_start
 206                 for ci, (jl, jh) in enumerate(zip(j, jr)):
 207                     z_end = jl == j[-1]
 208                     # now depending on MODE return the index.  inner butterfly
 209                     if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
 210                         result = ri[ji[jl]]        # lower half
 211                     elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]:
 212                         result = ri[ji[jh]] # upper half
 213                     elif mode == 4:
 214                         # COS table pre-generated mode
 215                         if SVSHAPE.skip == 0b10: #
 216                             result = k # cos table offset
 217                     else: # mode 2
 218                         # COS table generated on-demand ("Vertical-First") mode
 219                         if SVSHAPE.skip == 0b10: #
 220                             result = ci # coefficient helper
 221                         elif SVSHAPE.skip == 0b11: #
 222                             result = size # coefficient helper
 223                     loopends = (z_end |
 224                                ((y_end and z_end)<<1) |
 225                                 ((y_end and x_end and z_end)<<2))
 226
 227                     yield result + SVSHAPE.offset, loopends
 228                     k += 1
 229
 230                 # now in-place swap
 231                 if inplace_mode:
 232                     for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
 233                         jlh = jl+halfsize
 234                         #print ("inplace swap", jh, jlh)
 235                         tmp1, tmp2 = ji[jlh], ji[jh]
 236                         ji[jlh], ji[jh] = tmp2, tmp1
 237
 238             # new k_start point for cos tables( runs inside x_r loop NOT i loop)
 239             k_start += halfsize
 240
 241
 242 # python "yield" can be iterated. use this to make it clear how
 243 # the indices are generated by using natural-looking nested loops
 244 def iterate_dct_outer_butterfly_indices(SVSHAPE):
 245     # get indices to iterate over, in the required order
 246     n = SVSHAPE.lims[0]
 247     mode = SVSHAPE.lims[1]
 248     # createing lists of indices to iterate over in each dimension
 249     # has to be done dynamically, because it depends on the size
 250     # first, the size-based loop (which can be done statically)
 251     x_r = []
 252     size = n // 2
 253     while size >= 2:
 254         x_r.append(size)
 255         size //= 2
 256     # invert order if requested
 257     if SVSHAPE.invxyz[0]:
 258         x_r.reverse()
 259
 260     if len(x_r) == 0:
 261         return
 262
 263     #print ("outer butterfly")
 264
 265     # reference (read/write) the in-place data in *reverse-bit-order*
 266     ri = list(range(n))
 267     if SVSHAPE.submode2 == 0b11:
 268         levels = n.bit_length() - 1
 269         ri = [ri[reverse_bits(i, levels)] for i in range(n)]
 270
 271     # reference list for not needing to do data-swaps, just swap what
 272     # *indices* are referenced (two levels of indirection at the moment)
 273     # pre-reverse the data-swap list so that it *ends up* in the order 0123..
 274     ji = list(range(n))
 275     inplace_mode = False # need the space... SVSHAPE.skip in [0b10, 0b11]
 276     if inplace_mode:
 277         #print ("inplace mode", SVSHAPE.skip)
 278         ji = halfrev2(ji, True)
 279
 280     #print ("ri", ri)
 281     #print ("ji", ji)
 282
 283     # start an infinite (wrapping) loop
 284     skip = 0
 285     k = 0
 286     k_start = 0
 287     while True:
 288         for size in x_r:           # loop over 3rd order dimension (size)
 289             halfsize = size//2
 290             x_end = size == x_r[-1]
 291             y_r = list(range(0, halfsize))
 292             #print ("itersum", halfsize, size, y_r)
 293             # invert if requested
 294             if SVSHAPE.invxyz[1]: y_r.reverse()
 295             for i in y_r:       # loop over 2nd order dimension
 296                 y_end = i == y_r[-1]
 297                 # one list to create iterative-sum schedule
 298                 jr = list(range(i+halfsize, i+n-halfsize, size))
 299                 #print ("itersum     jr", i+halfsize, i+size, jr)
 300                 # invert if requested
 301                 if SVSHAPE.invxyz[2]: j_r.reverse()
 302                 hz2 = halfsize // 2 # zero stops reversing 1-item lists
 303                 k = k_start
 304                 for ci, jh in enumerate(jr):   # loop over 1st order dimension
 305                     z_end = jh == jr[-1]
 306                     #print ("     itersum", size, i, jh, jh+size)
 307                     if mode == 4:
 308                         # COS table pre-generated mode
 309                         if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
 310                             result = ri[ji[jh]]        # lower half
 311                         elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]:
 312                             result = ri[ji[jh+size]] # upper half
 313                         elif SVSHAPE.skip == 0b10: #
 314                             result = k # cos table offset
 315                     else:
 316                         # COS table generated on-demand ("Vertical-First") mode
 317                         if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
 318                             result = ri[ji[jh]]        # lower half
 319                         elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]:
 320                             result = ri[ji[jh+size]] # upper half
 321                         elif SVSHAPE.skip == 0b10: #
 322                             result = ci # coefficient helper
 323                         elif SVSHAPE.skip == 0b11: #
 324                             result = size # coefficient helper
 325                     loopends = (z_end |
 326                                ((y_end and z_end)<<1) |
 327                                 ((y_end and x_end and z_end)<<2))
 328
 329                     yield result + SVSHAPE.offset, loopends
 330                     k += 1
 331
 332                 # now in-place swap
 333                 if SVSHAPE.submode2 == 0b11 and inplace_mode:
 334                     j = list(range(i, i + halfsize))
 335                     jr = list(range(i+halfsize, i + size))
 336                     jr.reverse()
 337                     for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
 338                         jlh = jl+halfsize
 339                         #print ("inplace swap", jh, jlh)
 340                         tmp1, tmp2 = ji[jlh], ji[jh]
 341                         ji[jlh], ji[jh] = tmp2, tmp1
 342
 343             # new k_start point for cos tables( runs inside x_r loop NOT i loop)
 344             k_start += halfsize
 345
 346
 347 def pprint_schedule(schedule, n):
 348     size = 2
 349     idx = 0
 350     while size <= n:
 351         halfsize = size // 2
 352         tablestep = n // size
 353         print ("size %d halfsize %d tablestep %d" % \
 354                 (size, halfsize, tablestep))
 355         for i in range(0, n, size):
 356             prefix = "i %d\t" % i
 357             for j in range(i, i + halfsize):
 358                 (jl, je), (jh, he) = schedule[idx]
 359                 print ("  %-3d\t%s j=%-2d jh=%-2d "
 360                         "j[jl=%-2d] j[jh=%-2d]" % \
 361                                 (idx, prefix, j, j+halfsize,
 362                                       jl, jh,
 363                                 ),
 364                                 "end", bin(je)[2:], bin(je)[2:])
 365                 idx += 1
 366         size *= 2
 367
 368 def pprint_schedule_outer(schedule, n):
 369     size = 2
 370     idx = 0
 371     while size <= n//2:
 372         halfsize = size // 2
 373         tablestep = n // size
 374         print ("size %d halfsize %d tablestep %d" % \
 375                 (size, halfsize, tablestep))
 376         y_r = list(range(0, halfsize))
 377         for i in y_r:
 378             prefix = "i %d\t" % i
 379             jr = list(range(i+halfsize, i+n-halfsize, size))
 380             for j in jr:
 381                 (jl, je), (jh, he) = schedule[idx]
 382                 print ("  %-3d\t%s j=%-2d jh=%-2d "
 383                         "j[jl=%-2d] j[jh=%-2d]" % \
 384                                 (idx, prefix, j, j+halfsize,
 385                                       jl, jh,
 386                                 ),
 387                                 "end", bin(je)[2:], bin(je)[2:])
 388                 idx += 1
 389         size *= 2
 390
 391
 392 # totally cool *in-place* DCT algorithm using yield REMAPs
 393 def transform2(vec):
 394
 395     # Initialization
 396     n = len(vec)
 397     print ()
 398     print ("transform2", n)
 399     levels = n.bit_length() - 1
 400
 401     # set up dims
 402     xdim = n
 403
 404     # reference (read/write) the in-place data in *reverse-bit-order*
 405     ri = list(range(n))
 406     ri = [ri[reverse_bits(i, levels)] for i in range(n)]
 407
 408     # and pretend we LDed data in half-swapped *and* bit-reversed order as well
 409     # TODO: merge these two
 410     vec = halfrev2(vec, False)
 411     vec = [vec[ri[i]] for i in range(n)]
 412
 413     # create a cos table: not strictly necessary but here for illustrative
 414     # purposes, to demonstrate the point that it really *is* iterative.
 415     # this table could be cached and used multiple times rather than
 416     # computed every time.
 417     ctable = []
 418     size = n
 419     while size >= 2:
 420         halfsize = size // 2
 421         for ci in range(halfsize):
 422             coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
 423             ctable.append(coeff)
 424             print ("coeff", size,  "ci", ci, "k", len(ctable)-1,
 425                    "i/n", (ci+0.5)/size, coeff)
 426         size //= 2
 427
 428     # set up an SVSHAPE
 429     class SVSHAPE:
 430         pass
 431     # ci schedule
 432     SVSHAPE0 = SVSHAPE()
 433     SVSHAPE0.lims = [xdim, 4, 0]
 434     SVSHAPE0.mode = 0b01
 435     SVSHAPE0.submode2 = 0b01
 436     SVSHAPE0.skip = 0b10
 437     SVSHAPE0.offset = 0       # experiment with different offset, here
 438     SVSHAPE0.invxyz = [1,0,0] # inversion if desired
 439     # size schedule
 440     SVSHAPE1 = SVSHAPE()
 441     SVSHAPE1.lims = [xdim, 4, 0]
 442     SVSHAPE1.mode = 0b01
 443     SVSHAPE1.submode2 = 0b01
 444     SVSHAPE1.skip = 0b11
 445     SVSHAPE1.offset = 0       # experiment with different offset, here
 446     SVSHAPE1.invxyz = [1,0,0] # inversion if desired
 447     # k schedule
 448     SVSHAPE2 = SVSHAPE()
 449     SVSHAPE2.lims = [xdim, 4, 0]
 450     SVSHAPE2.mode = 0b01
 451     SVSHAPE2.submode2 = 0b01
 452     SVSHAPE2.skip = 0b00
 453     SVSHAPE2.offset = 0       # experiment with different offset, here
 454     SVSHAPE2.invxyz = [1,0,0] # inversion if desired
 455
 456     # enumerate over the iterator function, getting new indices
 457     i0 = iterate_dct_inner_costable_indices(SVSHAPE0)
 458     i1 = iterate_dct_inner_costable_indices(SVSHAPE1)
 459     i2 = iterate_dct_inner_costable_indices(SVSHAPE2)
 460     for ((ci, cie), (size, sze), (k, ke)) in \
 461                 zip(i0, i1, i2):
 462         print ("xform2 cos", ci, size, k)
 463         coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
 464         assert coeff == ctable[k]
 465         print ("coeff", size,  "ci", ci, "k", k,
 466                "i/n", (ci+0.5)/size, coeff,
 467                 "end", bin(cie), bin(sze), bin(ke))
 468         if cie == 0b111: # all loops end
 469             break
 470
 471     ################
 472     # INNER butterfly
 473     ################
 474
 475     # j schedule
 476     SVSHAPE0 = SVSHAPE()
 477     SVSHAPE0.lims = [xdim, 0b000001, 0]
 478     SVSHAPE0.mode = 0b01
 479     SVSHAPE0.submode2 = 0b01
 480     SVSHAPE0.skip = 0b00
 481     SVSHAPE0.offset = 0       # experiment with different offset, here
 482     SVSHAPE0.invxyz = [1,0,0] # inversion if desired
 483     # j+halfstep schedule
 484     SVSHAPE1 = SVSHAPE()
 485     SVSHAPE1.lims = [xdim, 0b000001, 0]
 486     SVSHAPE1.mode = 0b01
 487     SVSHAPE1.submode2 = 0b01
 488     SVSHAPE1.skip = 0b01
 489     SVSHAPE1.offset = 0       # experiment with different offset, here
 490     SVSHAPE1.invxyz = [1,0,0] # inversion if desired
 491     # ci schedule
 492     SVSHAPE2 = SVSHAPE()
 493     SVSHAPE2.lims = [xdim, 0b000001, 0]
 494     SVSHAPE2.mode = 0b01
 495     SVSHAPE2.submode2 = 0b01
 496     SVSHAPE2.skip = 0b10
 497     SVSHAPE2.offset = 0       # experiment with different offset, here
 498     SVSHAPE2.invxyz = [1,0,0] # inversion if desired
 499     # size schedule
 500     SVSHAPE3 = SVSHAPE()
 501     SVSHAPE3.lims = [xdim, 0b000001, 0]
 502     SVSHAPE3.mode = 0b01
 503     SVSHAPE3.submode2 = 0b01
 504     SVSHAPE3.skip = 0b11
 505     SVSHAPE3.offset = 0       # experiment with different offset, here
 506     SVSHAPE3.invxyz = [1,0,0] # inversion if desired
 507
 508     # enumerate over the iterator function, getting new indices
 509     i0 = iterate_dct_inner_butterfly_indices(SVSHAPE0)
 510     i1 = iterate_dct_inner_butterfly_indices(SVSHAPE1)
 511     i2 = iterate_dct_inner_butterfly_indices(SVSHAPE2)
 512     i3 = iterate_dct_inner_butterfly_indices(SVSHAPE3)
 513     for k, ((jl, jle), (jh, jhe), (ci, cie), (size, sze)) in \
 514                 enumerate(zip(i0, i1, i2, i3)):
 515         t1, t2 = vec[jl], vec[jh]
 516         print ("xform2", jl, jh, ci, size)
 517         coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
 518         #assert coeff == ctable[k]
 519         vec[jl] = t1 + t2
 520         vec[jh] = (t1 - t2) * (1/coeff)
 521         print ("coeff", size, "ci", ci,
 522                 "jl", jl, "jh", jh,
 523                "i/n", (ci+0.5)/size, coeff, vec[jl],
 524                                             vec[jh],
 525                 "end", bin(jle), bin(jhe))
 526         if jle == 0b111: # all loops end
 527             break
 528
 529     print("transform2 pre-itersum", vec)
 530
 531     # now things are in the right order for the outer butterfly.
 532
 533     # j schedule
 534     SVSHAPE0 = SVSHAPE()
 535     SVSHAPE0.lims = [xdim, 0b0000010, 0]
 536     SVSHAPE0.submode2 = 0b100
 537     SVSHAPE0.mode = 0b01
 538     SVSHAPE0.skip = 0b00
 539     SVSHAPE0.offset = 0       # experiment with different offset, here
 540     SVSHAPE0.invxyz = [0,0,0] # inversion if desired
 541     # j+halfstep schedule
 542     SVSHAPE1 = SVSHAPE()
 543     SVSHAPE1.lims = [xdim, 0b0000010, 0]
 544     SVSHAPE1.mode = 0b01
 545     SVSHAPE1.submode2 = 0b100
 546     SVSHAPE1.skip = 0b01
 547     SVSHAPE1.offset = 0       # experiment with different offset, here
 548     SVSHAPE1.invxyz = [0,0,0] # inversion if desired
 549
 550     # enumerate over the iterator function, getting new indices
 551     i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
 552     i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
 553     for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
 554         print ("itersum    jr", jl, jh,
 555                 "end", bin(jle), bin(jhe))
 556         vec[jl] += vec[jh]
 557         size //= 2
 558         if jle == 0b111: # all loops end
 559             break
 560
 561     print("transform2 result", vec)
 562
 563     return vec
 564
 565
 566 def demo():
 567     # set the dimension sizes here
 568     n = 8
 569     xdim = n
 570     ydim = 0 # not needed
 571     zdim = 0 # again, not needed
 572
 573
 574     ################
 575     # INNER butterfly
 576     ################
 577
 578     # set up an SVSHAPE
 579     class SVSHAPE:
 580         pass
 581     # j schedule
 582     SVSHAPE0 = SVSHAPE()
 583     SVSHAPE0.lims = [xdim, 0b000001, zdim]
 584     SVSHAPE0.submode2 = 0b010
 585     SVSHAPE0.mode = 0b01
 586     SVSHAPE0.skip = 0b00
 587     SVSHAPE0.offset = 0       # experiment with different offset, here
 588     SVSHAPE0.invxyz = [0,0,0] # inversion if desired
 589     # j+halfstep schedule
 590     SVSHAPE1 = SVSHAPE()
 591     SVSHAPE1.lims = [xdim, 0b000001, zdim]
 592     SVSHAPE1.submode2 = 0b010
 593     SVSHAPE1.mode = 0b01
 594     SVSHAPE1.skip = 0b01
 595     SVSHAPE1.offset = 0       # experiment with different offset, here
 596     SVSHAPE1.invxyz = [0,0,0] # inversion if desired
 597
 598     # enumerate over the iterator function, getting new indices
 599     schedule = []
 600     i0 = iterate_dct_inner_butterfly_indices(SVSHAPE0)
 601     i1 = iterate_dct_inner_butterfly_indices(SVSHAPE1)
 602     for idx, (jl, jh) in enumerate(zip(i0, i1)):
 603         schedule.append((jl, jh))
 604         if jl[1] == 0b111: # end
 605             break
 606
 607     # ok now pretty-print the results, with some debug output
 608     print ("inner butterfly")
 609     pprint_schedule(schedule, n)
 610     print ("")
 611
 612     ################
 613     # outer butterfly
 614     ################
 615
 616     # j schedule
 617     SVSHAPE0 = SVSHAPE()
 618     SVSHAPE0.lims = [xdim, 0b000010, zdim]
 619     SVSHAPE0.mode = 0b01
 620     SVSHAPE0.submode2 = 0b100
 621     SVSHAPE0.skip = 0b10
 622     SVSHAPE0.offset = 0       # experiment with different offset, here
 623     SVSHAPE0.invxyz = [1,0,0] # inversion if desired
 624     # j+halfstep schedule
 625     SVSHAPE1 = SVSHAPE()
 626     SVSHAPE1.lims = [xdim, 0b000010, zdim]
 627     SVSHAPE1.mode = 0b01
 628     SVSHAPE1.submode2 = 0b100
 629     SVSHAPE1.skip = 0b11
 630     SVSHAPE1.offset = 0       # experiment with different offset, here
 631     SVSHAPE1.invxyz = [1,0,0] # inversion if desired
 632
 633     # enumerate over the iterator function, getting new indices
 634     schedule = []
 635     i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
 636     i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
 637     for idx, (jl, jh) in enumerate(zip(i0, i1)):
 638         schedule.append((jl, jh))
 639         if jl[1] == 0b111: # end
 640             break
 641
 642     # ok now pretty-print the results, with some debug output
 643     print ("outer butterfly")
 644     pprint_schedule_outer(schedule, n)
 645
 646     # for DCT half-swap LDs
 647     # j schedule
 648     SVSHAPE0 = SVSHAPE()
 649     SVSHAPE0.lims = [xdim, 0b000101, zdim]
 650     SVSHAPE0.mode = 0b01
 651     SVSHAPE0.submode2 = 0
 652     SVSHAPE0.skip = 0
 653     SVSHAPE0.offset = 0       # experiment with different offset, here
 654     SVSHAPE0.invxyz = [0,0,0] # inversion if desired
 655
 656     # expected results
 657     levels = n.bit_length() - 1
 658     avi = list(range(n))
 659     ri = [reverse_bits(i, levels) for i in range(n)]
 660     av = halfrev2(avi, False)
 661     av = [av[ri[i]] for i in range(n)]
 662
 663
 664     i0 = iterate_dct_inner_halfswap_loadstore(SVSHAPE0)
 665     for idx, (jl) in enumerate(i0):
 666         print ("inverse half-swap ld", idx, jl, av[idx])
 667         if jl[1] == 0b111: # end
 668             break
 669
 670
 671 # run the demo
 672 if __name__ == '__main__':
 673     demo()