src/openpower/decoder/isa/fastdctlee.py

   1 #
   2 # Fast discrete cosine transform algorithm (Python)
   3 #
   4 # Modifications made to create an in-place iterative DCT:
   5 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
   6 #
   7 # License for modifications - SPDX: LGPLv3+
   8 #
   9 # Original fastdctlee.py by Nayuki:
  10 # Copyright (c) 2020 Project Nayuki. (MIT License)
  11 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
  12 #
  13 # License for original fastdctlee.py by Nayuki:
  14 #
  15 # Permission is hereby granted, free of charge, to any person obtaining
  16 # a copy of this software and associated documentation files (the
  17 # "Software"), to deal in the Software without restriction, including
  18 # without limitation the rights to use, copy, modify, merge, publish,
  19 # distribute, sublicense, and/or sell copies of the Software, and to
  20 # permit persons to whom the Software is furnished to do so, subject to
  21 # the following conditions:
  22 # - The above copyright notice and this permission notice shall be included in
  23 #   all copies or substantial portions of the Software.
  24 # - The Software is provided "as is", without warranty of any kind, express or
  25 #   implied, including but not limited to the warranties of merchantability,
  26 #   fitness for a particular purpose and noninfringement. In no event shall the
  27 #   authors or copyright holders be liable for any claim, damages or other
  28 #   liability, whether in an action of contract, tort or otherwise,
  29 #   arising from, out of or in connection with the Software or the use
  30 #   or other dealings in the Software.
  31 #
  32 #
  33 # The modifications made are firstly to create an iterative schedule,
  34 # rather than the more normal recursive algorithm.  Secondly, the
  35 # two butterflys are also separated out: inner butterfly does COS +/-
  36 # whilst outer butterfly does the iterative summing.
  37 #
  38 # However, to avoid data copying some additional tricks are played:
  39 # - firstly, the data is LOADed in bit-reversed order (which is normally
  40 #   covered by the recursive algorithm due to the odd-even reconstruction)
  41 #   but then to reference the data in the correct order an array of
  42 #   bit-reversed indices is created, as a level of indirection.
  43 #   the data is bit-reversed but so are the indices, making it all A-Ok.
  44 # - secondly, normally in DCT a 2nd target (copy) array is used where
  45 #   the top half is read in reverse order (7 6 5 4) and written out
  46 #   to the target 4 5 6 7.  the plan was to do this in two stages:
  47 #   write in-place in order 4 5 6 7 then swap afterwards (7-4), (6-5).
  48 #   however by leaving the data *in-place* and having subsequent
  49 #   loops refer to the data *where it now is*, the swap is avoided
  50 # - thirdly, arrange for the data to be *pre-swapped* (in an inverse
  51 #   order of how it would have got there, if that makes sense), such
  52 #   that *when* it gets swapped, it ends up in the right order.
  53 #   given that that will be a LD operation it's no big deal.
  54 #
  55 # End result is that once the first butterfly is done - bear in mind
  56 # it's in-place - the data is in the right order so that a second
  57 # dead-straightforward iterative sum can be done: again, in-place.
  58 # Really surprising.
  59
  60 import math
  61 from copy import deepcopy
  62
  63 # bits of the integer 'val'.
  64 def reverse_bits(val, width):
  65     result = 0
  66     for _ in range(width):
  67         result = (result << 1) | (val & 1)
  68         val >>= 1
  69     return result
  70
  71
  72 # reverse top half of a list, recursively.  the recursion can be
  73 # applied *after* or *before* the reversal of the top half.  these
  74 # are inverses of each other.
  75 # this function is unused except to test the iterative version (halfrev2)
  76 def halfrev(l, pre_rev=True):
  77     n = len(l)
  78     if n == 1:
  79         return l
  80     ll, lh = l[:n//2], l[n//2:]
  81     if pre_rev:
  82         ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
  83     lh.reverse()
  84     if not pre_rev:
  85         ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
  86     return ll + lh
  87
  88
  89 # iterative version of [recursively-applied] half-rev.
  90 # relies on the list lengths being power-of-two and the fact
  91 # that bit-inversion of a list of binary numbers is the same
  92 # as reversing the order of the list
  93 # this version is dead easy to implement in hardware.
  94 # a big surprise is that the half-reversal can be done with
  95 # such a simple XOR. the inverse operation is slightly trickier
  96 def halfrev2(vec, pre_rev=True):
  97     res = []
  98     for i in range(len(vec)):
  99         if pre_rev:
 100             res.append(vec[i ^ (i>>1)])
 101         else:
 102             ri = i
 103             bl = i.bit_length()
 104             for ji in range(1, bl):
 105                 ri ^= (i >> ji)
 106             res.append(vec[ri])
 107     return res
 108
 109
 110 # DCT type II, unscaled. Algorithm by Byeong Gi Lee, 1984.
 111 # See: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.118.3056&rep=rep1&type=pdf#page=34
 112 # original (recursive) algorithm by Nayuki
 113 def transform(vector, indent=0):
 114     idt = "   " * indent
 115     n = len(vector)
 116     if n == 1:
 117         return list(vector)
 118     elif n == 0 or n % 2 != 0:
 119         raise ValueError()
 120     else:
 121         half = n // 2
 122         alpha = [(vector[i] + vector[-(i + 1)]) for i in range(half)]
 123         beta  = [(vector[i] - vector[-(i + 1)]) /
 124                  (math.cos((i + 0.5) * math.pi / n) * 2.0)
 125                     for i in range(half)]
 126         alpha = transform(alpha)
 127         beta  = transform(beta )
 128         result = []
 129         for i in range(half - 1):
 130             result.append(alpha[i])
 131             result.append(beta[i] + beta[i + 1])
 132         result.append(alpha[-1])
 133         result.append(beta [-1])
 134         return result
 135
 136
 137 # modified recursive algorithm, based on Nayuki original, which simply
 138 # prints out an awful lot of debug data.  used to work out the ordering
 139 # for the iterative version by analysing the indices printed out
 140 def transform(vector, indent=0):
 141     idt = "   " * indent
 142     n = len(vector)
 143     if n == 1:
 144         return list(vector)
 145     elif n == 0 or n % 2 != 0:
 146         raise ValueError()
 147     else:
 148         half = n // 2
 149         alpha = [0] * half
 150         beta = [0] * half
 151         print (idt, "xf", vector)
 152         print (idt, "coeff", n, "->", end=" ")
 153         for i in range(half):
 154             t1, t2 = vector[i], vector[n-i-1]
 155             k = (math.cos((i + 0.5) * math.pi / n) * 2.0)
 156             print (i, n-i-1, "i/n", (i+0.5)/n, ":", k, end= " ")
 157             alpha[i] = t1 + t2
 158             beta[i] = (t1 - t2) * (1/k)
 159         print ()
 160         print (idt, "n", n, "alpha", end=" ")
 161         for i in range(0, n, 2):
 162             print (i, i//2, alpha[i//2], end=" ")
 163         print()
 164         print (idt, "n", n, "beta", end=" ")
 165         for i in range(0, n, 2):
 166             print (i, beta[i//2], end=" ")
 167         print()
 168         alpha = transform(alpha, indent+1)
 169         beta  = transform(beta , indent+1)
 170         result = [0] * n
 171         for i in range(half):
 172             result[i*2] = alpha[i]
 173             result[i*2+1] = beta[i]
 174         print(idt, "merge", result)
 175         for i in range(half - 1):
 176             result[i*2+1] += result[i*2+3]
 177         print(idt, "result", result)
 178         return result
 179
 180
 181 # totally cool *in-place* DCT algorithm
 182 def transform2(vec):
 183
 184     # Initialization
 185     n = len(vec)
 186     print ()
 187     print ("transform2", n)
 188     levels = n.bit_length() - 1
 189
 190     # reference (read/write) the in-place data in *reverse-bit-order*
 191     ri = list(range(n))
 192     ri = [ri[reverse_bits(i, levels)] for i in range(n)]
 193
 194     # reference list for not needing to do data-swaps, just swap what
 195     # *indices* are referenced (two levels of indirection at the moment)
 196     # pre-reverse the data-swap list so that it *ends up* in the order 0123..
 197     ji = list(range(n))
 198     ji = halfrev2(ji, True)
 199
 200     # and pretend we LDed data in half-swapped *and* bit-reversed order as well
 201     # TODO: merge these two
 202     vec = halfrev2(vec, False)
 203     vec = [vec[ri[i]] for i in range(n)]
 204
 205     print ("ri", ri)
 206     print ("ji", ji)
 207
 208     # create a cos table: not strictly necessary but here for illustrative
 209     # purposes, to demonstrate the point that it really *is* iterative.
 210     # this table could be cached and used multiple times rather than
 211     # computed every time.
 212     ctable = []
 213     size = n
 214     while size >= 2:
 215         halfsize = size // 2
 216         for i in range(n//size):
 217             for ci in range(halfsize):
 218                 ctable.append((math.cos((ci + 0.5) * math.pi / size) * 2.0))
 219         size //= 2
 220
 221     # start the inner butterfly
 222     size = n
 223     k = 0
 224     while size >= 2:
 225         halfsize = size // 2
 226         tablestep = n // size
 227         ir = list(range(0, n, size))
 228         print ("  xform", size, ir)
 229         for i in ir:
 230             # two lists of half-range indices, e.g. j 0123, jr 7654
 231             j = list(range(i, i + halfsize))
 232             jr = list(range(i+halfsize, i + size))
 233             jr.reverse()
 234             print ("  xform jr", j, jr)
 235             for ci, (jl, jh) in enumerate(zip(j, jr)):
 236                 t1, t2 = vec[ri[ji[jl]]], vec[ri[ji[jh]]]
 237                 #coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
 238                 coeff = ctable[k]
 239                 k += 1
 240                 # normally DCT would use jl+halfsize not jh, here.
 241                 # to be able to work in-place, the idea is to perform a
 242                 # swap afterwards.
 243                 vec[ri[ji[jl]]] = t1 + t2
 244                 vec[ri[ji[jh]]] = (t1 - t2) * (1/coeff)
 245                 print ("coeff", size, i, "ci", ci,
 246                         "jl", ri[ji[jl]], "jh", ri[ji[jh]],
 247                        "i/n", (ci+0.5)/size, coeff, vec[ri[ji[jl]]],
 248                                                     vec[ri[ji[jh]]])
 249             # instead of using jl+halfsize, perform a swap here.
 250             # use half of j/jr because actually jl+halfsize = reverse(j)
 251             hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
 252             for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
 253                 jlh = jl+halfsize
 254                 # swap indices, NOT the data
 255                 tmp1, tmp2 = ji[jlh], ji[jh]
 256                 ji[jlh], ji[jh] = tmp2, tmp1
 257                 print ("     swap", size, i, ji[jlh], ji[jh])
 258         size //= 2
 259
 260     print("post-swapped", ri)
 261     print("ji-swapped", ji)
 262     print("transform2 pre-itersum", vec)
 263
 264     # now things are in the right order for the outer butterfly.
 265     n = len(vec)
 266     size = n // 2
 267     while size >= 2:
 268         halfsize = size // 2
 269         ir = list(range(0, halfsize))
 270         print ("itersum", halfsize, size, ir)
 271         for i in ir:
 272             jr = list(range(i+halfsize, i+n-halfsize, size))
 273             print ("itersum    jr", i+halfsize, i+size, jr)
 274             for jh in jr:
 275                 vec[jh] += vec[jh+size]
 276                 print ("    itersum", size, i, jh, jh+size)
 277         size //= 2
 278
 279     print("transform2 result", vec)
 280
 281     return vec
 282
 283
 284 # DCT type III, unscaled. Algorithm by Byeong Gi Lee, 1984.
 285 # See: https://www.nayuki.io/res/fast-discrete-cosine-transform-algorithms/lee-new-algo-discrete-cosine-transform.pdf
 286 def inverse_transform(vector, root=True, indent=0):
 287     idt = "   " * indent
 288     if root:
 289         vector = list(vector)
 290         vector[0] /= 2
 291     n = len(vector)
 292     if n == 1:
 293         return vector, [0]
 294     elif n == 0 or n % 2 != 0:
 295         raise ValueError()
 296     else:
 297         half = n // 2
 298         alpha = [vector[0]]
 299         beta  = [vector[1]]
 300         for i in range(2, n, 2):
 301             alpha.append(vector[i])
 302             beta.append(vector[i - 1] + vector[i + 1])
 303         print (idt, "n", n, "alpha 0", end=" ")
 304         for i in range(2, n, 2):
 305             print (i, end=" ")
 306         print ("beta 1", end=" ")
 307         for i in range(2, n, 2):
 308             print ("%d+%d" % (i-1, i+1), end=" ")
 309         print()
 310         inverse_transform(alpha, False, indent+1)
 311         inverse_transform(beta , False, indent+1)
 312         for i in range(half):
 313             x, y = alpha[i], beta[i]
 314             coeff = (math.cos((i + 0.5) * math.pi / n) * 2)
 315             y /= coeff
 316             vector[i] = x + y
 317             vector[n-(i+1)] = x - y
 318             print (idt, " v[%d] = alpha[%d]+beta[%d]" % (i, i, i))
 319             print (idt, " v[%d] = alpha[%d]-beta[%d]" % (n-i-1, i, i))
 320         return vector
 321
 322
 323 # totally cool *in-place* DCT algorithm
 324 def inverse_transform_iter(vec):
 325
 326     # Initialization
 327     n = len(vec)
 328     print ()
 329     print ("transform2 inv", n, vec)
 330     levels = n.bit_length() - 1
 331
 332     # reference (read/write) the in-place data in *reverse-bit-order*
 333     ri = list(range(n))
 334     ri = [ri[reverse_bits(i, levels)] for i in range(n)]
 335
 336     # reference list for not needing to do data-swaps, just swap what
 337     # *indices* are referenced (two levels of indirection at the moment)
 338     # pre-reverse the data-swap list so that it *ends up* in the order 0123..
 339     ji = list(range(n))
 340     ji = halfrev2(ji, False)
 341
 342     print ("ri", ri)
 343     print ("ji", ji)
 344
 345     # create a cos table: not strictly necessary but here for illustrative
 346     # purposes, to demonstrate the point that it really *is* iterative.
 347     # this table could be cached and used multiple times rather than
 348     # computed every time.
 349     ctable = []
 350     size = n
 351     while size >= 2:
 352         halfsize = size // 2
 353         for i in range(n//size):
 354             for ci in range(halfsize):
 355                 ctable.append((math.cos((ci + 0.5) * math.pi / size) * 2.0))
 356         size //= 2
 357
 358     # first divide element 0 by 2
 359     vec[0] /= 2.0
 360
 361     print("transform2-inv pre-itersum", vec)
 362     #vec = halfrev2(vec, True)
 363     #print("transform2-inv post-itersum-reorder", vec)
 364
 365     # first the outer butterfly (iterative sum thing)
 366     n = len(vec)
 367     size = 2
 368     while size <= n:
 369         halfsize = size // 2
 370         ir = list(range(0, halfsize))
 371         print ("itersum", halfsize, size, ir)
 372         for i in ir:
 373             jr = list(range(i+halfsize, i+n-halfsize, size))
 374             jr.reverse()
 375             print ("itersum    jr", i+halfsize, i+size, jr)
 376             for jh in jr:
 377                 #x = vec[ji[jh]]
 378                 #y = vec[ji[jh+size]]
 379                 #vec[ji[jh+size]] = x + y
 380                 x = vec[jh]
 381                 y = vec[jh+size]
 382                 vec[jh+size] = x + y
 383                 print ("    itersum", size, i, jh, jh+size,
 384                         x, y, "jh+sz", vec[ji[jh+size]])
 385         size *= 2
 386
 387     print("transform2-inv post-itersum", vec)
 388
 389     # and pretend we LDed data in half-swapped *and* bit-reversed order as well
 390     # TODO: merge these two
 391     vec = [vec[ri[i]] for i in range(n)]
 392     vec = halfrev2(vec, True)
 393     ri = list(range(n))
 394
 395     print("transform2-inv post-reorder", vec)
 396
 397     # start the inner butterfly (coefficients)
 398     size = 2
 399     k = 0
 400     while size <= n:
 401         halfsize = size // 2
 402         tablestep = n // size
 403         ir = list(range(0, n, size))
 404         print ("  xform", size, ir)
 405         for i in ir:
 406             # two lists of half-range indices, e.g. j 0123, jr 7654
 407             j = list(range(i, i + halfsize))
 408             jr = list(range(i+halfsize, i + size))
 409             jr.reverse()
 410             print ("  xform jr", j, jr)
 411             for ci, (jl, jh) in enumerate(zip(j, jr)):
 412                 #t1, t2 = vec[ri[ji[jl]]], vec[ri[ji[jh]]]
 413                 t1, t2 = vec[ji[jl]], vec[ji[jl+halfsize]]
 414                 coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
 415                 #coeff = ctable[k]
 416                 k += 1
 417                 # normally DCT would use jl+halfsize not jh, here.
 418                 # to be able to work in-place, the idea is to perform a
 419                 # swap afterwards.
 420                 #vec[ri[ji[jl]]] = t1 + t2/coeff
 421                 #vec[ri[ji[jh]]] = t1 - t2/coeff
 422                 vec[ji[jl]] = t1 + t2/coeff
 423                 vec[ji[jl+halfsize]] = t1 - t2/coeff
 424                 print ("coeff", size, i, "ci", ci,
 425                         "jl", ri[ji[jl]], "jh", ri[ji[jh]],
 426                        "i/n", (ci+0.5)/size, coeff,
 427                         "t1,t2", t1, t2,
 428                         "+/i", vec[ji[jl]], vec[ji[jh]])
 429                         #"+/i", vec2[ri[ji[jl]]], vec2[ri[ji[jh]]])
 430             # instead of using jl+halfsize, perform a swap here.
 431             # use half of j/jr because actually jl+halfsize = reverse(j)
 432             hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
 433             for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
 434                 jlh = jl+halfsize
 435                 # swap indices, NOT the data
 436                 tmp1, tmp2 = ji[jlh], ji[jh]
 437                 ji[jlh], ji[jh] = tmp2, tmp1
 438                 print ("     swap", size, i, ji[jlh], ji[jh])
 439         size *= 2
 440
 441     print("post-swapped", ri)
 442     print("ji-swapped", ji)
 443     ji = list(range(n))
 444     ji = halfrev2(ji, True)
 445     print("ji-calc   ", ji)
 446
 447     print("transform2-inv result", vec)
 448
 449     return vec
 450
 451
 452 def inverse_transform2(vector, root=True, indent=0):
 453     idt = "   " * indent
 454     n = len(vector)
 455     if root:
 456         vector = list(vector)
 457         vector[0] /= 2
 458     if n == 1:
 459         return vector
 460     elif n == 0 or n % 2 != 0:
 461         raise ValueError()
 462     else:
 463         print (idt, "inverse_xform2", vector)
 464         half = n // 2
 465         alpha = [vector[0]]
 466         beta  = [vector[1]]
 467         for i in range(2, n, 2):
 468             alpha.append(vector[i])
 469             beta.append(vector[i - 1] + vector[i + 1])
 470             print (idt, " alpha", i, vector[i])
 471             print (idt, " beta", i-1, i+1, vector[i-1], vector[i+1], "->",
 472                           beta[-1])
 473         inverse_transform2(alpha, False, indent+1)
 474         inverse_transform2(beta , False, indent+1)
 475         for i in range(half):
 476             x, y = alpha[i], beta[i]
 477             coeff = (math.cos((i + 0.5) * math.pi / n) * 2)
 478             vector[i] = x + y / coeff
 479             vector[n-(i+1)] = x - y / coeff
 480             print (idt, " v[%d] = %f+%f/%f=%f" % (i, x, y, coeff, vector[i]))
 481             print (idt, " v[%d] = %f-%f/%f=%f" % (n-i-1, x, y,
 482                                                   coeff, vector[n-i-1]))
 483         return vector
 484
 485
 486 def inverse_transform2_explore(vector, root=True, indent=0):
 487     n = len(vector)
 488     if root:
 489         vector = list(vector)
 490     if n == 1:
 491         return vector
 492     elif n == 0 or n % 2 != 0:
 493         raise ValueError()
 494     else:
 495         half = n // 2
 496         alpha = [vector[0]]
 497         beta  = [vector[1]]
 498         for i in range(2, n, 2):
 499             alpha.append(vector[i])
 500             beta.append(("add%d" % indent, vector[i - 1], vector[i + 1]))
 501         inverse_transform2_explore(alpha, False, indent+1)
 502         inverse_transform2_explore(beta , False, indent+1)
 503         for i in range(half):
 504             x = alpha[i]
 505             y = ("cos%d" % indent, beta[i], i, n)
 506             vector[i] = ("add%d" % indent, x, y)
 507             vector[n-(i + 1)] = ("sub%d" % indent, x, y)
 508         return vector
 509
 510
 511
 512 # does the outer butterfly in a recursive fashion, used in an
 513 # intermediary development of the in-place DCT.
 514 def transform_itersum(vector, indent=0):
 515     idt = "   " * indent
 516     n = len(vector)
 517     if n == 1:
 518         return list(vector)
 519     elif n == 0 or n % 2 != 0:
 520         raise ValueError()
 521     else:
 522         half = n // 2
 523         alpha = [0] * half
 524         beta = [0] * half
 525         for i in range(half):
 526             t1, t2 = vector[i], vector[i+half]
 527             alpha[i] = t1
 528             beta[i] = t2
 529         alpha = transform_itersum(alpha, indent+1)
 530         beta  = transform_itersum(beta , indent+1)
 531         result = [0] * n
 532         for i in range(half):
 533             result[i*2] = alpha[i]
 534             result[i*2+1] = beta[i]
 535         print(idt, "iter-merge", result)
 536         for i in range(half - 1):
 537             result[i*2+1] += result[i*2+3]
 538         print(idt, "iter-result", result)
 539         return result
 540
 541
 542 # prints out an "add" schedule for the outer butterfly, recursively,
 543 # matching what transform_itersum does.
 544 def itersum_explore(vector, indent=0):
 545     idt = "   " * indent
 546     n = len(vector)
 547     if n == 1:
 548         return list(vector)
 549     elif n == 0 or n % 2 != 0:
 550         raise ValueError()
 551     else:
 552         half = n // 2
 553         alpha = [0] * half
 554         beta = [0] * half
 555         for i in range(half):
 556             t1, t2 = vector[i], vector[i+half]
 557             alpha[i] = t1
 558             beta[i] = t2
 559         alpha = itersum_explore(alpha, indent+1)
 560         beta  = itersum_explore(beta , indent+1)
 561         result = [0] * n
 562         for i in range(half):
 563             result[i*2] = alpha[i]
 564             result[i*2+1] = beta[i]
 565         print(idt, "iter-merge", result)
 566         for i in range(half - 1):
 567             result[i*2+1] = ("add", result[i*2+1], result[i*2+3])
 568         print(idt, "iter-result", result)
 569         return result
 570
 571
 572 # prints out the exact same outer butterfly but does so iteratively.
 573 # by comparing the output from itersum_explore and itersum_explore2
 574 # and by drawing out the resultant ADDs as a graph it was possible
 575 # to deduce what the heck was going on.
 576 def itersum_explore2(vec, indent=0):
 577     n = len(vec)
 578     size = n // 2
 579     while size >= 2:
 580         halfsize = size // 2
 581         ir = list(range(0, halfsize))
 582         print ("itersum", halfsize, size, ir)
 583         for i in ir:
 584             jr = list(range(i+halfsize, i+n-halfsize, size))
 585             print ("itersum    jr", i+halfsize, i+size, jr)
 586             for jh in jr:
 587                 vec[jh] = ("add", vec[jh], vec[jh+size])
 588                 print ("    itersum", size, i, jh, jh+size)
 589         size //= 2
 590
 591     return vec
 592
 593 if __name__ == '__main__':
 594     n = 16
 595     vec = list(range(n))
 596     levels = n.bit_length() - 1
 597     vec = [vec[reverse_bits(i, levels)] for i in range(n)]
 598     ops = itersum_explore(vec)
 599     for i, x in enumerate(ops):
 600         print (i, x)
 601
 602     n = 16
 603     vec = list(range(n))
 604     levels = n.bit_length() - 1
 605     ops = itersum_explore2(vec)
 606     for i, x in enumerate(ops):
 607         print (i, x)
 608
 609     # halfrev test
 610     vec = list(range(16))
 611     print ("orig vec", vec)
 612     vecr = halfrev(vec, True)
 613     print ("reversed", vecr)
 614     for i, v in enumerate(vecr):
 615         print ("%2d %2d   %04s %04s %04s" % (i, v,
 616                             bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
 617     vecrr = halfrev(vecr, False)
 618     assert vec == vecrr
 619     vecrr = halfrev(vec, False)
 620     print ("pre-reversed", vecrr)
 621     for i, v in enumerate(vecrr):
 622         print ("%2d %2d   %04s %04s %04s" % (i, v,
 623                             bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
 624     il = halfrev2(vec, False)
 625     print ("iterative rev", il)
 626     il = halfrev2(vec, True)
 627     print ("iterative rev-true", il)
 628
 629     n = 4
 630     vec = list(range(n))
 631     levels = n.bit_length() - 1
 632     ops = inverse_transform2_explore(vec)
 633     for i, x in enumerate(ops):
 634         print (i, x)
 635