src/openpower/decoder/isa/fastdctlee.py

   1 #
   2 # Fast discrete cosine transform algorithms (Python)
   3 #
   4 # Copyright (c) 2020 Project Nayuki. (MIT License)
   5 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
   6 #
   7 # License for original fastdctlee.py by Nayuki:
   8 #
   9 # Permission is hereby granted, free of charge, to any person obtaining
  10 # a copy of this software and associated documentation files (the
  11 # "Software"), to deal in the Software without restriction, including
  12 # without limitation the rights to use, copy, modify, merge, publish,
  13 # distribute, sublicense, and/or sell copies of the Software, and to
  14 # permit persons to whom the Software is furnished to do so, subject to
  15 # the following conditions:
  16 # - The above copyright notice and this permission notice shall be included in
  17 #   all copies or substantial portions of the Software.
  18 # - The Software is provided "as is", without warranty of any kind, express or
  19 #   implied, including but not limited to the warranties of merchantability,
  20 #   fitness for a particular purpose and noninfringement. In no event shall the
  21 #   authors or copyright holders be liable for any claim, damages or other
  22 #   liability, whether in an action of contract, tort or otherwise,
  23 #   arising from, out of or in connection with the Software or the use
  24 #   or other dealings in the Software.
  25 #
  26 #
  27 # Modifications made to in-place iterative DCT - SPDX: LGPLv3+
  28 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  29 #
  30 # The modifications made are firstly to create an iterative schedule,
  31 # rather than the more normal recursive algorithm.  Secondly, the
  32 # two butterflys are also separated out: inner butterfly does COS +/-
  33 # whilst outer butterfly does the iterative summing.
  34 #
  35 # However, to avoid data copying some additional tricks are played:
  36 # - firstly, the data is LOADed in bit-reversed order (which is normally
  37 #   covered by the recursive algorithm due to the odd-even reconstruction)
  38 #   but then to reference the data in the correct order an array of
  39 #   bit-reversed indices is created, as a level of indirection.
  40 #   the data is bit-reversed but so are the indices, making it all A-Ok.
  41 # - secondly, normally in DCT a 2nd target (copy) array is used where
  42 #   the top half is read in reverse order (7 6 5 4) and written out
  43 #   to the target 4 5 6 7.  the plan was to do this in two stages:
  44 #   write in-place in order 4 5 6 7 then swap afterwards (7-4), (6-5).
  45 #   the insight then was: to modify the *indirection* indices rather
  46 #   than swap the actual data, and then try the same trick as was done
  47 #   with the bit-reversed LOAD.  by a bizarre twist of good fortune
  48 #   *that was not needed*!  simply swapping the indices was enough!
  49 # End result is that once the first butterfly is done - bear in mind
  50 # it's in-place - the data is in the right order so that a second
  51 # dead-straightforward iterative sum can be done: again, in-place.
  52 # Really surprising.
  53
  54 import math
  55 from copy import deepcopy
  56
  57 # bits of the integer 'val'.
  58 def reverse_bits(val, width):
  59     result = 0
  60     for _ in range(width):
  61         result = (result << 1) | (val & 1)
  62         val >>= 1
  63     return result
  64
  65
  66 # DCT type II, unscaled. Algorithm by Byeong Gi Lee, 1984.
  67 # See: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.118.3056&rep=rep1&type=pdf#page=34
  68 # original (recursive) algorithm by Nayuki
  69 def transform(vector, indent=0):
  70     idt = "   " * indent
  71     n = len(vector)
  72     if n == 1:
  73         return list(vector)
  74     elif n == 0 or n % 2 != 0:
  75         raise ValueError()
  76     else:
  77         half = n // 2
  78         alpha = [(vector[i] + vector[-(i + 1)]) for i in range(half)]
  79         beta  = [(vector[i] - vector[-(i + 1)]) /
  80                  (math.cos((i + 0.5) * math.pi / n) * 2.0)
  81                     for i in range(half)]
  82         alpha = transform(alpha)
  83         beta  = transform(beta )
  84         result = []
  85         for i in range(half - 1):
  86             result.append(alpha[i])
  87             result.append(beta[i] + beta[i + 1])
  88         result.append(alpha[-1])
  89         result.append(beta [-1])
  90         return result
  91
  92
  93 # modified (iterative) algorithm by lkcl, based on Nayuki original
  94 def transform(vector, indent=0):
  95     idt = "   " * indent
  96     n = len(vector)
  97     if n == 1:
  98         return list(vector)
  99     elif n == 0 or n % 2 != 0:
 100         raise ValueError()
 101     else:
 102         half = n // 2
 103         alpha = [0] * half
 104         beta = [0] * half
 105         print (idt, "xf", vector)
 106         print (idt, "coeff", n, "->", end=" ")
 107         for i in range(half):
 108             t1, t2 = vector[i], vector[n-i-1]
 109             k = (math.cos((i + 0.5) * math.pi / n) * 2.0)
 110             print (i, n-i-1, "i/n", (i+0.5)/n, ":", k, end= " ")
 111             alpha[i] = t1 + t2
 112             beta[i] = (t1 - t2) * (1/k)
 113         print ()
 114         print (idt, "n", n, "alpha", end=" ")
 115         for i in range(0, n, 2):
 116             print (i, i//2, alpha[i//2], end=" ")
 117         print()
 118         print (idt, "n", n, "beta", end=" ")
 119         for i in range(0, n, 2):
 120             print (i, beta[i//2], end=" ")
 121         print()
 122         alpha = transform(alpha, indent+1)
 123         beta  = transform(beta , indent+1)
 124         result = [0] * n
 125         for i in range(half):
 126             result[i*2] = alpha[i]
 127             result[i*2+1] = beta[i]
 128         print(idt, "merge", result)
 129         for i in range(half - 1):
 130             result[i*2+1] += result[i*2+3]
 131         print(idt, "result", result)
 132         return result
 133
 134
 135 # totally cool *in-place* DCT algorithm
 136 def transform2(vec):
 137
 138     vec = deepcopy(vec)
 139     # Initialization
 140     n = len(vec)
 141     print ()
 142     print ("transform2", n)
 143     levels = n.bit_length() - 1
 144
 145     # reference (read/write) the in-place data in *reverse-bit-order*
 146     ri = list(range(n))
 147     ri = [ri[reverse_bits(i, levels)] for i in range(n)]
 148
 149     # and pretend we LDed the data in bit-reversed order as well
 150     vec = [vec[reverse_bits(i, levels)] for i in range(n)]
 151
 152     # create cos coefficient table
 153     coeffs = []
 154     for ci in range(n):
 155         coeffs.append((math.cos((ci + 0.5) * math.pi / n) * 2.0))
 156
 157     # start the inner butterfly
 158     size = n
 159     while size >= 2:
 160         halfsize = size // 2
 161         tablestep = n // size
 162         ir = list(range(0, n, size))
 163         print ("  xform", size, ir)
 164         for i in ir:
 165             k = 0
 166             j = list(range(i, i + halfsize))
 167             jr = list(range(i+halfsize, i + size))
 168             jr.reverse()
 169             print ("  xform jr", j, jr)
 170             for ci, (jl, jh) in enumerate(zip(j, jr)):
 171                 t1, t2 = vec[ri[jl]], vec[ri[jh]]
 172                 # normally DCT would use jl+halfsize not jh, here.
 173                 # to be able to work in-place, the idea is to perform a
 174                 # high-half reverse/swap afterwards.  however actually
 175                 # we swap the *indices*
 176                 coeff = coeffs[k]
 177                 vec[ri[jl]] = t1 + t2
 178                 vec[ri[jh]] = (t1 - t2) * (1/coeff)
 179                 print (" ", size, i, k, "ci", ci,
 180                         "jl", ri[jl], "jh", ri[jh],
 181                        "i/n", (k+0.5)/size, coeff, vec[ri[jl]], vec[ri[jh]])
 182                 k += tablestep
 183             # instead of using jl+halfsize, perform a swap here.
 184             # use half of j/jr because actually jl+halfsize = reverse(j)
 185             # actually: swap the *indices*... not the actual data.
 186             # incredibly... bizarrely... this works *without* having
 187             # to do anything else.
 188             hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
 189             for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
 190                 tmp = ri[jl+halfsize]
 191                 ri[jl+halfsize] = ri[jh]
 192                 ri[jh] = tmp
 193                 print ("     swap", size, i, ri[jl+halfsize], ri[jh])
 194         size //= 2
 195
 196     print("post-swapped", ri)
 197     print("transform2 pre-itersum", vec)
 198
 199     n = len(vec)
 200     size = n // 2
 201     while size >= 2:
 202         halfsize = size // 2
 203         ir = list(range(0, halfsize))
 204         print ("itersum", halfsize, size, ir)
 205         for i in ir:
 206             jr = list(range(i+halfsize, i+n-halfsize, size))
 207             print ("itersum    jr", i+halfsize, i+size, jr)
 208             for jh in jr:
 209                 vec[jh] += vec[jh+size]
 210                 print ("    itersum", size, i, jh, jh+size)
 211         size //= 2
 212
 213     print("transform2 result", vec)
 214
 215     return vec
 216
 217
 218 # DCT type III, unscaled. Algorithm by Byeong Gi Lee, 1984.
 219 # See: https://www.nayuki.io/res/fast-discrete-cosine-transform-algorithms/lee-new-algo-discrete-cosine-transform.pdf
 220 def inverse_transform(vector, root=True, indent=0):
 221     idt = "   " * indent
 222     if root:
 223         vector = list(vector)
 224         vector[0] /= 2
 225     n = len(vector)
 226     if n == 1:
 227         return vector, [0]
 228     elif n == 0 or n % 2 != 0:
 229         raise ValueError()
 230     else:
 231         half = n // 2
 232         alpha = [vector[0]]
 233         beta  = [vector[1]]
 234         for i in range(2, n, 2):
 235             alpha.append(vector[i])
 236             beta.append(vector[i - 1] + vector[i + 1])
 237         print (idt, "n", n, "alpha 0", end=" ")
 238         for i in range(2, n, 2):
 239             print (i, end=" ")
 240         print ("beta 1", end=" ")
 241         for i in range(2, n, 2):
 242             print ("%d+%d" % (i-1, i+1), end=" ")
 243         print()
 244         inverse_transform(alpha, False, indent+1)
 245         inverse_transform(beta , False, indent+1)
 246         for i in range(half):
 247             x = alpha[i]
 248             y = beta[i] / (math.cos((i + 0.5) * math.pi / n) * 2)
 249             vector[i] = x + y
 250             vector[-(i + 1)] = x - y
 251             print (idt, " v[%d] = alpha[%d]+beta[%d]" % (i, i, i))
 252             print (idt, " v[%d] = alpha[%d]-beta[%d]" % (n-i-1, i, i))
 253         return vector
 254
 255
 256 def inverse_transform2(vector, root=True):
 257     n = len(vector)
 258     if root:
 259         vector = list(vector)
 260     if n == 1:
 261         return vector
 262     elif n == 0 or n % 2 != 0:
 263         raise ValueError()
 264     else:
 265         half = n // 2
 266         alpha = [0]
 267         beta  = [1]
 268         for i in range(2, n, 2):
 269             alpha.append(i)
 270             beta.append(("add", i - 1, i + 1))
 271         inverse_transform2(alpha, False)
 272         inverse_transform2(beta , False)
 273         for i in range(half):
 274             x = alpha[i]
 275             y = ("cos", beta[i], i)
 276             vector[i] = ("add", x, y)
 277             vector[-(i + 1)] = ("sub", x, y)
 278         return vector
 279
 280
 281 # does the outer butterfly in a recursive fashion, used in an
 282 # intermediary development of the in-place DCT.
 283 def transform_itersum(vector, indent=0):
 284     idt = "   " * indent
 285     n = len(vector)
 286     if n == 1:
 287         return list(vector)
 288     elif n == 0 or n % 2 != 0:
 289         raise ValueError()
 290     else:
 291         half = n // 2
 292         alpha = [0] * half
 293         beta = [0] * half
 294         for i in range(half):
 295             t1, t2 = vector[i], vector[i+half]
 296             alpha[i] = t1
 297             beta[i] = t2
 298         alpha = transform_itersum(alpha, indent+1)
 299         beta  = transform_itersum(beta , indent+1)
 300         result = [0] * n
 301         for i in range(half):
 302             result[i*2] = alpha[i]
 303             result[i*2+1] = beta[i]
 304         print(idt, "iter-merge", result)
 305         for i in range(half - 1):
 306             result[i*2+1] += result[i*2+3]
 307         print(idt, "iter-result", result)
 308         return result
 309
 310
 311 # prints out an "add" schedule for the outer butterfly, recursively,
 312 # matching what transform_itersum does.
 313 def itersum_explore(vector, indent=0):
 314     idt = "   " * indent
 315     n = len(vector)
 316     if n == 1:
 317         return list(vector)
 318     elif n == 0 or n % 2 != 0:
 319         raise ValueError()
 320     else:
 321         half = n // 2
 322         alpha = [0] * half
 323         beta = [0] * half
 324         for i in range(half):
 325             t1, t2 = vector[i], vector[i+half]
 326             alpha[i] = t1
 327             beta[i] = t2
 328         alpha = itersum_explore(alpha, indent+1)
 329         beta  = itersum_explore(beta , indent+1)
 330         result = [0] * n
 331         for i in range(half):
 332             result[i*2] = alpha[i]
 333             result[i*2+1] = beta[i]
 334         print(idt, "iter-merge", result)
 335         for i in range(half - 1):
 336             result[i*2+1] = ("add", result[i*2+1], result[i*2+3])
 337         print(idt, "iter-result", result)
 338         return result
 339
 340
 341 # prints out the exact same outer butterfly but does so iteratively.
 342 # by comparing the output from itersum_explore and itersum_explore2
 343 # and by drawing out the resultant ADDs as a graph it was possible
 344 # to deduce what the heck was going on.
 345 def itersum_explore2(vec, indent=0):
 346     n = len(vec)
 347     size = n // 2
 348     while size >= 2:
 349         halfsize = size // 2
 350         ir = list(range(0, halfsize))
 351         print ("itersum", halfsize, size, ir)
 352         for i in ir:
 353             jr = list(range(i+halfsize, i+n-halfsize, size))
 354             print ("itersum    jr", i+halfsize, i+size, jr)
 355             for jh in jr:
 356                 vec[jh] = ("add", vec[jh], vec[jh+size])
 357                 print ("    itersum", size, i, jh, jh+size)
 358         size //= 2
 359
 360     return vec
 361
 362 if __name__ == '__main__':
 363     n = 16
 364     vec = list(range(n))
 365     levels = n.bit_length() - 1
 366     vec = [vec[reverse_bits(i, levels)] for i in range(n)]
 367     ops = itersum_explore(vec)
 368     for i, x in enumerate(ops):
 369         print (i, x)
 370
 371     n = 16
 372     vec = list(range(n))
 373     levels = n.bit_length() - 1
 374     ops = itersum_explore2(vec)
 375     for i, x in enumerate(ops):
 376         print (i, x)