code comments
[openpower-isa.git] / src / openpower / decoder / isa / fastdctlee.py
1 #
2 # Fast discrete cosine transform algorithm (Python)
3 #
4 # Modifications made to create an in-place iterative DCT:
5 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
6 #
7 # License for modifications - SPDX: LGPLv3+
8 #
9 # Original fastdctlee.py by Nayuki:
10 # Copyright (c) 2020 Project Nayuki. (MIT License)
11 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
12 #
13 # License for original fastdctlee.py by Nayuki:
14 #
15 # Permission is hereby granted, free of charge, to any person obtaining
16 # a copy of this software and associated documentation files (the
17 # "Software"), to deal in the Software without restriction, including
18 # without limitation the rights to use, copy, modify, merge, publish,
19 # distribute, sublicense, and/or sell copies of the Software, and to
20 # permit persons to whom the Software is furnished to do so, subject to
21 # the following conditions:
22 # - The above copyright notice and this permission notice shall be included in
23 # all copies or substantial portions of the Software.
24 # - The Software is provided "as is", without warranty of any kind, express or
25 # implied, including but not limited to the warranties of merchantability,
26 # fitness for a particular purpose and noninfringement. In no event shall the
27 # authors or copyright holders be liable for any claim, damages or other
28 # liability, whether in an action of contract, tort or otherwise,
29 # arising from, out of or in connection with the Software or the use
30 # or other dealings in the Software.
31 #
32 #
33 # The modifications made are firstly to create an iterative schedule,
34 # rather than the more normal recursive algorithm. Secondly, the
35 # two butterflys are also separated out: inner butterfly does COS +/-
36 # whilst outer butterfly does the iterative summing.
37 #
38 # However, to avoid data copying some additional tricks are played:
39 # - firstly, the data is LOADed in bit-reversed order (which is normally
40 # covered by the recursive algorithm due to the odd-even reconstruction)
41 # but then to reference the data in the correct order an array of
42 # bit-reversed indices is created, as a level of indirection.
43 # the data is bit-reversed but so are the indices, making it all A-Ok.
44 # - secondly, normally in DCT a 2nd target (copy) array is used where
45 # the top half is read in reverse order (7 6 5 4) and written out
46 # to the target 4 5 6 7. the plan was to do this in two stages:
47 # write in-place in order 4 5 6 7 then swap afterwards (7-4), (6-5).
48 # however by leaving the data *in-place* and having subsequent
49 # loops refer to the data *where it now is*, the swap is avoided
50 # - thirdly, arrange for the data to be *pre-swapped* (in an inverse
51 # order of how it would have got there, if that makes sense), such
52 # that *when* it gets swapped, it ends up in the right order.
53 # given that that will be a LD operation it's no big deal.
54 #
55 # End result is that once the first butterfly is done - bear in mind
56 # it's in-place - the data is in the right order so that a second
57 # dead-straightforward iterative sum can be done: again, in-place.
58 # Really surprising.
59
60 import math
61 from copy import deepcopy
62
63 # bits of the integer 'val'.
64 def reverse_bits(val, width):
65 result = 0
66 for _ in range(width):
67 result = (result << 1) | (val & 1)
68 val >>= 1
69 return result
70
71
72 # reverse top half of a list, recursively. the recursion can be
73 # applied *after* or *before* the reversal of the top half. these
74 # are inverses of each other.
75 # this function is unused except to test the iterative version (halfrev2)
76 def halfrev(l, pre_rev=True):
77 n = len(l)
78 if n == 1:
79 return l
80 ll, lh = l[:n//2], l[n//2:]
81 if pre_rev:
82 ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
83 lh.reverse()
84 if not pre_rev:
85 ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
86 return ll + lh
87
88
89 # iterative version of [recursively-applied] half-rev.
90 # relies on the list lengths being power-of-two and the fact
91 # that bit-inversion of a list of binary numbers is the same
92 # as reversing the order of the list
93 # this version is dead easy to implement in hardware.
94 # a big surprise is that the half-reversal can be done with
95 # such a simple XOR. the inverse operation is slightly trickier
96 def halfrev2(vec, pre_rev=True):
97 res = []
98 for i in range(len(vec)):
99 if pre_rev:
100 res.append(i ^ (i>>1))
101 else:
102 ri = i
103 bl = i.bit_length()
104 for ji in range(1, bl):
105 ri ^= (i >> ji)
106 res.append(vec[ri])
107 return res
108
109
110 # DCT type II, unscaled. Algorithm by Byeong Gi Lee, 1984.
111 # See: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.118.3056&rep=rep1&type=pdf#page=34
112 # original (recursive) algorithm by Nayuki
113 def transform(vector, indent=0):
114 idt = " " * indent
115 n = len(vector)
116 if n == 1:
117 return list(vector)
118 elif n == 0 or n % 2 != 0:
119 raise ValueError()
120 else:
121 half = n // 2
122 alpha = [(vector[i] + vector[-(i + 1)]) for i in range(half)]
123 beta = [(vector[i] - vector[-(i + 1)]) /
124 (math.cos((i + 0.5) * math.pi / n) * 2.0)
125 for i in range(half)]
126 alpha = transform(alpha)
127 beta = transform(beta )
128 result = []
129 for i in range(half - 1):
130 result.append(alpha[i])
131 result.append(beta[i] + beta[i + 1])
132 result.append(alpha[-1])
133 result.append(beta [-1])
134 return result
135
136
137 # modified recursive algorithm, based on Nayuki original, which simply
138 # prints out an awful lot of debug data. used to work out the ordering
139 # for the iterative version by analysing the indices printed out
140 def transform(vector, indent=0):
141 idt = " " * indent
142 n = len(vector)
143 if n == 1:
144 return list(vector)
145 elif n == 0 or n % 2 != 0:
146 raise ValueError()
147 else:
148 half = n // 2
149 alpha = [0] * half
150 beta = [0] * half
151 print (idt, "xf", vector)
152 print (idt, "coeff", n, "->", end=" ")
153 for i in range(half):
154 t1, t2 = vector[i], vector[n-i-1]
155 k = (math.cos((i + 0.5) * math.pi / n) * 2.0)
156 print (i, n-i-1, "i/n", (i+0.5)/n, ":", k, end= " ")
157 alpha[i] = t1 + t2
158 beta[i] = (t1 - t2) * (1/k)
159 print ()
160 print (idt, "n", n, "alpha", end=" ")
161 for i in range(0, n, 2):
162 print (i, i//2, alpha[i//2], end=" ")
163 print()
164 print (idt, "n", n, "beta", end=" ")
165 for i in range(0, n, 2):
166 print (i, beta[i//2], end=" ")
167 print()
168 alpha = transform(alpha, indent+1)
169 beta = transform(beta , indent+1)
170 result = [0] * n
171 for i in range(half):
172 result[i*2] = alpha[i]
173 result[i*2+1] = beta[i]
174 print(idt, "merge", result)
175 for i in range(half - 1):
176 result[i*2+1] += result[i*2+3]
177 print(idt, "result", result)
178 return result
179
180
181 # totally cool *in-place* DCT algorithm
182 def transform2(vec):
183
184 # Initialization
185 n = len(vec)
186 print ()
187 print ("transform2", n)
188 levels = n.bit_length() - 1
189
190 # reference (read/write) the in-place data in *reverse-bit-order*
191 ri = list(range(n))
192 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
193
194 # reference list for not needing to do data-swaps, just swap what
195 # *indices* are referenced (two levels of indirection at the moment)
196 # pre-reverse the data-swap list so that it *ends up* in the order 0123..
197 ji = list(range(n))
198 ji = halfrev2(ji, True)
199
200 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
201 # TODO: merge these two
202 vec = halfrev2(vec, False)
203 vec = [vec[ri[i]] for i in range(n)]
204
205 print ("ri", ri)
206 print ("ji", ji)
207
208 # create a cos table: not strictly necessary but here for illustrative
209 # purposes, to demonstrate the point that it really *is* iterative.
210 # this table could be cached and used multiple times rather than
211 # computed every time.
212 ctable = []
213 size = n
214 while size >= 2:
215 halfsize = size // 2
216 for i in range(n//size):
217 for ci in range(halfsize):
218 ctable.append((math.cos((ci + 0.5) * math.pi / size) * 2.0))
219 size //= 2
220
221 # start the inner butterfly
222 size = n
223 k = 0
224 while size >= 2:
225 halfsize = size // 2
226 tablestep = n // size
227 ir = list(range(0, n, size))
228 print (" xform", size, ir)
229 for i in ir:
230 # two lists of half-range indices, e.g. j 0123, jr 7654
231 j = list(range(i, i + halfsize))
232 jr = list(range(i+halfsize, i + size))
233 jr.reverse()
234 print (" xform jr", j, jr)
235 for ci, (jl, jh) in enumerate(zip(j, jr)):
236 t1, t2 = vec[ri[ji[jl]]], vec[ri[ji[jh]]]
237 #coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
238 coeff = ctable[k]
239 k += 1
240 # normally DCT would use jl+halfsize not jh, here.
241 # to be able to work in-place, the idea is to perform a
242 # swap afterwards.
243 vec[ri[ji[jl]]] = t1 + t2
244 vec[ri[ji[jh]]] = (t1 - t2) * (1/coeff)
245 print ("coeff", size, i, "ci", ci,
246 "jl", ri[ji[jl]], "jh", ri[ji[jh]],
247 "i/n", (ci+0.5)/size, coeff, vec[ri[ji[jl]]],
248 vec[ri[ji[jh]]])
249 # instead of using jl+halfsize, perform a swap here.
250 # use half of j/jr because actually jl+halfsize = reverse(j)
251 hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
252 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
253 jlh = jl+halfsize
254 # swap indices, NOT the data
255 tmp1, tmp2 = ji[jlh], ji[jh]
256 ji[jlh], ji[jh] = tmp2, tmp1
257 print (" swap", size, i, ji[jlh], ji[jh])
258 size //= 2
259
260 print("post-swapped", ri)
261 print("ji-swapped", ji)
262 print("transform2 pre-itersum", vec)
263
264 # now things are in the right order for the outer butterfly.
265 n = len(vec)
266 size = n // 2
267 while size >= 2:
268 halfsize = size // 2
269 ir = list(range(0, halfsize))
270 print ("itersum", halfsize, size, ir)
271 for i in ir:
272 jr = list(range(i+halfsize, i+n-halfsize, size))
273 print ("itersum jr", i+halfsize, i+size, jr)
274 for jh in jr:
275 vec[jh] += vec[jh+size]
276 print (" itersum", size, i, jh, jh+size)
277 size //= 2
278
279 print("transform2 result", vec)
280
281 return vec
282
283
284 # DCT type III, unscaled. Algorithm by Byeong Gi Lee, 1984.
285 # See: https://www.nayuki.io/res/fast-discrete-cosine-transform-algorithms/lee-new-algo-discrete-cosine-transform.pdf
286 def inverse_transform(vector, root=True, indent=0):
287 idt = " " * indent
288 if root:
289 vector = list(vector)
290 vector[0] /= 2
291 n = len(vector)
292 if n == 1:
293 return vector, [0]
294 elif n == 0 or n % 2 != 0:
295 raise ValueError()
296 else:
297 half = n // 2
298 alpha = [vector[0]]
299 beta = [vector[1]]
300 for i in range(2, n, 2):
301 alpha.append(vector[i])
302 beta.append(vector[i - 1] + vector[i + 1])
303 print (idt, "n", n, "alpha 0", end=" ")
304 for i in range(2, n, 2):
305 print (i, end=" ")
306 print ("beta 1", end=" ")
307 for i in range(2, n, 2):
308 print ("%d+%d" % (i-1, i+1), end=" ")
309 print()
310 inverse_transform(alpha, False, indent+1)
311 inverse_transform(beta , False, indent+1)
312 for i in range(half):
313 x = alpha[i]
314 y = beta[i] / (math.cos((i + 0.5) * math.pi / n) * 2)
315 vector[i] = x + y
316 vector[-(i + 1)] = x - y
317 print (idt, " v[%d] = alpha[%d]+beta[%d]" % (i, i, i))
318 print (idt, " v[%d] = alpha[%d]-beta[%d]" % (n-i-1, i, i))
319 return vector
320
321
322 def inverse_transform2(vector, root=True):
323 n = len(vector)
324 if root:
325 vector = list(vector)
326 if n == 1:
327 return vector
328 elif n == 0 or n % 2 != 0:
329 raise ValueError()
330 else:
331 half = n // 2
332 alpha = [0]
333 beta = [1]
334 for i in range(2, n, 2):
335 alpha.append(i)
336 beta.append(("add", i - 1, i + 1))
337 inverse_transform2(alpha, False)
338 inverse_transform2(beta , False)
339 for i in range(half):
340 x = alpha[i]
341 y = ("cos", beta[i], i)
342 vector[i] = ("add", x, y)
343 vector[-(i + 1)] = ("sub", x, y)
344 return vector
345
346
347 # does the outer butterfly in a recursive fashion, used in an
348 # intermediary development of the in-place DCT.
349 def transform_itersum(vector, indent=0):
350 idt = " " * indent
351 n = len(vector)
352 if n == 1:
353 return list(vector)
354 elif n == 0 or n % 2 != 0:
355 raise ValueError()
356 else:
357 half = n // 2
358 alpha = [0] * half
359 beta = [0] * half
360 for i in range(half):
361 t1, t2 = vector[i], vector[i+half]
362 alpha[i] = t1
363 beta[i] = t2
364 alpha = transform_itersum(alpha, indent+1)
365 beta = transform_itersum(beta , indent+1)
366 result = [0] * n
367 for i in range(half):
368 result[i*2] = alpha[i]
369 result[i*2+1] = beta[i]
370 print(idt, "iter-merge", result)
371 for i in range(half - 1):
372 result[i*2+1] += result[i*2+3]
373 print(idt, "iter-result", result)
374 return result
375
376
377 # prints out an "add" schedule for the outer butterfly, recursively,
378 # matching what transform_itersum does.
379 def itersum_explore(vector, indent=0):
380 idt = " " * indent
381 n = len(vector)
382 if n == 1:
383 return list(vector)
384 elif n == 0 or n % 2 != 0:
385 raise ValueError()
386 else:
387 half = n // 2
388 alpha = [0] * half
389 beta = [0] * half
390 for i in range(half):
391 t1, t2 = vector[i], vector[i+half]
392 alpha[i] = t1
393 beta[i] = t2
394 alpha = itersum_explore(alpha, indent+1)
395 beta = itersum_explore(beta , indent+1)
396 result = [0] * n
397 for i in range(half):
398 result[i*2] = alpha[i]
399 result[i*2+1] = beta[i]
400 print(idt, "iter-merge", result)
401 for i in range(half - 1):
402 result[i*2+1] = ("add", result[i*2+1], result[i*2+3])
403 print(idt, "iter-result", result)
404 return result
405
406
407 # prints out the exact same outer butterfly but does so iteratively.
408 # by comparing the output from itersum_explore and itersum_explore2
409 # and by drawing out the resultant ADDs as a graph it was possible
410 # to deduce what the heck was going on.
411 def itersum_explore2(vec, indent=0):
412 n = len(vec)
413 size = n // 2
414 while size >= 2:
415 halfsize = size // 2
416 ir = list(range(0, halfsize))
417 print ("itersum", halfsize, size, ir)
418 for i in ir:
419 jr = list(range(i+halfsize, i+n-halfsize, size))
420 print ("itersum jr", i+halfsize, i+size, jr)
421 for jh in jr:
422 vec[jh] = ("add", vec[jh], vec[jh+size])
423 print (" itersum", size, i, jh, jh+size)
424 size //= 2
425
426 return vec
427
428 if __name__ == '__main__':
429 n = 16
430 vec = list(range(n))
431 levels = n.bit_length() - 1
432 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
433 ops = itersum_explore(vec)
434 for i, x in enumerate(ops):
435 print (i, x)
436
437 n = 16
438 vec = list(range(n))
439 levels = n.bit_length() - 1
440 ops = itersum_explore2(vec)
441 for i, x in enumerate(ops):
442 print (i, x)
443
444 # halfrev test
445 vec = list(range(16))
446 print ("orig vec", vec)
447 vecr = halfrev(vec, True)
448 print ("reversed", vecr)
449 for i, v in enumerate(vecr):
450 print ("%2d %2d %04s %04s %04s" % (i, v,
451 bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
452 vecrr = halfrev(vecr, False)
453 assert vec == vecrr
454 vecrr = halfrev(vec, False)
455 print ("pre-reversed", vecrr)
456 for i, v in enumerate(vecrr):
457 print ("%2d %2d %04s %04s %04s" % (i, v,
458 bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
459 il = halfrev2(vec, False)
460 print ("iterative rev", il)
461 il = halfrev2(vec, True)
462 print ("iterative rev-true", il)