create coefficient table for DCT outside of loops
[openpower-isa.git] / src / openpower / decoder / isa / fastdctlee.py
1 #
2 # Fast discrete cosine transform algorithms (Python)
3 #
4 # Copyright (c) 2020 Project Nayuki. (MIT License)
5 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
6 #
7 # License for original fastdctlee.py by Nayuki:
8 #
9 # Permission is hereby granted, free of charge, to any person obtaining
10 # a copy of this software and associated documentation files (the
11 # "Software"), to deal in the Software without restriction, including
12 # without limitation the rights to use, copy, modify, merge, publish,
13 # distribute, sublicense, and/or sell copies of the Software, and to
14 # permit persons to whom the Software is furnished to do so, subject to
15 # the following conditions:
16 # - The above copyright notice and this permission notice shall be included in
17 # all copies or substantial portions of the Software.
18 # - The Software is provided "as is", without warranty of any kind, express or
19 # implied, including but not limited to the warranties of merchantability,
20 # fitness for a particular purpose and noninfringement. In no event shall the
21 # authors or copyright holders be liable for any claim, damages or other
22 # liability, whether in an action of contract, tort or otherwise,
23 # arising from, out of or in connection with the Software or the use
24 # or other dealings in the Software.
25 #
26 #
27 # Modifications made to in-place iterative DCT - SPDX: LGPLv3+
28 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
29 #
30 # The modifications made are firstly to create an iterative schedule,
31 # rather than the more normal recursive algorithm. Secondly, the
32 # two butterflys are also separated out: inner butterfly does COS +/-
33 # whilst outer butterfly does the iterative summing.
34 #
35 # However, to avoid data copying some additional tricks are played:
36 # - firstly, the data is LOADed in bit-reversed order (which is normally
37 # covered by the recursive algorithm due to the odd-even reconstruction)
38 # but then to reference the data in the correct order an array of
39 # bit-reversed indices is created, as a level of indirection.
40 # the data is bit-reversed but so are the indices, making it all A-Ok.
41 # - secondly, normally in DCT a 2nd target (copy) array is used where
42 # the top half is read in reverse order (7 6 5 4) and written out
43 # to the target 4 5 6 7. the plan was to do this in two stages:
44 # write in-place in order 4 5 6 7 then swap afterwards (7-4), (6-5).
45 # the insight then was: to modify the *indirection* indices rather
46 # than swap the actual data, and then try the same trick as was done
47 # with the bit-reversed LOAD. by a bizarre twist of good fortune
48 # *that was not needed*! simply swapping the indices was enough!
49 # End result is that once the first butterfly is done - bear in mind
50 # it's in-place - the data is in the right order so that a second
51 # dead-straightforward iterative sum can be done: again, in-place.
52 # Really surprising.
53
54 import math
55 from copy import deepcopy
56
57 # bits of the integer 'val'.
58 def reverse_bits(val, width):
59 result = 0
60 for _ in range(width):
61 result = (result << 1) | (val & 1)
62 val >>= 1
63 return result
64
65
66 # DCT type II, unscaled. Algorithm by Byeong Gi Lee, 1984.
67 # See: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.118.3056&rep=rep1&type=pdf#page=34
68 # original (recursive) algorithm by Nayuki
69 def transform(vector, indent=0):
70 idt = " " * indent
71 n = len(vector)
72 if n == 1:
73 return list(vector)
74 elif n == 0 or n % 2 != 0:
75 raise ValueError()
76 else:
77 half = n // 2
78 alpha = [(vector[i] + vector[-(i + 1)]) for i in range(half)]
79 beta = [(vector[i] - vector[-(i + 1)]) /
80 (math.cos((i + 0.5) * math.pi / n) * 2.0)
81 for i in range(half)]
82 alpha = transform(alpha)
83 beta = transform(beta )
84 result = []
85 for i in range(half - 1):
86 result.append(alpha[i])
87 result.append(beta[i] + beta[i + 1])
88 result.append(alpha[-1])
89 result.append(beta [-1])
90 return result
91
92
93 # modified (iterative) algorithm by lkcl, based on Nayuki original
94 def transform(vector, indent=0):
95 idt = " " * indent
96 n = len(vector)
97 if n == 1:
98 return list(vector)
99 elif n == 0 or n % 2 != 0:
100 raise ValueError()
101 else:
102 half = n // 2
103 alpha = [0] * half
104 beta = [0] * half
105 print (idt, "xf", vector)
106 print (idt, "coeff", n, "->", end=" ")
107 for i in range(half):
108 t1, t2 = vector[i], vector[n-i-1]
109 k = (math.cos((i + 0.5) * math.pi / n) * 2.0)
110 print (i, n-i-1, "i/n", (i+0.5)/n, ":", k, end= " ")
111 alpha[i] = t1 + t2
112 beta[i] = (t1 - t2) * (1/k)
113 print ()
114 print (idt, "n", n, "alpha", end=" ")
115 for i in range(0, n, 2):
116 print (i, i//2, alpha[i//2], end=" ")
117 print()
118 print (idt, "n", n, "beta", end=" ")
119 for i in range(0, n, 2):
120 print (i, beta[i//2], end=" ")
121 print()
122 alpha = transform(alpha, indent+1)
123 beta = transform(beta , indent+1)
124 result = [0] * n
125 for i in range(half):
126 result[i*2] = alpha[i]
127 result[i*2+1] = beta[i]
128 print(idt, "merge", result)
129 for i in range(half - 1):
130 result[i*2+1] += result[i*2+3]
131 print(idt, "result", result)
132 return result
133
134
135 # totally cool *in-place* DCT algorithm
136 def transform2(vec):
137
138 vec = deepcopy(vec)
139 # Initialization
140 n = len(vec)
141 print ()
142 print ("transform2", n)
143 levels = n.bit_length() - 1
144
145 # reference (read/write) the in-place data in *reverse-bit-order*
146 ri = list(range(n))
147 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
148
149 # and pretend we LDed the data in bit-reversed order as well
150 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
151
152 # create cos coefficient table
153 coeffs = []
154 for ci in range(n):
155 coeffs.append((math.cos((ci + 0.5) * math.pi / n) * 2.0))
156
157 # start the inner butterfly
158 size = n
159 while size >= 2:
160 halfsize = size // 2
161 tablestep = n // size
162 ir = list(range(0, n, size))
163 print (" xform", size, ir)
164 for i in ir:
165 k = 0
166 j = list(range(i, i + halfsize))
167 jr = list(range(i+halfsize, i + size))
168 jr.reverse()
169 print (" xform jr", j, jr)
170 for ci, (jl, jh) in enumerate(zip(j, jr)):
171 t1, t2 = vec[ri[jl]], vec[ri[jh]]
172 # normally DCT would use jl+halfsize not jh, here.
173 # to be able to work in-place, the idea is to perform a
174 # high-half reverse/swap afterwards. however actually
175 # we swap the *indices*
176 coeff = coeffs[k]
177 vec[ri[jl]] = t1 + t2
178 vec[ri[jh]] = (t1 - t2) * (1/coeff)
179 print (" ", size, i, k, "ci", ci,
180 "jl", ri[jl], "jh", ri[jh],
181 "i/n", (k+0.5)/size, coeff, vec[ri[jl]], vec[ri[jh]])
182 k += tablestep
183 # instead of using jl+halfsize, perform a swap here.
184 # use half of j/jr because actually jl+halfsize = reverse(j)
185 # actually: swap the *indices*... not the actual data.
186 # incredibly... bizarrely... this works *without* having
187 # to do anything else.
188 hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
189 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
190 tmp = ri[jl+halfsize]
191 ri[jl+halfsize] = ri[jh]
192 ri[jh] = tmp
193 print (" swap", size, i, ri[jl+halfsize], ri[jh])
194 size //= 2
195
196 print("post-swapped", ri)
197 print("transform2 pre-itersum", vec)
198
199 n = len(vec)
200 size = n // 2
201 while size >= 2:
202 halfsize = size // 2
203 ir = list(range(0, halfsize))
204 print ("itersum", halfsize, size, ir)
205 for i in ir:
206 jr = list(range(i+halfsize, i+n-halfsize, size))
207 print ("itersum jr", i+halfsize, i+size, jr)
208 for jh in jr:
209 vec[jh] += vec[jh+size]
210 print (" itersum", size, i, jh, jh+size)
211 size //= 2
212
213 print("transform2 result", vec)
214
215 return vec
216
217
218 # DCT type III, unscaled. Algorithm by Byeong Gi Lee, 1984.
219 # See: https://www.nayuki.io/res/fast-discrete-cosine-transform-algorithms/lee-new-algo-discrete-cosine-transform.pdf
220 def inverse_transform(vector, root=True, indent=0):
221 idt = " " * indent
222 if root:
223 vector = list(vector)
224 vector[0] /= 2
225 n = len(vector)
226 if n == 1:
227 return vector, [0]
228 elif n == 0 or n % 2 != 0:
229 raise ValueError()
230 else:
231 half = n // 2
232 alpha = [vector[0]]
233 beta = [vector[1]]
234 for i in range(2, n, 2):
235 alpha.append(vector[i])
236 beta.append(vector[i - 1] + vector[i + 1])
237 print (idt, "n", n, "alpha 0", end=" ")
238 for i in range(2, n, 2):
239 print (i, end=" ")
240 print ("beta 1", end=" ")
241 for i in range(2, n, 2):
242 print ("%d+%d" % (i-1, i+1), end=" ")
243 print()
244 inverse_transform(alpha, False, indent+1)
245 inverse_transform(beta , False, indent+1)
246 for i in range(half):
247 x = alpha[i]
248 y = beta[i] / (math.cos((i + 0.5) * math.pi / n) * 2)
249 vector[i] = x + y
250 vector[-(i + 1)] = x - y
251 print (idt, " v[%d] = alpha[%d]+beta[%d]" % (i, i, i))
252 print (idt, " v[%d] = alpha[%d]-beta[%d]" % (n-i-1, i, i))
253 return vector
254
255
256 def inverse_transform2(vector, root=True):
257 n = len(vector)
258 if root:
259 vector = list(vector)
260 if n == 1:
261 return vector
262 elif n == 0 or n % 2 != 0:
263 raise ValueError()
264 else:
265 half = n // 2
266 alpha = [0]
267 beta = [1]
268 for i in range(2, n, 2):
269 alpha.append(i)
270 beta.append(("add", i - 1, i + 1))
271 inverse_transform2(alpha, False)
272 inverse_transform2(beta , False)
273 for i in range(half):
274 x = alpha[i]
275 y = ("cos", beta[i], i)
276 vector[i] = ("add", x, y)
277 vector[-(i + 1)] = ("sub", x, y)
278 return vector
279
280
281 # does the outer butterfly in a recursive fashion, used in an
282 # intermediary development of the in-place DCT.
283 def transform_itersum(vector, indent=0):
284 idt = " " * indent
285 n = len(vector)
286 if n == 1:
287 return list(vector)
288 elif n == 0 or n % 2 != 0:
289 raise ValueError()
290 else:
291 half = n // 2
292 alpha = [0] * half
293 beta = [0] * half
294 for i in range(half):
295 t1, t2 = vector[i], vector[i+half]
296 alpha[i] = t1
297 beta[i] = t2
298 alpha = transform_itersum(alpha, indent+1)
299 beta = transform_itersum(beta , indent+1)
300 result = [0] * n
301 for i in range(half):
302 result[i*2] = alpha[i]
303 result[i*2+1] = beta[i]
304 print(idt, "iter-merge", result)
305 for i in range(half - 1):
306 result[i*2+1] += result[i*2+3]
307 print(idt, "iter-result", result)
308 return result
309
310
311 # prints out an "add" schedule for the outer butterfly, recursively,
312 # matching what transform_itersum does.
313 def itersum_explore(vector, indent=0):
314 idt = " " * indent
315 n = len(vector)
316 if n == 1:
317 return list(vector)
318 elif n == 0 or n % 2 != 0:
319 raise ValueError()
320 else:
321 half = n // 2
322 alpha = [0] * half
323 beta = [0] * half
324 for i in range(half):
325 t1, t2 = vector[i], vector[i+half]
326 alpha[i] = t1
327 beta[i] = t2
328 alpha = itersum_explore(alpha, indent+1)
329 beta = itersum_explore(beta , indent+1)
330 result = [0] * n
331 for i in range(half):
332 result[i*2] = alpha[i]
333 result[i*2+1] = beta[i]
334 print(idt, "iter-merge", result)
335 for i in range(half - 1):
336 result[i*2+1] = ("add", result[i*2+1], result[i*2+3])
337 print(idt, "iter-result", result)
338 return result
339
340
341 # prints out the exact same outer butterfly but does so iteratively.
342 # by comparing the output from itersum_explore and itersum_explore2
343 # and by drawing out the resultant ADDs as a graph it was possible
344 # to deduce what the heck was going on.
345 def itersum_explore2(vec, indent=0):
346 n = len(vec)
347 size = n // 2
348 while size >= 2:
349 halfsize = size // 2
350 ir = list(range(0, halfsize))
351 print ("itersum", halfsize, size, ir)
352 for i in ir:
353 jr = list(range(i+halfsize, i+n-halfsize, size))
354 print ("itersum jr", i+halfsize, i+size, jr)
355 for jh in jr:
356 vec[jh] = ("add", vec[jh], vec[jh+size])
357 print (" itersum", size, i, jh, jh+size)
358 size //= 2
359
360 return vec
361
362 if __name__ == '__main__':
363 n = 16
364 vec = list(range(n))
365 levels = n.bit_length() - 1
366 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
367 ops = itersum_explore(vec)
368 for i, x in enumerate(ops):
369 print (i, x)
370
371 n = 16
372 vec = list(range(n))
373 levels = n.bit_length() - 1
374 ops = itersum_explore2(vec)
375 for i, x in enumerate(ops):
376 print (i, x)