get byte-swapping functional in inverse-dct proof-of-concept
[openpower-isa.git] / src / openpower / decoder / isa / fastdctlee.py
1 #
2 # Fast discrete cosine transform algorithm (Python)
3 #
4 # Modifications made to create an in-place iterative DCT:
5 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
6 #
7 # License for modifications - SPDX: LGPLv3+
8 #
9 # Original fastdctlee.py by Nayuki:
10 # Copyright (c) 2020 Project Nayuki. (MIT License)
11 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
12 #
13 # License for original fastdctlee.py by Nayuki:
14 #
15 # Permission is hereby granted, free of charge, to any person obtaining
16 # a copy of this software and associated documentation files (the
17 # "Software"), to deal in the Software without restriction, including
18 # without limitation the rights to use, copy, modify, merge, publish,
19 # distribute, sublicense, and/or sell copies of the Software, and to
20 # permit persons to whom the Software is furnished to do so, subject to
21 # the following conditions:
22 # - The above copyright notice and this permission notice shall be included in
23 # all copies or substantial portions of the Software.
24 # - The Software is provided "as is", without warranty of any kind, express or
25 # implied, including but not limited to the warranties of merchantability,
26 # fitness for a particular purpose and noninfringement. In no event shall the
27 # authors or copyright holders be liable for any claim, damages or other
28 # liability, whether in an action of contract, tort or otherwise,
29 # arising from, out of or in connection with the Software or the use
30 # or other dealings in the Software.
31 #
32 #
33 # The modifications made are firstly to create an iterative schedule,
34 # rather than the more normal recursive algorithm. Secondly, the
35 # two butterflys are also separated out: inner butterfly does COS +/-
36 # whilst outer butterfly does the iterative summing.
37 #
38 # However, to avoid data copying some additional tricks are played:
39 # - firstly, the data is LOADed in bit-reversed order (which is normally
40 # covered by the recursive algorithm due to the odd-even reconstruction)
41 # but then to reference the data in the correct order an array of
42 # bit-reversed indices is created, as a level of indirection.
43 # the data is bit-reversed but so are the indices, making it all A-Ok.
44 # - secondly, normally in DCT a 2nd target (copy) array is used where
45 # the top half is read in reverse order (7 6 5 4) and written out
46 # to the target 4 5 6 7. the plan was to do this in two stages:
47 # write in-place in order 4 5 6 7 then swap afterwards (7-4), (6-5).
48 # however by leaving the data *in-place* and having subsequent
49 # loops refer to the data *where it now is*, the swap is avoided
50 # - thirdly, arrange for the data to be *pre-swapped* (in an inverse
51 # order of how it would have got there, if that makes sense), such
52 # that *when* it gets swapped, it ends up in the right order.
53 # given that that will be a LD operation it's no big deal.
54 #
55 # End result is that once the first butterfly is done - bear in mind
56 # it's in-place - the data is in the right order so that a second
57 # dead-straightforward iterative sum can be done: again, in-place.
58 # Really surprising.
59
60 import math
61 from copy import deepcopy
62
63 # bits of the integer 'val'.
64 def reverse_bits(val, width):
65 result = 0
66 for _ in range(width):
67 result = (result << 1) | (val & 1)
68 val >>= 1
69 return result
70
71
72 # reverse top half of a list, recursively. the recursion can be
73 # applied *after* or *before* the reversal of the top half. these
74 # are inverses of each other.
75 # this function is unused except to test the iterative version (halfrev2)
76 def halfrev(l, pre_rev=True):
77 n = len(l)
78 if n == 1:
79 return l
80 ll, lh = l[:n//2], l[n//2:]
81 if pre_rev:
82 ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
83 lh.reverse()
84 if not pre_rev:
85 ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
86 return ll + lh
87
88
89 # iterative version of [recursively-applied] half-rev.
90 # relies on the list lengths being power-of-two and the fact
91 # that bit-inversion of a list of binary numbers is the same
92 # as reversing the order of the list
93 # this version is dead easy to implement in hardware.
94 # a big surprise is that the half-reversal can be done with
95 # such a simple XOR. the inverse operation is slightly trickier
96 def halfrev2(vec, pre_rev=True):
97 res = []
98 for i in range(len(vec)):
99 if pre_rev:
100 res.append(vec[i ^ (i>>1)])
101 else:
102 ri = i
103 bl = i.bit_length()
104 for ji in range(1, bl):
105 ri ^= (i >> ji)
106 res.append(vec[ri])
107 return res
108
109
110 # DCT type II, unscaled. Algorithm by Byeong Gi Lee, 1984.
111 # See: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.118.3056&rep=rep1&type=pdf#page=34
112 # original (recursive) algorithm by Nayuki
113 def transform(vector, indent=0):
114 idt = " " * indent
115 n = len(vector)
116 if n == 1:
117 return list(vector)
118 elif n == 0 or n % 2 != 0:
119 raise ValueError()
120 else:
121 half = n // 2
122 alpha = [(vector[i] + vector[-(i + 1)]) for i in range(half)]
123 beta = [(vector[i] - vector[-(i + 1)]) /
124 (math.cos((i + 0.5) * math.pi / n) * 2.0)
125 for i in range(half)]
126 alpha = transform(alpha)
127 beta = transform(beta )
128 result = []
129 for i in range(half - 1):
130 result.append(alpha[i])
131 result.append(beta[i] + beta[i + 1])
132 result.append(alpha[-1])
133 result.append(beta [-1])
134 return result
135
136
137 # modified recursive algorithm, based on Nayuki original, which simply
138 # prints out an awful lot of debug data. used to work out the ordering
139 # for the iterative version by analysing the indices printed out
140 def transform(vector, indent=0):
141 idt = " " * indent
142 n = len(vector)
143 if n == 1:
144 return list(vector)
145 elif n == 0 or n % 2 != 0:
146 raise ValueError()
147 else:
148 half = n // 2
149 alpha = [0] * half
150 beta = [0] * half
151 print (idt, "xf", vector)
152 print (idt, "coeff", n, "->", end=" ")
153 for i in range(half):
154 t1, t2 = vector[i], vector[n-i-1]
155 k = (math.cos((i + 0.5) * math.pi / n) * 2.0)
156 print (i, n-i-1, "i/n", (i+0.5)/n, ":", k, end= " ")
157 alpha[i] = t1 + t2
158 beta[i] = (t1 - t2) * (1/k)
159 print ()
160 print (idt, "n", n, "alpha", end=" ")
161 for i in range(0, n, 2):
162 print (i, i//2, alpha[i//2], end=" ")
163 print()
164 print (idt, "n", n, "beta", end=" ")
165 for i in range(0, n, 2):
166 print (i, beta[i//2], end=" ")
167 print()
168 alpha = transform(alpha, indent+1)
169 beta = transform(beta , indent+1)
170 result = [0] * n
171 for i in range(half):
172 result[i*2] = alpha[i]
173 result[i*2+1] = beta[i]
174 print(idt, "merge", result)
175 for i in range(half - 1):
176 result[i*2+1] += result[i*2+3]
177 print(idt, "result", result)
178 return result
179
180
181 # totally cool *in-place* DCT algorithm
182 def transform2(vec):
183
184 # Initialization
185 n = len(vec)
186 print ()
187 print ("transform2", n)
188 levels = n.bit_length() - 1
189
190 # reference (read/write) the in-place data in *reverse-bit-order*
191 ri = list(range(n))
192 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
193
194 # reference list for not needing to do data-swaps, just swap what
195 # *indices* are referenced (two levels of indirection at the moment)
196 # pre-reverse the data-swap list so that it *ends up* in the order 0123..
197 ji = list(range(n))
198 ji = halfrev2(ji, True)
199
200 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
201 # TODO: merge these two
202 vec = halfrev2(vec, False)
203 vec = [vec[ri[i]] for i in range(n)]
204
205 print ("ri", ri)
206 print ("ji", ji)
207
208 # create a cos table: not strictly necessary but here for illustrative
209 # purposes, to demonstrate the point that it really *is* iterative.
210 # this table could be cached and used multiple times rather than
211 # computed every time.
212 ctable = []
213 size = n
214 while size >= 2:
215 halfsize = size // 2
216 for i in range(n//size):
217 for ci in range(halfsize):
218 ctable.append((math.cos((ci + 0.5) * math.pi / size) * 2.0))
219 size //= 2
220
221 # start the inner butterfly
222 size = n
223 k = 0
224 while size >= 2:
225 halfsize = size // 2
226 tablestep = n // size
227 ir = list(range(0, n, size))
228 print (" xform", size, ir)
229 for i in ir:
230 # two lists of half-range indices, e.g. j 0123, jr 7654
231 j = list(range(i, i + halfsize))
232 jr = list(range(i+halfsize, i + size))
233 jr.reverse()
234 print (" xform jr", j, jr)
235 for ci, (jl, jh) in enumerate(zip(j, jr)):
236 t1, t2 = vec[ri[ji[jl]]], vec[ri[ji[jh]]]
237 #coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
238 coeff = ctable[k]
239 k += 1
240 # normally DCT would use jl+halfsize not jh, here.
241 # to be able to work in-place, the idea is to perform a
242 # swap afterwards.
243 vec[ri[ji[jl]]] = t1 + t2
244 vec[ri[ji[jh]]] = (t1 - t2) * (1/coeff)
245 print ("coeff", size, i, "ci", ci,
246 "jl", ri[ji[jl]], "jh", ri[ji[jh]],
247 "i/n", (ci+0.5)/size, coeff, vec[ri[ji[jl]]],
248 vec[ri[ji[jh]]])
249 # instead of using jl+halfsize, perform a swap here.
250 # use half of j/jr because actually jl+halfsize = reverse(j)
251 hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
252 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
253 jlh = jl+halfsize
254 # swap indices, NOT the data
255 tmp1, tmp2 = ji[jlh], ji[jh]
256 ji[jlh], ji[jh] = tmp2, tmp1
257 print (" swap", size, i, ji[jlh], ji[jh])
258 size //= 2
259
260 print("post-swapped", ri)
261 print("ji-swapped", ji)
262 print("transform2 pre-itersum", vec)
263
264 # now things are in the right order for the outer butterfly.
265 n = len(vec)
266 size = n // 2
267 while size >= 2:
268 halfsize = size // 2
269 ir = list(range(0, halfsize))
270 print ("itersum", halfsize, size, ir)
271 for i in ir:
272 jr = list(range(i+halfsize, i+n-halfsize, size))
273 print ("itersum jr", i+halfsize, i+size, jr)
274 for jh in jr:
275 vec[jh] += vec[jh+size]
276 print (" itersum", size, i, jh, jh+size)
277 size //= 2
278
279 print("transform2 result", vec)
280
281 return vec
282
283
284 # DCT type III, unscaled. Algorithm by Byeong Gi Lee, 1984.
285 # See: https://www.nayuki.io/res/fast-discrete-cosine-transform-algorithms/lee-new-algo-discrete-cosine-transform.pdf
286 def inverse_transform(vector, root=True, indent=0):
287 idt = " " * indent
288 if root:
289 vector = list(vector)
290 vector[0] /= 2
291 n = len(vector)
292 if n == 1:
293 return vector, [0]
294 elif n == 0 or n % 2 != 0:
295 raise ValueError()
296 else:
297 half = n // 2
298 alpha = [vector[0]]
299 beta = [vector[1]]
300 for i in range(2, n, 2):
301 alpha.append(vector[i])
302 beta.append(vector[i - 1] + vector[i + 1])
303 print (idt, "n", n, "alpha 0", end=" ")
304 for i in range(2, n, 2):
305 print (i, end=" ")
306 print ("beta 1", end=" ")
307 for i in range(2, n, 2):
308 print ("%d+%d" % (i-1, i+1), end=" ")
309 print()
310 inverse_transform(alpha, False, indent+1)
311 inverse_transform(beta , False, indent+1)
312 for i in range(half):
313 x, y = alpha[i], beta[i]
314 coeff = (math.cos((i + 0.5) * math.pi / n) * 2)
315 y /= coeff
316 vector[i] = x + y
317 vector[n-(i+1)] = x - y
318 print (idt, " v[%d] = alpha[%d]+beta[%d]" % (i, i, i))
319 print (idt, " v[%d] = alpha[%d]-beta[%d]" % (n-i-1, i, i))
320 return vector
321
322
323 # totally cool *in-place* DCT algorithm
324 def inverse_transform_iter(vec):
325
326 # Initialization
327 n = len(vec)
328 print ()
329 print ("transform2 inv", n, vec)
330 levels = n.bit_length() - 1
331
332 # reference (read/write) the in-place data in *reverse-bit-order*
333 ri = list(range(n))
334 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
335
336 # reference list for not needing to do data-swaps, just swap what
337 # *indices* are referenced (two levels of indirection at the moment)
338 # pre-reverse the data-swap list so that it *ends up* in the order 0123..
339 ji = list(range(n))
340 ji = halfrev2(ji, False)
341
342 print ("ri", ri)
343 print ("ji", ji)
344
345 # create a cos table: not strictly necessary but here for illustrative
346 # purposes, to demonstrate the point that it really *is* iterative.
347 # this table could be cached and used multiple times rather than
348 # computed every time.
349 ctable = []
350 size = n
351 while size >= 2:
352 halfsize = size // 2
353 for i in range(n//size):
354 for ci in range(halfsize):
355 ctable.append((math.cos((ci + 0.5) * math.pi / size) * 2.0))
356 size //= 2
357
358 # first divide element 0 by 2
359 vec[0] /= 2.0
360
361 print("transform2-inv pre-itersum", vec)
362 #vec = halfrev2(vec, True)
363 #print("transform2-inv post-itersum-reorder", vec)
364
365 # first the outer butterfly (iterative sum thing)
366 n = len(vec)
367 size = 2
368 while size <= n:
369 halfsize = size // 2
370 ir = list(range(0, halfsize))
371 print ("itersum", halfsize, size, ir)
372 for i in ir:
373 jr = list(range(i+halfsize, i+n-halfsize, size))
374 jr.reverse()
375 print ("itersum jr", i+halfsize, i+size, jr)
376 for jh in jr:
377 #x = vec[ji[jh]]
378 #y = vec[ji[jh+size]]
379 #vec[ji[jh+size]] = x + y
380 x = vec[jh]
381 y = vec[jh+size]
382 vec[jh+size] = x + y
383 print (" itersum", size, i, jh, jh+size,
384 x, y, "jh+sz", vec[ji[jh+size]])
385 size *= 2
386
387 print("transform2-inv post-itersum", vec)
388
389 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
390 # TODO: merge these two
391 vec = [vec[ri[i]] for i in range(n)]
392 vec = halfrev2(vec, True)
393 ri = list(range(n))
394
395 print("transform2-inv post-reorder", vec)
396
397 # start the inner butterfly (coefficients)
398 size = 2
399 k = 0
400 while size <= n:
401 halfsize = size // 2
402 tablestep = n // size
403 ir = list(range(0, n, size))
404 print (" xform", size, ir)
405 for i in ir:
406 # two lists of half-range indices, e.g. j 0123, jr 7654
407 j = list(range(i, i + halfsize))
408 jr = list(range(i+halfsize, i + size))
409 jr.reverse()
410 print (" xform jr", j, jr)
411 for ci, (jl, jh) in enumerate(zip(j, jr)):
412 #t1, t2 = vec[ri[ji[jl]]], vec[ri[ji[jh]]]
413 t1, t2 = vec[ji[jl]], vec[ji[jl+halfsize]]
414 coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
415 #coeff = ctable[k]
416 k += 1
417 # normally DCT would use jl+halfsize not jh, here.
418 # to be able to work in-place, the idea is to perform a
419 # swap afterwards.
420 #vec[ri[ji[jl]]] = t1 + t2/coeff
421 #vec[ri[ji[jh]]] = t1 - t2/coeff
422 vec[ji[jl]] = t1 + t2/coeff
423 vec[ji[jl+halfsize]] = t1 - t2/coeff
424 print ("coeff", size, i, "ci", ci,
425 "jl", ri[ji[jl]], "jh", ri[ji[jh]],
426 "i/n", (ci+0.5)/size, coeff,
427 "t1,t2", t1, t2,
428 "+/i", vec[ji[jl]], vec[ji[jh]])
429 #"+/i", vec2[ri[ji[jl]]], vec2[ri[ji[jh]]])
430 # instead of using jl+halfsize, perform a swap here.
431 # use half of j/jr because actually jl+halfsize = reverse(j)
432 hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
433 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
434 jlh = jl+halfsize
435 # swap indices, NOT the data
436 tmp1, tmp2 = ji[jlh], ji[jh]
437 ji[jlh], ji[jh] = tmp2, tmp1
438 print (" swap", size, i, ji[jlh], ji[jh])
439 size *= 2
440
441 print("post-swapped", ri)
442 print("ji-swapped", ji)
443 ji = list(range(n))
444 ji = halfrev2(ji, True)
445 print("ji-calc ", ji)
446
447 print("transform2-inv result", vec)
448
449 return vec
450
451
452 def inverse_transform2(vector, root=True, indent=0):
453 idt = " " * indent
454 n = len(vector)
455 if root:
456 vector = list(vector)
457 vector[0] /= 2
458 if n == 1:
459 return vector
460 elif n == 0 or n % 2 != 0:
461 raise ValueError()
462 else:
463 print (idt, "inverse_xform2", vector)
464 half = n // 2
465 alpha = [vector[0]]
466 beta = [vector[1]]
467 for i in range(2, n, 2):
468 alpha.append(vector[i])
469 beta.append(vector[i - 1] + vector[i + 1])
470 print (idt, " alpha", i, vector[i])
471 print (idt, " beta", i-1, i+1, vector[i-1], vector[i+1], "->",
472 beta[-1])
473 inverse_transform2(alpha, False, indent+1)
474 inverse_transform2(beta , False, indent+1)
475 for i in range(half):
476 x, y = alpha[i], beta[i]
477 coeff = (math.cos((i + 0.5) * math.pi / n) * 2)
478 vector[i] = x + y / coeff
479 vector[n-(i+1)] = x - y / coeff
480 print (idt, " v[%d] = %f+%f/%f=%f" % (i, x, y, coeff, vector[i]))
481 print (idt, " v[%d] = %f-%f/%f=%f" % (n-i-1, x, y,
482 coeff, vector[n-i-1]))
483 return vector
484
485
486 def inverse_transform2_explore(vector, root=True, indent=0):
487 n = len(vector)
488 if root:
489 vector = list(vector)
490 if n == 1:
491 return vector
492 elif n == 0 or n % 2 != 0:
493 raise ValueError()
494 else:
495 half = n // 2
496 alpha = [vector[0]]
497 beta = [vector[1]]
498 for i in range(2, n, 2):
499 alpha.append(vector[i])
500 beta.append(("add%d" % indent, vector[i - 1], vector[i + 1]))
501 inverse_transform2_explore(alpha, False, indent+1)
502 inverse_transform2_explore(beta , False, indent+1)
503 for i in range(half):
504 x = alpha[i]
505 y = ("cos%d" % indent, beta[i], i, n)
506 vector[i] = ("add%d" % indent, x, y)
507 vector[n-(i + 1)] = ("sub%d" % indent, x, y)
508 return vector
509
510
511
512 # does the outer butterfly in a recursive fashion, used in an
513 # intermediary development of the in-place DCT.
514 def transform_itersum(vector, indent=0):
515 idt = " " * indent
516 n = len(vector)
517 if n == 1:
518 return list(vector)
519 elif n == 0 or n % 2 != 0:
520 raise ValueError()
521 else:
522 half = n // 2
523 alpha = [0] * half
524 beta = [0] * half
525 for i in range(half):
526 t1, t2 = vector[i], vector[i+half]
527 alpha[i] = t1
528 beta[i] = t2
529 alpha = transform_itersum(alpha, indent+1)
530 beta = transform_itersum(beta , indent+1)
531 result = [0] * n
532 for i in range(half):
533 result[i*2] = alpha[i]
534 result[i*2+1] = beta[i]
535 print(idt, "iter-merge", result)
536 for i in range(half - 1):
537 result[i*2+1] += result[i*2+3]
538 print(idt, "iter-result", result)
539 return result
540
541
542 # prints out an "add" schedule for the outer butterfly, recursively,
543 # matching what transform_itersum does.
544 def itersum_explore(vector, indent=0):
545 idt = " " * indent
546 n = len(vector)
547 if n == 1:
548 return list(vector)
549 elif n == 0 or n % 2 != 0:
550 raise ValueError()
551 else:
552 half = n // 2
553 alpha = [0] * half
554 beta = [0] * half
555 for i in range(half):
556 t1, t2 = vector[i], vector[i+half]
557 alpha[i] = t1
558 beta[i] = t2
559 alpha = itersum_explore(alpha, indent+1)
560 beta = itersum_explore(beta , indent+1)
561 result = [0] * n
562 for i in range(half):
563 result[i*2] = alpha[i]
564 result[i*2+1] = beta[i]
565 print(idt, "iter-merge", result)
566 for i in range(half - 1):
567 result[i*2+1] = ("add", result[i*2+1], result[i*2+3])
568 print(idt, "iter-result", result)
569 return result
570
571
572 # prints out the exact same outer butterfly but does so iteratively.
573 # by comparing the output from itersum_explore and itersum_explore2
574 # and by drawing out the resultant ADDs as a graph it was possible
575 # to deduce what the heck was going on.
576 def itersum_explore2(vec, indent=0):
577 n = len(vec)
578 size = n // 2
579 while size >= 2:
580 halfsize = size // 2
581 ir = list(range(0, halfsize))
582 print ("itersum", halfsize, size, ir)
583 for i in ir:
584 jr = list(range(i+halfsize, i+n-halfsize, size))
585 print ("itersum jr", i+halfsize, i+size, jr)
586 for jh in jr:
587 vec[jh] = ("add", vec[jh], vec[jh+size])
588 print (" itersum", size, i, jh, jh+size)
589 size //= 2
590
591 return vec
592
593 if __name__ == '__main__':
594 n = 16
595 vec = list(range(n))
596 levels = n.bit_length() - 1
597 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
598 ops = itersum_explore(vec)
599 for i, x in enumerate(ops):
600 print (i, x)
601
602 n = 16
603 vec = list(range(n))
604 levels = n.bit_length() - 1
605 ops = itersum_explore2(vec)
606 for i, x in enumerate(ops):
607 print (i, x)
608
609 # halfrev test
610 vec = list(range(16))
611 print ("orig vec", vec)
612 vecr = halfrev(vec, True)
613 print ("reversed", vecr)
614 for i, v in enumerate(vecr):
615 print ("%2d %2d %04s %04s %04s" % (i, v,
616 bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
617 vecrr = halfrev(vecr, False)
618 assert vec == vecrr
619 vecrr = halfrev(vec, False)
620 print ("pre-reversed", vecrr)
621 for i, v in enumerate(vecrr):
622 print ("%2d %2d %04s %04s %04s" % (i, v,
623 bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
624 il = halfrev2(vec, False)
625 print ("iterative rev", il)
626 il = halfrev2(vec, True)
627 print ("iterative rev-true", il)
628
629 n = 4
630 vec = list(range(n))
631 levels = n.bit_length() - 1
632 ops = inverse_transform2_explore(vec)
633 for i, x in enumerate(ops):
634 print (i, x)
635