move half-reverse to before outer butterfly in I-DCT
[openpower-isa.git] / src / openpower / decoder / isa / fastdctlee.py
1 #
2 # Fast discrete cosine transform algorithm (Python)
3 #
4 # Modifications made to create an in-place iterative DCT:
5 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
6 #
7 # License for modifications - SPDX: LGPLv3+
8 #
9 # Original fastdctlee.py by Nayuki:
10 # Copyright (c) 2020 Project Nayuki. (MIT License)
11 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
12 #
13 # License for original fastdctlee.py by Nayuki:
14 #
15 # Permission is hereby granted, free of charge, to any person obtaining
16 # a copy of this software and associated documentation files (the
17 # "Software"), to deal in the Software without restriction, including
18 # without limitation the rights to use, copy, modify, merge, publish,
19 # distribute, sublicense, and/or sell copies of the Software, and to
20 # permit persons to whom the Software is furnished to do so, subject to
21 # the following conditions:
22 # - The above copyright notice and this permission notice shall be included in
23 # all copies or substantial portions of the Software.
24 # - The Software is provided "as is", without warranty of any kind, express or
25 # implied, including but not limited to the warranties of merchantability,
26 # fitness for a particular purpose and noninfringement. In no event shall the
27 # authors or copyright holders be liable for any claim, damages or other
28 # liability, whether in an action of contract, tort or otherwise,
29 # arising from, out of or in connection with the Software or the use
30 # or other dealings in the Software.
31 #
32 #
33 # The modifications made are firstly to create an iterative schedule,
34 # rather than the more normal recursive algorithm. Secondly, the
35 # two butterflys are also separated out: inner butterfly does COS +/-
36 # whilst outer butterfly does the iterative summing.
37 #
38 # However, to avoid data copying some additional tricks are played:
39 # - firstly, the data is LOADed in bit-reversed order (which is normally
40 # covered by the recursive algorithm due to the odd-even reconstruction)
41 # but then to reference the data in the correct order an array of
42 # bit-reversed indices is created, as a level of indirection.
43 # the data is bit-reversed but so are the indices, making it all A-Ok.
44 # - secondly, normally in DCT a 2nd target (copy) array is used where
45 # the top half is read in reverse order (7 6 5 4) and written out
46 # to the target 4 5 6 7. the plan was to do this in two stages:
47 # write in-place in order 4 5 6 7 then swap afterwards (7-4), (6-5).
48 # however by leaving the data *in-place* and having subsequent
49 # loops refer to the data *where it now is*, the swap is avoided
50 # - thirdly, arrange for the data to be *pre-swapped* (in an inverse
51 # order of how it would have got there, if that makes sense), such
52 # that *when* it gets swapped, it ends up in the right order.
53 # given that that will be a LD operation it's no big deal.
54 #
55 # End result is that once the first butterfly is done - bear in mind
56 # it's in-place - the data is in the right order so that a second
57 # dead-straightforward iterative sum can be done: again, in-place.
58 # Really surprising.
59
60 import math
61 from copy import deepcopy
62
63 # bits of the integer 'val'.
64 def reverse_bits(val, width):
65 result = 0
66 for _ in range(width):
67 result = (result << 1) | (val & 1)
68 val >>= 1
69 return result
70
71
72 # reverse top half of a list, recursively. the recursion can be
73 # applied *after* or *before* the reversal of the top half. these
74 # are inverses of each other.
75 # this function is unused except to test the iterative version (halfrev2)
76 def halfrev(l, pre_rev=True):
77 n = len(l)
78 if n == 1:
79 return l
80 ll, lh = l[:n//2], l[n//2:]
81 if pre_rev:
82 ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
83 lh.reverse()
84 if not pre_rev:
85 ll, lh = halfrev(ll, pre_rev), halfrev(lh, pre_rev)
86 return ll + lh
87
88
89 # iterative version of [recursively-applied] half-rev.
90 # relies on the list lengths being power-of-two and the fact
91 # that bit-inversion of a list of binary numbers is the same
92 # as reversing the order of the list
93 # this version is dead easy to implement in hardware.
94 # a big surprise is that the half-reversal can be done with
95 # such a simple XOR. the inverse operation is slightly trickier
96 def halfrev2(vec, pre_rev=True):
97 res = []
98 for i in range(len(vec)):
99 if pre_rev:
100 res.append(vec[i ^ (i>>1)])
101 else:
102 ri = i
103 bl = i.bit_length()
104 for ji in range(1, bl):
105 ri ^= (i >> ji)
106 res.append(vec[ri])
107 return res
108
109
110 # DCT type II, unscaled. Algorithm by Byeong Gi Lee, 1984.
111 # See: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.118.3056&rep=rep1&type=pdf#page=34
112 # original (recursive) algorithm by Nayuki
113 def transform(vector, indent=0):
114 idt = " " * indent
115 n = len(vector)
116 if n == 1:
117 return list(vector)
118 elif n == 0 or n % 2 != 0:
119 raise ValueError()
120 else:
121 half = n // 2
122 alpha = [(vector[i] + vector[-(i + 1)]) for i in range(half)]
123 beta = [(vector[i] - vector[-(i + 1)]) /
124 (math.cos((i + 0.5) * math.pi / n) * 2.0)
125 for i in range(half)]
126 alpha = transform(alpha)
127 beta = transform(beta )
128 result = []
129 for i in range(half - 1):
130 result.append(alpha[i])
131 result.append(beta[i] + beta[i + 1])
132 result.append(alpha[-1])
133 result.append(beta [-1])
134 return result
135
136
137 # modified recursive algorithm, based on Nayuki original, which simply
138 # prints out an awful lot of debug data. used to work out the ordering
139 # for the iterative version by analysing the indices printed out
140 def transform(vector, indent=0):
141 idt = " " * indent
142 n = len(vector)
143 if n == 1:
144 return list(vector)
145 elif n == 0 or n % 2 != 0:
146 raise ValueError()
147 else:
148 half = n // 2
149 alpha = [0] * half
150 beta = [0] * half
151 print (idt, "xf", vector)
152 print (idt, "coeff", n, "->", end=" ")
153 for i in range(half):
154 t1, t2 = vector[i], vector[n-i-1]
155 k = (math.cos((i + 0.5) * math.pi / n) * 2.0)
156 print (i, n-i-1, "i/n", (i+0.5)/n, ":", k, end= " ")
157 alpha[i] = t1 + t2
158 beta[i] = (t1 - t2) * (1/k)
159 print ()
160 print (idt, "n", n, "alpha", end=" ")
161 for i in range(0, n, 2):
162 print (i, i//2, alpha[i//2], end=" ")
163 print()
164 print (idt, "n", n, "beta", end=" ")
165 for i in range(0, n, 2):
166 print (i, beta[i//2], end=" ")
167 print()
168 alpha = transform(alpha, indent+1)
169 beta = transform(beta , indent+1)
170 result = [0] * n
171 for i in range(half):
172 result[i*2] = alpha[i]
173 result[i*2+1] = beta[i]
174 print(idt, "merge", result)
175 for i in range(half - 1):
176 result[i*2+1] += result[i*2+3]
177 print(idt, "result", result)
178 return result
179
180
181 # totally cool *in-place* DCT algorithm
182 def transform2(vec):
183
184 # Initialization
185 n = len(vec)
186 print ()
187 print ("transform2", n)
188 levels = n.bit_length() - 1
189
190 # reference (read/write) the in-place data in *reverse-bit-order*
191 ri = list(range(n))
192 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
193
194 # reference list for not needing to do data-swaps, just swap what
195 # *indices* are referenced (two levels of indirection at the moment)
196 # pre-reverse the data-swap list so that it *ends up* in the order 0123..
197 ji = list(range(n))
198 ji = halfrev2(ji, True)
199
200 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
201 # TODO: merge these two
202 vec = halfrev2(vec, False)
203 vec = [vec[ri[i]] for i in range(n)]
204
205 print ("ri", ri)
206 print ("ji", ji)
207
208 # create a cos table: not strictly necessary but here for illustrative
209 # purposes, to demonstrate the point that it really *is* iterative.
210 # this table could be cached and used multiple times rather than
211 # computed every time.
212 ctable = []
213 size = n
214 while size >= 2:
215 halfsize = size // 2
216 for i in range(n//size):
217 for ci in range(halfsize):
218 ctable.append((math.cos((ci + 0.5) * math.pi / size) * 2.0))
219 size //= 2
220
221 # start the inner butterfly
222 size = n
223 k = 0
224 while size >= 2:
225 halfsize = size // 2
226 tablestep = n // size
227 ir = list(range(0, n, size))
228 print (" xform", size, ir)
229 for i in ir:
230 # two lists of half-range indices, e.g. j 0123, jr 7654
231 j = list(range(i, i + halfsize))
232 jr = list(range(i+halfsize, i + size))
233 jr.reverse()
234 print (" xform jr", j, jr)
235 for ci, (jl, jh) in enumerate(zip(j, jr)):
236 t1, t2 = vec[ri[ji[jl]]], vec[ri[ji[jh]]]
237 #coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
238 coeff = ctable[k]
239 k += 1
240 # normally DCT would use jl+halfsize not jh, here.
241 # to be able to work in-place, the idea is to perform a
242 # swap afterwards.
243 vec[ri[ji[jl]]] = t1 + t2
244 vec[ri[ji[jh]]] = (t1 - t2) * (1/coeff)
245 print ("coeff", size, i, "ci", ci,
246 "jl", ri[ji[jl]], "jh", ri[ji[jh]],
247 "i/n", (ci+0.5)/size, coeff, vec[ri[ji[jl]]],
248 vec[ri[ji[jh]]])
249 # instead of using jl+halfsize, perform a swap here.
250 # use half of j/jr because actually jl+halfsize = reverse(j)
251 hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
252 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
253 jlh = jl+halfsize
254 # swap indices, NOT the data
255 tmp1, tmp2 = ji[jlh], ji[jh]
256 ji[jlh], ji[jh] = tmp2, tmp1
257 print (" swap", size, i, ji[jlh], ji[jh])
258 size //= 2
259
260 print("post-swapped", ri)
261 print("ji-swapped", ji)
262 print("transform2 pre-itersum", vec)
263
264 # now things are in the right order for the outer butterfly.
265 n = len(vec)
266 size = n // 2
267 while size >= 2:
268 halfsize = size // 2
269 ir = list(range(0, halfsize))
270 print ("itersum", halfsize, size, ir)
271 for i in ir:
272 jr = list(range(i+halfsize, i+n-halfsize, size))
273 print ("itersum jr", i+halfsize, i+size, jr)
274 for jh in jr:
275 vec[jh] += vec[jh+size]
276 print (" itersum", size, i, jh, jh+size)
277 size //= 2
278
279 print("transform2 result", vec)
280
281 return vec
282
283
284 # DCT type III, unscaled. Algorithm by Byeong Gi Lee, 1984.
285 # See: https://www.nayuki.io/res/fast-discrete-cosine-transform-algorithms/lee-new-algo-discrete-cosine-transform.pdf
286 def inverse_transform(vector, root=True, indent=0):
287 idt = " " * indent
288 if root:
289 vector = list(vector)
290 vector[0] /= 2
291 n = len(vector)
292 if n == 1:
293 return vector, [0]
294 elif n == 0 or n % 2 != 0:
295 raise ValueError()
296 else:
297 half = n // 2
298 alpha = [vector[0]]
299 beta = [vector[1]]
300 for i in range(2, n, 2):
301 alpha.append(vector[i])
302 beta.append(vector[i - 1] + vector[i + 1])
303 print (idt, "n", n, "alpha 0", end=" ")
304 for i in range(2, n, 2):
305 print (i, end=" ")
306 print ("beta 1", end=" ")
307 for i in range(2, n, 2):
308 print ("%d+%d" % (i-1, i+1), end=" ")
309 print()
310 inverse_transform(alpha, False, indent+1)
311 inverse_transform(beta , False, indent+1)
312 for i in range(half):
313 x, y = alpha[i], beta[i]
314 coeff = (math.cos((i + 0.5) * math.pi / n) * 2)
315 y /= coeff
316 vector[i] = x + y
317 vector[n-(i+1)] = x - y
318 print (idt, " v[%d] = alpha[%d]+beta[%d]" % (i, i, i))
319 print (idt, " v[%d] = alpha[%d]-beta[%d]" % (n-i-1, i, i))
320 return vector
321
322
323 # totally cool *in-place* DCT algorithm
324 def inverse_transform_iter(vec):
325
326 # Initialization
327 n = len(vec)
328 print ()
329 print ("transform2 inv", n, vec)
330 levels = n.bit_length() - 1
331
332 # reference (read/write) the in-place data in *reverse-bit-order*
333 ri = list(range(n))
334 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
335
336 # reference list for not needing to do data-swaps, just swap what
337 # *indices* are referenced (two levels of indirection at the moment)
338 # pre-reverse the data-swap list so that it *ends up* in the order 0123..
339 ji = list(range(n))
340 ji = halfrev2(ji, False)
341
342 print ("ri", ri)
343 print ("ji", ji)
344
345 # create a cos table: not strictly necessary but here for illustrative
346 # purposes, to demonstrate the point that it really *is* iterative.
347 # this table could be cached and used multiple times rather than
348 # computed every time.
349 ctable = []
350 size = n
351 while size >= 2:
352 halfsize = size // 2
353 for i in range(n//size):
354 for ci in range(halfsize):
355 ctable.append((math.cos((ci + 0.5) * math.pi / size) * 2.0))
356 size //= 2
357
358 # first divide element 0 by 2
359 vec[0] /= 2.0
360
361 print("transform2-inv pre-itersum", vec)
362 #vec = halfrev2(vec, True)
363 vec = [vec[ri[i]] for i in range(n)]
364 #print("transform2-inv post-itersum-reorder", vec)
365
366 # first the outer butterfly (iterative sum thing)
367 n = len(vec)
368 size = 2
369 while size <= n:
370 halfsize = size // 2
371 ir = list(range(0, halfsize))
372 print ("itersum", halfsize, size, ir)
373 for i in ir:
374 jr = list(range(i+halfsize, i+n-halfsize, size))
375 jr.reverse()
376 print ("itersum jr", i+halfsize, i+size, jr)
377 for jh in jr:
378 x = vec[ri[jh]]
379 y = vec[ri[jh+size]]
380 vec[ri[jh+size]] = x + y
381 print (" itersum", size, i, jh, jh+size,
382 x, y, "jh+sz", vec[ji[jh+size]])
383 size *= 2
384
385 print("transform2-inv post-itersum", vec)
386
387 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
388 # TODO: merge these two
389 vec = halfrev2(vec, True)
390 ri = list(range(n))
391
392 print("transform2-inv post-reorder", vec)
393
394 # start the inner butterfly (coefficients)
395 size = 2
396 k = 0
397 while size <= n:
398 halfsize = size // 2
399 tablestep = n // size
400 ir = list(range(0, n, size))
401 print (" xform", size, ir)
402 for i in ir:
403 # two lists of half-range indices, e.g. j 0123, jr 7654
404 j = list(range(i, i + halfsize))
405 jr = list(range(i+halfsize, i + size))
406 jr.reverse()
407 print (" xform jr", j, jr)
408 for ci, (jl, jh) in enumerate(zip(j, jr)):
409 #t1, t2 = vec[ri[ji[jl]]], vec[ri[ji[jh]]]
410 t1, t2 = vec[ji[jl]], vec[ji[jl+halfsize]]
411 coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
412 #coeff = ctable[k]
413 k += 1
414 # normally DCT would use jl+halfsize not jh, here.
415 # to be able to work in-place, the idea is to perform a
416 # swap afterwards.
417 #vec[ri[ji[jl]]] = t1 + t2/coeff
418 #vec[ri[ji[jh]]] = t1 - t2/coeff
419 vec[ji[jl]] = t1 + t2/coeff
420 vec[ji[jl+halfsize]] = t1 - t2/coeff
421 print ("coeff", size, i, "ci", ci,
422 "jl", ri[ji[jl]], "jh", ri[ji[jh]],
423 "i/n", (ci+0.5)/size, coeff,
424 "t1,t2", t1, t2,
425 "+/i", vec[ji[jl]], vec[ji[jh]])
426 #"+/i", vec2[ri[ji[jl]]], vec2[ri[ji[jh]]])
427 # instead of using jl+halfsize, perform a swap here.
428 # use half of j/jr because actually jl+halfsize = reverse(j)
429 hz2 = halfsize // 2 # can be zero which stops reversing 1-item lists
430 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
431 jlh = jl+halfsize
432 # swap indices, NOT the data
433 tmp1, tmp2 = ji[jlh], ji[jh]
434 ji[jlh], ji[jh] = tmp2, tmp1
435 print (" swap", size, i, ji[jlh], ji[jh])
436 size *= 2
437
438 print("post-swapped", ri)
439 print("ji-swapped", ji)
440 ji = list(range(n))
441 ji = halfrev2(ji, True)
442 print("ji-calc ", ji)
443
444 print("transform2-inv result", vec)
445
446 return vec
447
448
449 def inverse_transform2(vector, root=True, indent=0):
450 idt = " " * indent
451 n = len(vector)
452 if root:
453 vector = list(vector)
454 vector[0] /= 2
455 if n == 1:
456 return vector
457 elif n == 0 or n % 2 != 0:
458 raise ValueError()
459 else:
460 print (idt, "inverse_xform2", vector)
461 half = n // 2
462 alpha = [vector[0]]
463 beta = [vector[1]]
464 for i in range(2, n, 2):
465 alpha.append(vector[i])
466 beta.append(vector[i - 1] + vector[i + 1])
467 print (idt, " alpha", i, vector[i])
468 print (idt, " beta", i-1, i+1, vector[i-1], vector[i+1], "->",
469 beta[-1])
470 inverse_transform2(alpha, False, indent+1)
471 inverse_transform2(beta , False, indent+1)
472 for i in range(half):
473 x, y = alpha[i], beta[i]
474 coeff = (math.cos((i + 0.5) * math.pi / n) * 2)
475 vector[i] = x + y / coeff
476 vector[n-(i+1)] = x - y / coeff
477 print (idt, " v[%d] = %f+%f/%f=%f" % (i, x, y, coeff, vector[i]))
478 print (idt, " v[%d] = %f-%f/%f=%f" % (n-i-1, x, y,
479 coeff, vector[n-i-1]))
480 return vector
481
482
483 def inverse_transform2_explore(vector, root=True, indent=0):
484 n = len(vector)
485 if root:
486 vector = list(vector)
487 if n == 1:
488 return vector
489 elif n == 0 or n % 2 != 0:
490 raise ValueError()
491 else:
492 half = n // 2
493 alpha = [vector[0]]
494 beta = [vector[1]]
495 for i in range(2, n, 2):
496 alpha.append(vector[i])
497 beta.append(("add%d" % indent, vector[i - 1], vector[i + 1]))
498 inverse_transform2_explore(alpha, False, indent+1)
499 inverse_transform2_explore(beta , False, indent+1)
500 for i in range(half):
501 x = alpha[i]
502 y = ("cos%d" % indent, beta[i], i, n)
503 vector[i] = ("add%d" % indent, x, y)
504 vector[n-(i + 1)] = ("sub%d" % indent, x, y)
505 return vector
506
507
508
509 # does the outer butterfly in a recursive fashion, used in an
510 # intermediary development of the in-place DCT.
511 def transform_itersum(vector, indent=0):
512 idt = " " * indent
513 n = len(vector)
514 if n == 1:
515 return list(vector)
516 elif n == 0 or n % 2 != 0:
517 raise ValueError()
518 else:
519 half = n // 2
520 alpha = [0] * half
521 beta = [0] * half
522 for i in range(half):
523 t1, t2 = vector[i], vector[i+half]
524 alpha[i] = t1
525 beta[i] = t2
526 alpha = transform_itersum(alpha, indent+1)
527 beta = transform_itersum(beta , indent+1)
528 result = [0] * n
529 for i in range(half):
530 result[i*2] = alpha[i]
531 result[i*2+1] = beta[i]
532 print(idt, "iter-merge", result)
533 for i in range(half - 1):
534 result[i*2+1] += result[i*2+3]
535 print(idt, "iter-result", result)
536 return result
537
538
539 # prints out an "add" schedule for the outer butterfly, recursively,
540 # matching what transform_itersum does.
541 def itersum_explore(vector, indent=0):
542 idt = " " * indent
543 n = len(vector)
544 if n == 1:
545 return list(vector)
546 elif n == 0 or n % 2 != 0:
547 raise ValueError()
548 else:
549 half = n // 2
550 alpha = [0] * half
551 beta = [0] * half
552 for i in range(half):
553 t1, t2 = vector[i], vector[i+half]
554 alpha[i] = t1
555 beta[i] = t2
556 alpha = itersum_explore(alpha, indent+1)
557 beta = itersum_explore(beta , indent+1)
558 result = [0] * n
559 for i in range(half):
560 result[i*2] = alpha[i]
561 result[i*2+1] = beta[i]
562 print(idt, "iter-merge", result)
563 for i in range(half - 1):
564 result[i*2+1] = ("add", result[i*2+1], result[i*2+3])
565 print(idt, "iter-result", result)
566 return result
567
568
569 # prints out the exact same outer butterfly but does so iteratively.
570 # by comparing the output from itersum_explore and itersum_explore2
571 # and by drawing out the resultant ADDs as a graph it was possible
572 # to deduce what the heck was going on.
573 def itersum_explore2(vec, indent=0):
574 n = len(vec)
575 size = n // 2
576 while size >= 2:
577 halfsize = size // 2
578 ir = list(range(0, halfsize))
579 print ("itersum", halfsize, size, ir)
580 for i in ir:
581 jr = list(range(i+halfsize, i+n-halfsize, size))
582 print ("itersum jr", i+halfsize, i+size, jr)
583 for jh in jr:
584 vec[jh] = ("add", vec[jh], vec[jh+size])
585 print (" itersum", size, i, jh, jh+size)
586 size //= 2
587
588 return vec
589
590 if __name__ == '__main__':
591 n = 16
592 vec = list(range(n))
593 levels = n.bit_length() - 1
594 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
595 ops = itersum_explore(vec)
596 for i, x in enumerate(ops):
597 print (i, x)
598
599 n = 16
600 vec = list(range(n))
601 levels = n.bit_length() - 1
602 ops = itersum_explore2(vec)
603 for i, x in enumerate(ops):
604 print (i, x)
605
606 # halfrev test
607 vec = list(range(16))
608 print ("orig vec", vec)
609 vecr = halfrev(vec, True)
610 print ("reversed", vecr)
611 for i, v in enumerate(vecr):
612 print ("%2d %2d %04s %04s %04s" % (i, v,
613 bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
614 vecrr = halfrev(vecr, False)
615 assert vec == vecrr
616 vecrr = halfrev(vec, False)
617 print ("pre-reversed", vecrr)
618 for i, v in enumerate(vecrr):
619 print ("%2d %2d %04s %04s %04s" % (i, v,
620 bin(i)[2:], bin(v ^ i)[2:], bin(v)[2:]))
621 il = halfrev2(vec, False)
622 print ("iterative rev", il)
623 il = halfrev2(vec, True)
624 print ("iterative rev-true", il)
625
626 n = 4
627 vec = list(range(n))
628 levels = n.bit_length() - 1
629 ops = inverse_transform2_explore(vec)
630 for i, x in enumerate(ops):
631 print (i, x)
632