add new cos coefficient pre-computed and on-the-fly mode,
[openpower-isa.git] / src / openpower / decoder / isa / remap_dct_yield.py
1 # DCT "REMAP" scheduler
2 #
3 # Modifications made to create an in-place iterative DCT:
4 # Copyright (c) 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
5 #
6 # SPDX: LGPLv3+
7 #
8 # Original fastdctlee.py by Nayuki:
9 # Copyright (c) 2020 Project Nayuki. (MIT License)
10 # https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
11
12 import math
13
14 # bits of the integer 'val'.
15 def reverse_bits(val, width):
16 result = 0
17 for _ in range(width):
18 result = (result << 1) | (val & 1)
19 val >>= 1
20 return result
21
22
23 # iterative version of [recursively-applied] half-rev.
24 # relies on the list lengths being power-of-two and the fact
25 # that bit-inversion of a list of binary numbers is the same
26 # as reversing the order of the list
27 # this version is dead easy to implement in hardware.
28 # a big surprise is that the half-reversal can be done with
29 # such a simple XOR. the inverse operation is slightly trickier
30 def halfrev2(vec, pre_rev=True):
31 res = []
32 for i in range(len(vec)):
33 if pre_rev:
34 res.append(i ^ (i>>1))
35 else:
36 ri = i
37 bl = i.bit_length()
38 for ji in range(1, bl):
39 ri ^= (i >> ji)
40 res.append(vec[ri])
41 return res
42
43
44 # python "yield" can be iterated. use this to make it clear how
45 # the indices are generated by using natural-looking nested loops
46 def iterate_dct_inner_costable_indices(SVSHAPE):
47 # get indices to iterate over, in the required order
48 n = SVSHAPE.lims[0]
49 mode = SVSHAPE.lims[1]
50 print ("inner costable", mode)
51 # creating lists of indices to iterate over in each dimension
52 # has to be done dynamically, because it depends on the size
53 # first, the size-based loop (which can be done statically)
54 x_r = []
55 size = 2
56 while size <= n:
57 x_r.append(size)
58 size *= 2
59 # invert order if requested
60 if SVSHAPE.invxyz[0]:
61 x_r.reverse()
62
63 if len(x_r) == 0:
64 return
65
66 #print ("ri", ri)
67 #print ("ji", ji)
68
69 # start an infinite (wrapping) loop
70 skip = 0
71 z_end = 1 # doesn't exist in this, only 2 loops
72 k = 0
73 while True:
74 for size in x_r: # loop over 3rd order dimension (size)
75 x_end = size == x_r[-1]
76 # y_r schedule depends on size
77 halfsize = size // 2
78 y_r = []
79 for i in range(0, n, size):
80 y_r.append(i)
81 # invert if requested
82 if SVSHAPE.invxyz[1]: y_r.reverse()
83 # two lists of half-range indices, e.g. j 0123, jr 7654
84 j = list(range(0, halfsize))
85 # invert if requested
86 if SVSHAPE.invxyz[2]: j_r.reverse()
87 #print ("xform jr", jr)
88 # loop over 1st order dimension
89 for ci, jl in enumerate(j):
90 y_end = jl == j[-1]
91 # now depending on MODE return the index. inner butterfly
92 if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
93 result = k # offset into COS table
94 elif SVSHAPE.skip == 0b10: #
95 result = ci # coefficient helper
96 elif SVSHAPE.skip == 0b11: #
97 result = size # coefficient helper
98 loopends = (z_end |
99 ((y_end and z_end)<<1) |
100 ((y_end and x_end and z_end)<<2))
101
102 yield result + SVSHAPE.offset, loopends
103 k += 1
104
105 # python "yield" can be iterated. use this to make it clear how
106 # the indices are generated by using natural-looking nested loops
107 def iterate_dct_inner_butterfly_indices(SVSHAPE):
108 # get indices to iterate over, in the required order
109 n = SVSHAPE.lims[0]
110 mode = SVSHAPE.lims[1]
111 #print ("inner butterfly", mode)
112 # creating lists of indices to iterate over in each dimension
113 # has to be done dynamically, because it depends on the size
114 # first, the size-based loop (which can be done statically)
115 x_r = []
116 size = 2
117 while size <= n:
118 x_r.append(size)
119 size *= 2
120 # invert order if requested
121 if SVSHAPE.invxyz[0]:
122 x_r.reverse()
123
124 if len(x_r) == 0:
125 return
126
127 # reference (read/write) the in-place data in *reverse-bit-order*
128 ri = list(range(n))
129 if SVSHAPE.submode2 == 0b01:
130 levels = n.bit_length() - 1
131 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
132
133 # reference list for not needing to do data-swaps, just swap what
134 # *indices* are referenced (two levels of indirection at the moment)
135 # pre-reverse the data-swap list so that it *ends up* in the order 0123..
136 ji = list(range(n))
137 inplace_mode = SVSHAPE.submode2 == 0b01
138 # and SVSHAPE.skip not in [0b10, 0b11]
139 if inplace_mode:
140 #print ("inplace mode")
141 ji = halfrev2(ji, True)
142
143 #print ("ri", ri)
144 #print ("ji", ji)
145
146 # start an infinite (wrapping) loop
147 skip = 0
148 while True:
149 for size in x_r: # loop over 3rd order dimension (size)
150 x_end = size == x_r[-1]
151 # y_r schedule depends on size
152 halfsize = size // 2
153 y_r = []
154 for i in range(0, n, size):
155 y_r.append(i)
156 # invert if requested
157 if SVSHAPE.invxyz[1]: y_r.reverse()
158 for i in y_r: # loop over 2nd order dimension
159 y_end = i == y_r[-1]
160 # two lists of half-range indices, e.g. j 0123, jr 7654
161 j = list(range(i, i + halfsize))
162 jr = list(range(i+halfsize, i + size))
163 jr.reverse()
164 # invert if requested
165 if SVSHAPE.invxyz[2]: j_r.reverse()
166 hz2 = halfsize // 2 # zero stops reversing 1-item lists
167 # if you *really* want to do the in-place swapping manually,
168 # this allows you to do it. good luck...
169 if SVSHAPE.submode2 == 0b01 and not inplace_mode:
170 #print ("swap mode")
171 jr = j_r[:hz2]
172 #print ("xform jr", jr)
173 # loop over 1st order dimension
174 for ci, (jl, jh) in enumerate(zip(j, jr)):
175 z_end = jl == j[-1]
176 # now depending on MODE return the index. inner butterfly
177 if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
178 result = ri[ji[jl]] # lower half
179 elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]:
180 result = ri[ji[jh]] # upper half, reverse order
181 elif SVSHAPE.skip == 0b10: #
182 result = ci # coefficient helper
183 elif SVSHAPE.skip == 0b11: #
184 result = size # coefficient helper
185 loopends = (z_end |
186 ((y_end and z_end)<<1) |
187 ((y_end and x_end and z_end)<<2))
188
189 yield result + SVSHAPE.offset, loopends
190
191 # now in-place swap
192 if inplace_mode:
193 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
194 jlh = jl+halfsize
195 #print ("inplace swap", jh, jlh)
196 tmp1, tmp2 = ji[jlh], ji[jh]
197 ji[jlh], ji[jh] = tmp2, tmp1
198
199
200 # python "yield" can be iterated. use this to make it clear how
201 # the indices are generated by using natural-looking nested loops
202 def iterate_dct_outer_butterfly_indices(SVSHAPE):
203 # get indices to iterate over, in the required order
204 n = SVSHAPE.lims[0]
205 mode = SVSHAPE.lims[1]
206 # createing lists of indices to iterate over in each dimension
207 # has to be done dynamically, because it depends on the size
208 # first, the size-based loop (which can be done statically)
209 x_r = []
210 size = n // 2
211 while size >= 2:
212 x_r.append(size)
213 size //= 2
214 # invert order if requested
215 if SVSHAPE.invxyz[0]:
216 x_r.reverse()
217
218 if len(x_r) == 0:
219 return
220
221 #print ("outer butterfly")
222
223 # reference (read/write) the in-place data in *reverse-bit-order*
224 ri = list(range(n))
225 if SVSHAPE.submode2 == 0b11:
226 levels = n.bit_length() - 1
227 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
228
229 # reference list for not needing to do data-swaps, just swap what
230 # *indices* are referenced (two levels of indirection at the moment)
231 # pre-reverse the data-swap list so that it *ends up* in the order 0123..
232 ji = list(range(n))
233 inplace_mode = False # need the space... SVSHAPE.skip in [0b10, 0b11]
234 if inplace_mode:
235 #print ("inplace mode", SVSHAPE.skip)
236 ji = halfrev2(ji, True)
237
238 #print ("ri", ri)
239 #print ("ji", ji)
240
241 # start an infinite (wrapping) loop
242 skip = 0
243 k = 0
244 k_start = 0
245 while True:
246 for size in x_r: # loop over 3rd order dimension (size)
247 halfsize = size//2
248 x_end = size == x_r[-1]
249 y_r = list(range(0, halfsize))
250 #print ("itersum", halfsize, size, y_r)
251 # invert if requested
252 if SVSHAPE.invxyz[1]: y_r.reverse()
253 for i in y_r: # loop over 2nd order dimension
254 y_end = i == y_r[-1]
255 # one list to create iterative-sum schedule
256 jr = list(range(i+halfsize, i+n-halfsize, size))
257 #print ("itersum jr", i+halfsize, i+size, jr)
258 # invert if requested
259 if SVSHAPE.invxyz[2]: j_r.reverse()
260 hz2 = halfsize // 2 # zero stops reversing 1-item lists
261 k = k_start
262 for ci, jh in enumerate(jr): # loop over 1st order dimension
263 z_end = jh == jr[-1]
264 #print (" itersum", size, i, jh, jh+size)
265 if mode == 4:
266 # COS table pre-generated mode
267 if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
268 result = ri[ji[jh]] # lower half
269 elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]:
270 result = ri[ji[jh+size]] # upper half
271 elif SVSHAPE.skip == 0b10: #
272 result = k # cos table offset
273 else:
274 # COS table generated on-demand ("Vertical-First") mode
275 if SVSHAPE.skip == 0b00: # in [0b00, 0b10]:
276 result = ri[ji[jh]] # lower half
277 elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]:
278 result = ri[ji[jh+size]] # upper half
279 elif SVSHAPE.skip == 0b10: #
280 result = ci # coefficient helper
281 elif SVSHAPE.skip == 0b11: #
282 result = size # coefficient helper
283 loopends = (z_end |
284 ((y_end and z_end)<<1) |
285 ((y_end and x_end and z_end)<<2))
286
287 yield result + SVSHAPE.offset, loopends
288 k += 1
289
290 # now in-place swap
291 if SVSHAPE.submode2 == 0b11 and inplace_mode:
292 j = list(range(i, i + halfsize))
293 jr = list(range(i+halfsize, i + size))
294 jr.reverse()
295 for ci, (jl, jh) in enumerate(zip(j[:hz2], jr[:hz2])):
296 jlh = jl+halfsize
297 #print ("inplace swap", jh, jlh)
298 tmp1, tmp2 = ji[jlh], ji[jh]
299 ji[jlh], ji[jh] = tmp2, tmp1
300
301 # new k_start point for cos tables( runs inside x_r loop NOT i loop)
302 k_start += halfsize
303
304
305 def pprint_schedule(schedule, n):
306 size = 2
307 idx = 0
308 while size <= n:
309 halfsize = size // 2
310 tablestep = n // size
311 print ("size %d halfsize %d tablestep %d" % \
312 (size, halfsize, tablestep))
313 for i in range(0, n, size):
314 prefix = "i %d\t" % i
315 for j in range(i, i + halfsize):
316 (jl, je), (jh, he) = schedule[idx]
317 print (" %-3d\t%s j=%-2d jh=%-2d "
318 "j[jl=%-2d] j[jh=%-2d]" % \
319 (idx, prefix, j, j+halfsize,
320 jl, jh,
321 ),
322 "end", bin(je)[2:], bin(je)[2:])
323 idx += 1
324 size *= 2
325
326 def pprint_schedule_outer(schedule, n):
327 size = 2
328 idx = 0
329 while size <= n//2:
330 halfsize = size // 2
331 tablestep = n // size
332 print ("size %d halfsize %d tablestep %d" % \
333 (size, halfsize, tablestep))
334 y_r = list(range(0, halfsize))
335 for i in y_r:
336 prefix = "i %d\t" % i
337 jr = list(range(i+halfsize, i+n-halfsize, size))
338 for j in jr:
339 (jl, je), (jh, he) = schedule[idx]
340 print (" %-3d\t%s j=%-2d jh=%-2d "
341 "j[jl=%-2d] j[jh=%-2d]" % \
342 (idx, prefix, j, j+halfsize,
343 jl, jh,
344 ),
345 "end", bin(je)[2:], bin(je)[2:])
346 idx += 1
347 size *= 2
348
349
350 # totally cool *in-place* DCT algorithm using yield REMAPs
351 def transform2(vec):
352
353 # Initialization
354 n = len(vec)
355 print ()
356 print ("transform2", n)
357 levels = n.bit_length() - 1
358
359 # set up dims
360 xdim = n
361
362 # reference (read/write) the in-place data in *reverse-bit-order*
363 ri = list(range(n))
364 ri = [ri[reverse_bits(i, levels)] for i in range(n)]
365
366 # and pretend we LDed data in half-swapped *and* bit-reversed order as well
367 # TODO: merge these two
368 vec = halfrev2(vec, False)
369 vec = [vec[ri[i]] for i in range(n)]
370
371 # create a cos table: not strictly necessary but here for illustrative
372 # purposes, to demonstrate the point that it really *is* iterative.
373 # this table could be cached and used multiple times rather than
374 # computed every time.
375 ctable = []
376 size = n
377 while size >= 2:
378 halfsize = size // 2
379 for ci in range(halfsize):
380 coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
381 ctable.append(coeff)
382 print ("coeff", size, "ci", ci, "k", len(ctable)-1,
383 "i/n", (ci+0.5)/size, coeff)
384 size //= 2
385
386 # set up an SVSHAPE
387 class SVSHAPE:
388 pass
389 # ci schedule
390 SVSHAPE0 = SVSHAPE()
391 SVSHAPE0.lims = [xdim, 4, 0]
392 SVSHAPE0.mode = 0b01
393 SVSHAPE0.submode2 = 0b01
394 SVSHAPE0.skip = 0b10
395 SVSHAPE0.offset = 0 # experiment with different offset, here
396 SVSHAPE0.invxyz = [1,0,0] # inversion if desired
397 # size schedule
398 SVSHAPE1 = SVSHAPE()
399 SVSHAPE1.lims = [xdim, 4, 0]
400 SVSHAPE1.mode = 0b01
401 SVSHAPE1.submode2 = 0b01
402 SVSHAPE1.skip = 0b11
403 SVSHAPE1.offset = 0 # experiment with different offset, here
404 SVSHAPE1.invxyz = [1,0,0] # inversion if desired
405 # k schedule
406 SVSHAPE2 = SVSHAPE()
407 SVSHAPE2.lims = [xdim, 4, 0]
408 SVSHAPE2.mode = 0b01
409 SVSHAPE2.submode2 = 0b01
410 SVSHAPE2.skip = 0b00
411 SVSHAPE2.offset = 0 # experiment with different offset, here
412 SVSHAPE2.invxyz = [1,0,0] # inversion if desired
413
414 # enumerate over the iterator function, getting new indices
415 i0 = iterate_dct_inner_costable_indices(SVSHAPE0)
416 i1 = iterate_dct_inner_costable_indices(SVSHAPE1)
417 i2 = iterate_dct_inner_costable_indices(SVSHAPE2)
418 for ((ci, cie), (size, sze), (k, ke)) in \
419 zip(i0, i1, i2):
420 print ("xform2 cos", ci, size, k)
421 coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
422 assert coeff == ctable[k]
423 print ("coeff", size, "ci", ci, "k", k,
424 "i/n", (ci+0.5)/size, coeff,
425 "end", bin(cie), bin(sze), bin(ke))
426 if cie == 0b111: # all loops end
427 break
428
429 ################
430 # INNER butterfly
431 ################
432
433 # j schedule
434 SVSHAPE0 = SVSHAPE()
435 SVSHAPE0.lims = [xdim, 0b000001, 0]
436 SVSHAPE0.mode = 0b01
437 SVSHAPE0.submode2 = 0b01
438 SVSHAPE0.skip = 0b00
439 SVSHAPE0.offset = 0 # experiment with different offset, here
440 SVSHAPE0.invxyz = [1,0,0] # inversion if desired
441 # j+halfstep schedule
442 SVSHAPE1 = SVSHAPE()
443 SVSHAPE1.lims = [xdim, 0b000001, 0]
444 SVSHAPE1.mode = 0b01
445 SVSHAPE1.submode2 = 0b01
446 SVSHAPE1.skip = 0b01
447 SVSHAPE1.offset = 0 # experiment with different offset, here
448 SVSHAPE1.invxyz = [1,0,0] # inversion if desired
449 # ci schedule
450 SVSHAPE2 = SVSHAPE()
451 SVSHAPE2.lims = [xdim, 0b000001, 0]
452 SVSHAPE2.mode = 0b01
453 SVSHAPE2.submode2 = 0b01
454 SVSHAPE2.skip = 0b10
455 SVSHAPE2.offset = 0 # experiment with different offset, here
456 SVSHAPE2.invxyz = [1,0,0] # inversion if desired
457 # size schedule
458 SVSHAPE3 = SVSHAPE()
459 SVSHAPE3.lims = [xdim, 0b000001, 0]
460 SVSHAPE3.mode = 0b01
461 SVSHAPE3.submode2 = 0b01
462 SVSHAPE3.skip = 0b11
463 SVSHAPE3.offset = 0 # experiment with different offset, here
464 SVSHAPE3.invxyz = [1,0,0] # inversion if desired
465
466 # enumerate over the iterator function, getting new indices
467 i0 = iterate_dct_inner_butterfly_indices(SVSHAPE0)
468 i1 = iterate_dct_inner_butterfly_indices(SVSHAPE1)
469 i2 = iterate_dct_inner_butterfly_indices(SVSHAPE2)
470 i3 = iterate_dct_inner_butterfly_indices(SVSHAPE3)
471 for k, ((jl, jle), (jh, jhe), (ci, cie), (size, sze)) in \
472 enumerate(zip(i0, i1, i2, i3)):
473 t1, t2 = vec[jl], vec[jh]
474 print ("xform2", jl, jh, ci, size)
475 coeff = (math.cos((ci + 0.5) * math.pi / size) * 2.0)
476 #assert coeff == ctable[k]
477 vec[jl] = t1 + t2
478 vec[jh] = (t1 - t2) * (1/coeff)
479 print ("coeff", size, "ci", ci,
480 "jl", jl, "jh", jh,
481 "i/n", (ci+0.5)/size, coeff, vec[jl],
482 vec[jh],
483 "end", bin(jle), bin(jhe))
484 if jle == 0b111: # all loops end
485 break
486
487 print("transform2 pre-itersum", vec)
488
489 # now things are in the right order for the outer butterfly.
490
491 # j schedule
492 SVSHAPE0 = SVSHAPE()
493 SVSHAPE0.lims = [xdim, 0b0000010, 0]
494 SVSHAPE0.submode2 = 0b100
495 SVSHAPE0.mode = 0b01
496 SVSHAPE0.skip = 0b00
497 SVSHAPE0.offset = 0 # experiment with different offset, here
498 SVSHAPE0.invxyz = [0,0,0] # inversion if desired
499 # j+halfstep schedule
500 SVSHAPE1 = SVSHAPE()
501 SVSHAPE1.lims = [xdim, 0b0000010, 0]
502 SVSHAPE1.mode = 0b01
503 SVSHAPE1.submode2 = 0b100
504 SVSHAPE1.skip = 0b01
505 SVSHAPE1.offset = 0 # experiment with different offset, here
506 SVSHAPE1.invxyz = [0,0,0] # inversion if desired
507
508 # enumerate over the iterator function, getting new indices
509 i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
510 i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
511 for k, ((jl, jle), (jh, jhe)) in enumerate(zip(i0, i1)):
512 print ("itersum jr", jl, jh,
513 "end", bin(jle), bin(jhe))
514 vec[jl] += vec[jh]
515 size //= 2
516 if jle == 0b111: # all loops end
517 break
518
519 print("transform2 result", vec)
520
521 return vec
522
523
524 def demo():
525 # set the dimension sizes here
526 n = 8
527 xdim = n
528 ydim = 0 # not needed
529 zdim = 0 # again, not needed
530
531
532 ################
533 # INNER butterfly
534 ################
535
536 # set up an SVSHAPE
537 class SVSHAPE:
538 pass
539 # j schedule
540 SVSHAPE0 = SVSHAPE()
541 SVSHAPE0.lims = [xdim, 0b000001, zdim]
542 SVSHAPE0.submode2 = 0b010
543 SVSHAPE0.mode = 0b01
544 SVSHAPE0.skip = 0b00
545 SVSHAPE0.offset = 0 # experiment with different offset, here
546 SVSHAPE0.invxyz = [0,0,0] # inversion if desired
547 # j+halfstep schedule
548 SVSHAPE1 = SVSHAPE()
549 SVSHAPE1.lims = [xdim, 0b000001, zdim]
550 SVSHAPE1.submode2 = 0b010
551 SVSHAPE1.mode = 0b01
552 SVSHAPE1.skip = 0b01
553 SVSHAPE1.offset = 0 # experiment with different offset, here
554 SVSHAPE1.invxyz = [0,0,0] # inversion if desired
555
556 # enumerate over the iterator function, getting new indices
557 schedule = []
558 i0 = iterate_dct_inner_butterfly_indices(SVSHAPE0)
559 i1 = iterate_dct_inner_butterfly_indices(SVSHAPE1)
560 for idx, (jl, jh) in enumerate(zip(i0, i1)):
561 schedule.append((jl, jh))
562 if jl[1] == 0b111: # end
563 break
564
565 # ok now pretty-print the results, with some debug output
566 print ("inner butterfly")
567 pprint_schedule(schedule, n)
568 print ("")
569
570 ################
571 # outer butterfly
572 ################
573
574 # j schedule
575 SVSHAPE0 = SVSHAPE()
576 SVSHAPE0.lims = [xdim, 0b000010, zdim]
577 SVSHAPE0.mode = 0b01
578 SVSHAPE0.submode2 = 0b100
579 SVSHAPE0.skip = 0b10
580 SVSHAPE0.offset = 0 # experiment with different offset, here
581 SVSHAPE0.invxyz = [1,0,0] # inversion if desired
582 # j+halfstep schedule
583 SVSHAPE1 = SVSHAPE()
584 SVSHAPE1.lims = [xdim, 0b000010, zdim]
585 SVSHAPE1.mode = 0b01
586 SVSHAPE1.submode2 = 0b100
587 SVSHAPE1.skip = 0b11
588 SVSHAPE1.offset = 0 # experiment with different offset, here
589 SVSHAPE1.invxyz = [1,0,0] # inversion if desired
590
591 # enumerate over the iterator function, getting new indices
592 schedule = []
593 i0 = iterate_dct_outer_butterfly_indices(SVSHAPE0)
594 i1 = iterate_dct_outer_butterfly_indices(SVSHAPE1)
595 for idx, (jl, jh) in enumerate(zip(i0, i1)):
596 schedule.append((jl, jh))
597 if jl[1] == 0b111: # end
598 break
599
600 # ok now pretty-print the results, with some debug output
601 print ("outer butterfly")
602 pprint_schedule_outer(schedule, n)
603
604 # run the demo
605 if __name__ == '__main__':
606 demo()