-# a "yield" version of the Parallel Reduction REMAP algorithm.
+# a "yield" version of the Parallel Reduction/Prefix-Sum REMAP algorithm.
# the algorithm is in-place. it does not perform "MV" operations.
# instead, where a masked-out value *should* be read from is tracked
from copy import deepcopy
import operator
+# first, we'll have a simpler prefix-sum algorithm so we can logically work our
+# way up to the full algorithm:
+def prefix_sum_work_efficient(item_count):
+ # this is based on:
+ # `nmutil.prefix_sum.prefix_sum_ops(..., work_efficient=True)`
+
+ # compute the partial sums using a set of binary trees
+ dist = 1
+ while dist < item_count:
+ start = dist * 2 - 1
+ step = dist * 2
+ for i in reversed(range(start, item_count, step)):
+ lhs = i - dist
+ # order is important for non-commutative ops -- don't swap lhs/rhs
+ yield f"items[{i}] = items[{lhs}] + items[{i}]"
+ dist <<= 1
+ # express all output items in terms of the computed partial sums.
+ dist >>= 1
+ while dist >= 1:
+ for i in reversed(range(dist * 3 - 1, item_count, dist * 2)):
+ lhs = i - dist
+ # order is important for non-commutative ops -- don't swap lhs/rhs
+ yield f"items[{i}] = items[{lhs}] + items[{i}]"
+ dist >>= 1
+
+
+# python "yield" can be iterated. use this to make it clear how
+# the indices are generated by using natural-looking nested loops
+def iterate_indices2(SVSHAPE, pred=None):
+ # this is a copy of iterate_indices with reversing `steps` removed since
+ # that is useless afaict. scan/prefix-sum support is also added.
+
+ # get indices to iterate over, in the required order
+ xd = SVSHAPE.lims[0]
+ # create lists of indices to iterate over in each dimension
+ ix = list(range(xd))
+ # invert the indices if needed
+ if SVSHAPE.invxyz[0]:
+ ix.reverse()
+ if SVSHAPE.order != [0, 1, 2] or any(SVSHAPE.invxyz[1:]):
+ raise ValueError("undefined")
+ if SVSHAPE.skip & 0b10:
+ # this is a scan/prefix-sum rather than a reduction.
+ if pred is not None and any(not pred[i] for i in ix):
+ raise ValueError("undefined")
+ results = []
+ dist = 1
+ while dist < xd:
+ start = dist * 2 - 1
+ step = dist * 2
+ for i in reversed(range(start, xd, step)):
+ lhs = i - dist
+ # yield f"items[{i}] = items[{lhs}] + items[{i}]"
+ v = ix[i if SVSHAPE.skip & 0b1 else lhs]
+ results.append([v + SVSHAPE.offset, 0])
+ if results:
+ results[-1][1] |= 0b001 # end of inner loop
+ dist <<= 1
+ if results:
+ results[-1][1] |= 0b010 # end of first loop nest
+ # express all output items in terms of the computed partial sums.
+ dist >>= 1
+ while dist >= 1:
+ for i in reversed(range(dist * 3 - 1, xd, dist * 2)):
+ lhs = i - dist
+ # yield f"items[{i}] = items[{lhs}] + items[{i}]"
+ v = ix[i if SVSHAPE.skip & 0b1 else lhs]
+ results.append([v + SVSHAPE.offset, 0])
+ if results:
+ results[-1][1] |= 0b001 # end of inner loop
+ dist >>= 1
+ if results:
+ results[-1][1] = 0b111 # end of full algorithm
+ yield from results
+ return
+ # start a loop from the lowest step
+ step = 1
+ while step < xd:
+ step *= 2
+ stepend = step >= xd # note end of steps
+ results = []
+ for i in range(0, xd, step):
+ other = i + step // 2
+ ci = ix[i]
+ oi = ix[other] if other < xd else None
+ other_pred = other < xd and (pred is None or pred[oi])
+ if (pred is None or pred[ci]) and other_pred:
+ if SVSHAPE.skip == 0b00: # submode 00
+ result = ci
+ elif SVSHAPE.skip == 0b01: # submode 01
+ result = oi
+ results.append([result + SVSHAPE.offset, 0])
+ elif other_pred:
+ ix[i] = oi
+ if results:
+ results[-1][1] = (stepend << 1) | 1 # notify end of loops
+ yield from results
+
+
+def demo_prefix_sum():
+ # set the dimension sizes here
+ xdim = 9
+
+ # set up an SVSHAPE
+ class SVSHAPE:
+ pass
+ SVSHAPE0 = SVSHAPE()
+ SVSHAPE0.lims = [xdim, 0, 0]
+ SVSHAPE0.order = [0, 1, 2]
+ SVSHAPE0.mode = 0b10
+ SVSHAPE0.skip = 0b10 # prefix-sum lhs
+ SVSHAPE0.offset = 0 # experiment with different offset, here
+ SVSHAPE0.invxyz = [0, 0, 0] # inversion if desired
+
+ SVSHAPE1 = SVSHAPE()
+ SVSHAPE1.lims = [xdim, 0, 0]
+ SVSHAPE1.order = [0, 1, 2]
+ SVSHAPE1.mode = 0b10
+ SVSHAPE1.skip = 0b11 # prefix-sum rhs
+ SVSHAPE1.offset = 0 # experiment with different offset, here
+ SVSHAPE1.invxyz = [0, 0, 0] # inversion if desired
+
+ # enumerate over the iterator function, getting new indices
+ shapes = list(iterate_indices2(SVSHAPE0)), list(iterate_indices2(SVSHAPE1))
+ for idx in range(len(shapes[0])):
+ l_idx, lend = shapes[0][idx]
+ r_idx, rend = shapes[1][idx]
+ # note RT is r_idx, not l_idx -- this is different than reduction
+ print(f"{idx}: r{r_idx} = r{l_idx} + r{r_idx} "
+ f"lend={lend:03b} rend={rend:03b}")
+
+
# python "yield" can be iterated. use this to make it clear how
# the indices are generated by using natural-looking nested loops
def iterate_indices(SVSHAPE, pred=None):
# run the demo
if __name__ == '__main__':
+ print("reduction:")
demo()
+ print("prefix-sum:")
+ demo_prefix_sum()
+ print("reduction results:")
vec = [1, 2, 3, 4, 9, 5, 6]
res = preduce_y(vec)
print(vec)
--- /dev/null
+import unittest
+from openpower.decoder.isa.remap_preduce_yield import (
+ prefix_sum_work_efficient, iterate_indices2)
+from nmutil.prefix_sum import prefix_sum_ops, Op
+from nmutil.plain_data import plain_data
+
+
+@plain_data()
+class MockSVShape:
+ __slots__ = "lims", "order", "mode", "skip", "offset", "invxyz"
+
+ def __init__(self, lims, order, mode, skip, offset, invxyz):
+ # type: (list[int], list[int], int, int, int, list[int]) -> None
+ self.lims = lims
+ self.order = order
+ self.mode = mode
+ self.skip = skip
+ self.offset = offset
+ self.invxyz = invxyz
+
+
+class TestRemapPrefixSum(unittest.TestCase):
+ def test_prefix_sum_work_efficient(self):
+ def fmt_op(op):
+ return f"items[{op.out}] = items[{op.lhs}] + items[{op.rhs}]"
+ for item_count in range(1, 16):
+ with self.subTest(item_count=item_count):
+ ops = list(prefix_sum_work_efficient(item_count))
+ expected = prefix_sum_ops(item_count, work_efficient=True)
+ expected = list(map(fmt_op, expected))
+ self.assertEqual(ops, expected)
+
+ def iterate_indices2_helper(self, reverse, item_count, offset):
+ lhs_svshape = MockSVShape(
+ lims=[item_count, 0, 0],
+ order=[0, 1, 2],
+ mode=0b10,
+ skip=0b10, # prefix-sum lhs
+ offset=offset,
+ invxyz=[reverse, 0, 0],
+ )
+ rhs_svshape = MockSVShape(
+ lims=[item_count, 0, 0],
+ order=[0, 1, 2],
+ mode=0b10,
+ skip=0b11, # prefix-sum rhs
+ offset=offset,
+ invxyz=[reverse, 0, 0],
+ )
+ lhs_results = list(iterate_indices2(lhs_svshape))
+ rhs_results = list(iterate_indices2(rhs_svshape))
+ self.assertEqual(len(lhs_results), len(rhs_results))
+ ops = []
+ for (lhs, lend), (rhs, rend) in zip(lhs_results, rhs_results):
+ self.assertEqual(lend, rend)
+ ops.append(Op(out=rhs, lhs=lhs, rhs=rhs, row=0))
+ expected = list(prefix_sum_ops(item_count, work_efficient=True))
+
+ def f(i):
+ return offset + (item_count - 1 - i if reverse else i)
+ expected = [Op(
+ out=f(i.out), lhs=f(i.lhs), rhs=f(i.rhs), row=0) for i in expected]
+ self.assertEqual(ops, expected)
+
+ def test_iterate_indices2(self):
+ for r in (False, True):
+ for ic in range(1, 16):
+ for off in (0, 20):
+ with self.subTest(reverse=r, item_count=ic, offset=off):
+ self.iterate_indices2_helper(r, ic, off)
+
+
+if __name__ == "__main__":
+ unittest.main()