add scan/prefix-sum support to copy of parallel-reduce remap iterator
authorJacob Lifshay <programmerjake@gmail.com>
Thu, 27 Apr 2023 05:09:19 +0000 (22:09 -0700)
committerJacob Lifshay <programmerjake@gmail.com>
Thu, 27 Apr 2023 05:13:12 +0000 (22:13 -0700)
src/openpower/decoder/isa/remap_preduce_yield.py
src/openpower/decoder/isa/test_remap_preduce_yield.py [new file with mode: 0644]

index a22d040b6e46bc216c33272f0dcd54a1e747b6f0..9e9fa2a69a0efd0a7794149353fff14d6fbcd73a 100644 (file)
-# a "yield" version of the Parallel Reduction REMAP algorithm.
+# a "yield" version of the Parallel Reduction/Prefix-Sum REMAP algorithm.
 # the algorithm is in-place. it does not perform "MV" operations.
 # instead, where a masked-out value *should* be read from is tracked
 from copy import deepcopy
 import operator
 
 
+# first, we'll have a simpler prefix-sum algorithm so we can logically work our
+# way up to the full algorithm:
+def prefix_sum_work_efficient(item_count):
+    # this is based on:
+    # `nmutil.prefix_sum.prefix_sum_ops(..., work_efficient=True)`
+
+    # compute the partial sums using a set of binary trees
+    dist = 1
+    while dist < item_count:
+        start = dist * 2 - 1
+        step = dist * 2
+        for i in reversed(range(start, item_count, step)):
+            lhs = i - dist
+            # order is important for non-commutative ops -- don't swap lhs/rhs
+            yield f"items[{i}] = items[{lhs}] + items[{i}]"
+        dist <<= 1
+    # express all output items in terms of the computed partial sums.
+    dist >>= 1
+    while dist >= 1:
+        for i in reversed(range(dist * 3 - 1, item_count, dist * 2)):
+            lhs = i - dist
+            # order is important for non-commutative ops -- don't swap lhs/rhs
+            yield f"items[{i}] = items[{lhs}] + items[{i}]"
+        dist >>= 1
+
+
+# python "yield" can be iterated. use this to make it clear how
+# the indices are generated by using natural-looking nested loops
+def iterate_indices2(SVSHAPE, pred=None):
+    # this is a copy of iterate_indices with reversing `steps` removed since
+    # that is useless afaict. scan/prefix-sum support is also added.
+
+    # get indices to iterate over, in the required order
+    xd = SVSHAPE.lims[0]
+    # create lists of indices to iterate over in each dimension
+    ix = list(range(xd))
+    # invert the indices if needed
+    if SVSHAPE.invxyz[0]:
+        ix.reverse()
+    if SVSHAPE.order != [0, 1, 2] or any(SVSHAPE.invxyz[1:]):
+        raise ValueError("undefined")
+    if SVSHAPE.skip & 0b10:
+        # this is a scan/prefix-sum rather than a reduction.
+        if pred is not None and any(not pred[i] for i in ix):
+            raise ValueError("undefined")
+        results = []
+        dist = 1
+        while dist < xd:
+            start = dist * 2 - 1
+            step = dist * 2
+            for i in reversed(range(start, xd, step)):
+                lhs = i - dist
+                # yield f"items[{i}] = items[{lhs}] + items[{i}]"
+                v = ix[i if SVSHAPE.skip & 0b1 else lhs]
+                results.append([v + SVSHAPE.offset, 0])
+            if results:
+                results[-1][1] |= 0b001  # end of inner loop
+            dist <<= 1
+        if results:
+            results[-1][1] |= 0b010  # end of first loop nest
+        # express all output items in terms of the computed partial sums.
+        dist >>= 1
+        while dist >= 1:
+            for i in reversed(range(dist * 3 - 1, xd, dist * 2)):
+                lhs = i - dist
+                # yield f"items[{i}] = items[{lhs}] + items[{i}]"
+                v = ix[i if SVSHAPE.skip & 0b1 else lhs]
+                results.append([v + SVSHAPE.offset, 0])
+            if results:
+                results[-1][1] |= 0b001  # end of inner loop
+            dist >>= 1
+        if results:
+            results[-1][1] = 0b111  # end of full algorithm
+        yield from results
+        return
+    # start a loop from the lowest step
+    step = 1
+    while step < xd:
+        step *= 2
+        stepend = step >= xd  # note end of steps
+        results = []
+        for i in range(0, xd, step):
+            other = i + step // 2
+            ci = ix[i]
+            oi = ix[other] if other < xd else None
+            other_pred = other < xd and (pred is None or pred[oi])
+            if (pred is None or pred[ci]) and other_pred:
+                if SVSHAPE.skip == 0b00:  # submode 00
+                    result = ci
+                elif SVSHAPE.skip == 0b01:  # submode 01
+                    result = oi
+                results.append([result + SVSHAPE.offset, 0])
+            elif other_pred:
+                ix[i] = oi
+        if results:
+            results[-1][1] = (stepend << 1) | 1  # notify end of loops
+        yield from results
+
+
+def demo_prefix_sum():
+    # set the dimension sizes here
+    xdim = 9
+
+    # set up an SVSHAPE
+    class SVSHAPE:
+        pass
+    SVSHAPE0 = SVSHAPE()
+    SVSHAPE0.lims = [xdim, 0, 0]
+    SVSHAPE0.order = [0, 1, 2]
+    SVSHAPE0.mode = 0b10
+    SVSHAPE0.skip = 0b10  # prefix-sum lhs
+    SVSHAPE0.offset = 0       # experiment with different offset, here
+    SVSHAPE0.invxyz = [0, 0, 0]  # inversion if desired
+
+    SVSHAPE1 = SVSHAPE()
+    SVSHAPE1.lims = [xdim, 0, 0]
+    SVSHAPE1.order = [0, 1, 2]
+    SVSHAPE1.mode = 0b10
+    SVSHAPE1.skip = 0b11  # prefix-sum rhs
+    SVSHAPE1.offset = 0       # experiment with different offset, here
+    SVSHAPE1.invxyz = [0, 0, 0]  # inversion if desired
+
+    # enumerate over the iterator function, getting new indices
+    shapes = list(iterate_indices2(SVSHAPE0)), list(iterate_indices2(SVSHAPE1))
+    for idx in range(len(shapes[0])):
+        l_idx, lend = shapes[0][idx]
+        r_idx, rend = shapes[1][idx]
+        # note RT is r_idx, not l_idx -- this is different than reduction
+        print(f"{idx}: r{r_idx} = r{l_idx} + r{r_idx}   "
+              f"lend={lend:03b} rend={rend:03b}")
+
+
 # python "yield" can be iterated. use this to make it clear how
 # the indices are generated by using natural-looking nested loops
 def iterate_indices(SVSHAPE, pred=None):
@@ -119,7 +251,11 @@ def preduce_y(vec, pred=None, operation=None):
 
 # run the demo
 if __name__ == '__main__':
+    print("reduction:")
     demo()
+    print("prefix-sum:")
+    demo_prefix_sum()
+    print("reduction results:")
     vec = [1, 2, 3, 4, 9, 5, 6]
     res = preduce_y(vec)
     print(vec)
diff --git a/src/openpower/decoder/isa/test_remap_preduce_yield.py b/src/openpower/decoder/isa/test_remap_preduce_yield.py
new file mode 100644 (file)
index 0000000..04b4d93
--- /dev/null
@@ -0,0 +1,74 @@
+import unittest
+from openpower.decoder.isa.remap_preduce_yield import (
+    prefix_sum_work_efficient, iterate_indices2)
+from nmutil.prefix_sum import prefix_sum_ops, Op
+from nmutil.plain_data import plain_data
+
+
+@plain_data()
+class MockSVShape:
+    __slots__ = "lims", "order", "mode", "skip", "offset", "invxyz"
+
+    def __init__(self, lims, order, mode, skip, offset, invxyz):
+        # type: (list[int], list[int], int, int, int, list[int]) -> None
+        self.lims = lims
+        self.order = order
+        self.mode = mode
+        self.skip = skip
+        self.offset = offset
+        self.invxyz = invxyz
+
+
+class TestRemapPrefixSum(unittest.TestCase):
+    def test_prefix_sum_work_efficient(self):
+        def fmt_op(op):
+            return f"items[{op.out}] = items[{op.lhs}] + items[{op.rhs}]"
+        for item_count in range(1, 16):
+            with self.subTest(item_count=item_count):
+                ops = list(prefix_sum_work_efficient(item_count))
+                expected = prefix_sum_ops(item_count, work_efficient=True)
+                expected = list(map(fmt_op, expected))
+                self.assertEqual(ops, expected)
+
+    def iterate_indices2_helper(self, reverse, item_count, offset):
+        lhs_svshape = MockSVShape(
+            lims=[item_count, 0, 0],
+            order=[0, 1, 2],
+            mode=0b10,
+            skip=0b10,  # prefix-sum lhs
+            offset=offset,
+            invxyz=[reverse, 0, 0],
+        )
+        rhs_svshape = MockSVShape(
+            lims=[item_count, 0, 0],
+            order=[0, 1, 2],
+            mode=0b10,
+            skip=0b11,  # prefix-sum rhs
+            offset=offset,
+            invxyz=[reverse, 0, 0],
+        )
+        lhs_results = list(iterate_indices2(lhs_svshape))
+        rhs_results = list(iterate_indices2(rhs_svshape))
+        self.assertEqual(len(lhs_results), len(rhs_results))
+        ops = []
+        for (lhs, lend), (rhs, rend) in zip(lhs_results, rhs_results):
+            self.assertEqual(lend, rend)
+            ops.append(Op(out=rhs, lhs=lhs, rhs=rhs, row=0))
+        expected = list(prefix_sum_ops(item_count, work_efficient=True))
+
+        def f(i):
+            return offset + (item_count - 1 - i if reverse else i)
+        expected = [Op(
+            out=f(i.out), lhs=f(i.lhs), rhs=f(i.rhs), row=0) for i in expected]
+        self.assertEqual(ops, expected)
+
+    def test_iterate_indices2(self):
+        for r in (False, True):
+            for ic in range(1, 16):
+                for off in (0, 20):
+                    with self.subTest(reverse=r, item_count=ic, offset=off):
+                        self.iterate_indices2_helper(r, ic, off)
+
+
+if __name__ == "__main__":
+    unittest.main()