From c65d8e0b76254572a97e6062533341f2937f7b2b Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Wed, 26 Apr 2023 22:09:19 -0700 Subject: [PATCH] add scan/prefix-sum support to copy of parallel-reduce remap iterator --- .../decoder/isa/remap_preduce_yield.py | 138 +++++++++++++++++- .../decoder/isa/test_remap_preduce_yield.py | 74 ++++++++++ 2 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 src/openpower/decoder/isa/test_remap_preduce_yield.py diff --git a/src/openpower/decoder/isa/remap_preduce_yield.py b/src/openpower/decoder/isa/remap_preduce_yield.py index a22d040b..9e9fa2a6 100644 --- a/src/openpower/decoder/isa/remap_preduce_yield.py +++ b/src/openpower/decoder/isa/remap_preduce_yield.py @@ -1,10 +1,142 @@ -# a "yield" version of the Parallel Reduction REMAP algorithm. +# a "yield" version of the Parallel Reduction/Prefix-Sum REMAP algorithm. # the algorithm is in-place. it does not perform "MV" operations. # instead, where a masked-out value *should* be read from is tracked from copy import deepcopy import operator +# first, we'll have a simpler prefix-sum algorithm so we can logically work our +# way up to the full algorithm: +def prefix_sum_work_efficient(item_count): + # this is based on: + # `nmutil.prefix_sum.prefix_sum_ops(..., work_efficient=True)` + + # compute the partial sums using a set of binary trees + dist = 1 + while dist < item_count: + start = dist * 2 - 1 + step = dist * 2 + for i in reversed(range(start, item_count, step)): + lhs = i - dist + # order is important for non-commutative ops -- don't swap lhs/rhs + yield f"items[{i}] = items[{lhs}] + items[{i}]" + dist <<= 1 + # express all output items in terms of the computed partial sums. + dist >>= 1 + while dist >= 1: + for i in reversed(range(dist * 3 - 1, item_count, dist * 2)): + lhs = i - dist + # order is important for non-commutative ops -- don't swap lhs/rhs + yield f"items[{i}] = items[{lhs}] + items[{i}]" + dist >>= 1 + + +# python "yield" can be iterated. use this to make it clear how +# the indices are generated by using natural-looking nested loops +def iterate_indices2(SVSHAPE, pred=None): + # this is a copy of iterate_indices with reversing `steps` removed since + # that is useless afaict. scan/prefix-sum support is also added. + + # get indices to iterate over, in the required order + xd = SVSHAPE.lims[0] + # create lists of indices to iterate over in each dimension + ix = list(range(xd)) + # invert the indices if needed + if SVSHAPE.invxyz[0]: + ix.reverse() + if SVSHAPE.order != [0, 1, 2] or any(SVSHAPE.invxyz[1:]): + raise ValueError("undefined") + if SVSHAPE.skip & 0b10: + # this is a scan/prefix-sum rather than a reduction. + if pred is not None and any(not pred[i] for i in ix): + raise ValueError("undefined") + results = [] + dist = 1 + while dist < xd: + start = dist * 2 - 1 + step = dist * 2 + for i in reversed(range(start, xd, step)): + lhs = i - dist + # yield f"items[{i}] = items[{lhs}] + items[{i}]" + v = ix[i if SVSHAPE.skip & 0b1 else lhs] + results.append([v + SVSHAPE.offset, 0]) + if results: + results[-1][1] |= 0b001 # end of inner loop + dist <<= 1 + if results: + results[-1][1] |= 0b010 # end of first loop nest + # express all output items in terms of the computed partial sums. + dist >>= 1 + while dist >= 1: + for i in reversed(range(dist * 3 - 1, xd, dist * 2)): + lhs = i - dist + # yield f"items[{i}] = items[{lhs}] + items[{i}]" + v = ix[i if SVSHAPE.skip & 0b1 else lhs] + results.append([v + SVSHAPE.offset, 0]) + if results: + results[-1][1] |= 0b001 # end of inner loop + dist >>= 1 + if results: + results[-1][1] = 0b111 # end of full algorithm + yield from results + return + # start a loop from the lowest step + step = 1 + while step < xd: + step *= 2 + stepend = step >= xd # note end of steps + results = [] + for i in range(0, xd, step): + other = i + step // 2 + ci = ix[i] + oi = ix[other] if other < xd else None + other_pred = other < xd and (pred is None or pred[oi]) + if (pred is None or pred[ci]) and other_pred: + if SVSHAPE.skip == 0b00: # submode 00 + result = ci + elif SVSHAPE.skip == 0b01: # submode 01 + result = oi + results.append([result + SVSHAPE.offset, 0]) + elif other_pred: + ix[i] = oi + if results: + results[-1][1] = (stepend << 1) | 1 # notify end of loops + yield from results + + +def demo_prefix_sum(): + # set the dimension sizes here + xdim = 9 + + # set up an SVSHAPE + class SVSHAPE: + pass + SVSHAPE0 = SVSHAPE() + SVSHAPE0.lims = [xdim, 0, 0] + SVSHAPE0.order = [0, 1, 2] + SVSHAPE0.mode = 0b10 + SVSHAPE0.skip = 0b10 # prefix-sum lhs + SVSHAPE0.offset = 0 # experiment with different offset, here + SVSHAPE0.invxyz = [0, 0, 0] # inversion if desired + + SVSHAPE1 = SVSHAPE() + SVSHAPE1.lims = [xdim, 0, 0] + SVSHAPE1.order = [0, 1, 2] + SVSHAPE1.mode = 0b10 + SVSHAPE1.skip = 0b11 # prefix-sum rhs + SVSHAPE1.offset = 0 # experiment with different offset, here + SVSHAPE1.invxyz = [0, 0, 0] # inversion if desired + + # enumerate over the iterator function, getting new indices + shapes = list(iterate_indices2(SVSHAPE0)), list(iterate_indices2(SVSHAPE1)) + for idx in range(len(shapes[0])): + l_idx, lend = shapes[0][idx] + r_idx, rend = shapes[1][idx] + # note RT is r_idx, not l_idx -- this is different than reduction + print(f"{idx}: r{r_idx} = r{l_idx} + r{r_idx} " + f"lend={lend:03b} rend={rend:03b}") + + # python "yield" can be iterated. use this to make it clear how # the indices are generated by using natural-looking nested loops def iterate_indices(SVSHAPE, pred=None): @@ -119,7 +251,11 @@ def preduce_y(vec, pred=None, operation=None): # run the demo if __name__ == '__main__': + print("reduction:") demo() + print("prefix-sum:") + demo_prefix_sum() + print("reduction results:") vec = [1, 2, 3, 4, 9, 5, 6] res = preduce_y(vec) print(vec) diff --git a/src/openpower/decoder/isa/test_remap_preduce_yield.py b/src/openpower/decoder/isa/test_remap_preduce_yield.py new file mode 100644 index 00000000..04b4d93f --- /dev/null +++ b/src/openpower/decoder/isa/test_remap_preduce_yield.py @@ -0,0 +1,74 @@ +import unittest +from openpower.decoder.isa.remap_preduce_yield import ( + prefix_sum_work_efficient, iterate_indices2) +from nmutil.prefix_sum import prefix_sum_ops, Op +from nmutil.plain_data import plain_data + + +@plain_data() +class MockSVShape: + __slots__ = "lims", "order", "mode", "skip", "offset", "invxyz" + + def __init__(self, lims, order, mode, skip, offset, invxyz): + # type: (list[int], list[int], int, int, int, list[int]) -> None + self.lims = lims + self.order = order + self.mode = mode + self.skip = skip + self.offset = offset + self.invxyz = invxyz + + +class TestRemapPrefixSum(unittest.TestCase): + def test_prefix_sum_work_efficient(self): + def fmt_op(op): + return f"items[{op.out}] = items[{op.lhs}] + items[{op.rhs}]" + for item_count in range(1, 16): + with self.subTest(item_count=item_count): + ops = list(prefix_sum_work_efficient(item_count)) + expected = prefix_sum_ops(item_count, work_efficient=True) + expected = list(map(fmt_op, expected)) + self.assertEqual(ops, expected) + + def iterate_indices2_helper(self, reverse, item_count, offset): + lhs_svshape = MockSVShape( + lims=[item_count, 0, 0], + order=[0, 1, 2], + mode=0b10, + skip=0b10, # prefix-sum lhs + offset=offset, + invxyz=[reverse, 0, 0], + ) + rhs_svshape = MockSVShape( + lims=[item_count, 0, 0], + order=[0, 1, 2], + mode=0b10, + skip=0b11, # prefix-sum rhs + offset=offset, + invxyz=[reverse, 0, 0], + ) + lhs_results = list(iterate_indices2(lhs_svshape)) + rhs_results = list(iterate_indices2(rhs_svshape)) + self.assertEqual(len(lhs_results), len(rhs_results)) + ops = [] + for (lhs, lend), (rhs, rend) in zip(lhs_results, rhs_results): + self.assertEqual(lend, rend) + ops.append(Op(out=rhs, lhs=lhs, rhs=rhs, row=0)) + expected = list(prefix_sum_ops(item_count, work_efficient=True)) + + def f(i): + return offset + (item_count - 1 - i if reverse else i) + expected = [Op( + out=f(i.out), lhs=f(i.lhs), rhs=f(i.rhs), row=0) for i in expected] + self.assertEqual(ops, expected) + + def test_iterate_indices2(self): + for r in (False, True): + for ic in range(1, 16): + for off in (0, 20): + with self.subTest(reverse=r, item_count=ic, offset=off): + self.iterate_indices2_helper(r, ic, off) + + +if __name__ == "__main__": + unittest.main() -- 2.30.2