1 # a "yield" version of the Parallel Reduction/Prefix-Sum REMAP algorithm.
2 # the algorithm is in-place. it does not perform "MV" operations.
3 # instead, where a masked-out value *should* be read from is tracked
4 from copy
import deepcopy
8 # first, we'll have a simpler prefix-sum algorithm so we can logically work our
9 # way up to the full algorithm:
10 def prefix_sum_work_efficient(item_count
):
12 # `nmutil.prefix_sum.prefix_sum_ops(..., work_efficient=True)`
14 # compute the partial sums using a set of binary trees
16 while dist
< item_count
:
19 for i
in reversed(range(start
, item_count
, step
)):
21 # order is important for non-commutative ops -- don't swap lhs/rhs
22 yield f
"items[{i}] = items[{lhs}] + items[{i}]"
24 # express all output items in terms of the computed partial sums.
27 for i
in reversed(range(dist
* 3 - 1, item_count
, dist
* 2)):
29 # order is important for non-commutative ops -- don't swap lhs/rhs
30 yield f
"items[{i}] = items[{lhs}] + items[{i}]"
34 # python "yield" can be iterated. use this to make it clear how
35 # the indices are generated by using natural-looking nested loops
36 def iterate_indices2(SVSHAPE
, pred
=None):
37 # this is a copy of iterate_indices with reversing `steps` removed since
38 # that is useless afaict. scan/prefix-sum support is also added.
40 # get indices to iterate over, in the required order
42 # create lists of indices to iterate over in each dimension
44 # invert the indices if needed
47 if SVSHAPE
.order
!= (0, 1, 2) or any(SVSHAPE
.invxyz
[1:]):
48 raise ValueError("undefined")
49 if SVSHAPE
.skip
& 0b10:
50 # this is a scan/prefix-sum rather than a reduction.
51 if pred
is not None and any(not pred
[i
] for i
in ix
):
52 raise ValueError("undefined")
58 for i
in reversed(range(start
, xd
, step
)):
60 # yield f"items[{i}] = items[{lhs}] + items[{i}]"
61 v
= ix
[i
if SVSHAPE
.skip
& 0b1 else lhs
]
62 results
.append([v
+ SVSHAPE
.offset
, 0])
64 results
[-1][1] |
= 0b001 # end of inner loop
67 results
[-1][1] |
= 0b010 # end of first loop nest
68 # express all output items in terms of the computed partial sums.
71 for i
in reversed(range(dist
* 3 - 1, xd
, dist
* 2)):
73 # yield f"items[{i}] = items[{lhs}] + items[{i}]"
74 v
= ix
[i
if SVSHAPE
.skip
& 0b1 else lhs
]
75 results
.append([v
+ SVSHAPE
.offset
, 0])
77 results
[-1][1] |
= 0b001 # end of inner loop
80 results
[-1][1] = 0b111 # end of full algorithm
83 # start a loop from the lowest step
87 stepend
= step
>= xd
# note end of steps
89 for i
in range(0, xd
, step
):
92 oi
= ix
[other
] if other
< xd
else None
93 other_pred
= other
< xd
and (pred
is None or pred
[oi
])
94 if (pred
is None or pred
[ci
]) and other_pred
:
95 if SVSHAPE
.skip
== 0b00: # submode 00
97 elif SVSHAPE
.skip
== 0b01: # submode 01
99 results
.append([result
+ SVSHAPE
.offset
, 0])
103 results
[-1][1] = (stepend
<< 1) |
1 # notify end of loops
107 def demo_prefix_sum():
108 # set the dimension sizes here
115 SVSHAPE0
.lims
= [xdim
, 0, 0]
116 SVSHAPE0
.order
= 0, 1, 2
118 SVSHAPE0
.skip
= 0b10 # prefix-sum lhs
119 SVSHAPE0
.offset
= 0 # experiment with different offset, here
120 SVSHAPE0
.invxyz
= [0, 0, 0] # inversion if desired
123 SVSHAPE1
.lims
= [xdim
, 0, 0]
124 SVSHAPE1
.order
= 0, 1, 2
126 SVSHAPE1
.skip
= 0b11 # prefix-sum rhs
127 SVSHAPE1
.offset
= 0 # experiment with different offset, here
128 SVSHAPE1
.invxyz
= [0, 0, 0] # inversion if desired
130 # enumerate over the iterator function, getting new indices
131 shapes
= list(iterate_indices2(SVSHAPE0
)), list(iterate_indices2(SVSHAPE1
))
132 for idx
in range(len(shapes
[0])):
133 l_idx
, lend
= shapes
[0][idx
]
134 r_idx
, rend
= shapes
[1][idx
]
135 # note RT is r_idx, not l_idx -- this is different than reduction
136 print(f
"{idx}: r{r_idx} = r{l_idx} + r{r_idx} "
137 f
"lend={lend:03b} rend={rend:03b}")
140 # python "yield" can be iterated. use this to make it clear how
141 # the indices are generated by using natural-looking nested loops
142 def iterate_indices(SVSHAPE
, pred
=None):
143 # get indices to iterate over, in the required order
145 # create lists of indices to iterate over in each dimension
147 # invert the indices if needed
148 if SVSHAPE
.invxyz
[0]:
150 # start a loop from the lowest step
156 # invert the indices if needed
157 if SVSHAPE
.invxyz
[1]:
160 stepend
= (step
== steps
[-1]) # note end of steps
161 idxs
= list(range(0, xd
, step
))
164 other
= i
+ step
// 2
166 oi
= ix
[other
] if other
< xd
else None
167 other_pred
= other
< xd
and (pred
is None or pred
[oi
])
168 if (pred
is None or pred
[ci
]) and other_pred
:
169 if SVSHAPE
.skip
== 0b00: # submode 00
171 elif SVSHAPE
.skip
== 0b01: # submode 01
173 results
.append([result
+ SVSHAPE
.offset
, 0])
177 results
[-1][1] = (stepend
<< 1) |
1 # notify end of loops
182 # set the dimension sizes here
189 SVSHAPE0
.lims
= [xdim
, 0, 0]
190 SVSHAPE0
.order
= [0, 1, 2]
193 SVSHAPE0
.offset
= 0 # experiment with different offset, here
194 SVSHAPE0
.invxyz
= [0, 0, 0] # inversion if desired
197 SVSHAPE1
.lims
= [xdim
, 0, 0]
198 SVSHAPE1
.order
= [0, 1, 2]
201 SVSHAPE1
.offset
= 0 # experiment with different offset, here
202 SVSHAPE1
.invxyz
= [0, 0, 0] # inversion if desired
204 # enumerate over the iterator function, getting new indices
205 shapes
= list(iterate_indices(SVSHAPE0
)), list(iterate_indices(SVSHAPE1
))
206 for idx
in range(len(shapes
[0])):
211 print("%d->%d:%d" % (idx
, l_idx
, r_idx
),
212 "end", bin(lend
)[2:], bin(rend
)[2:])
215 def preduce_y(vec
, pred
=None, operation
=None):
216 if operation
is None:
217 operation
= operator
.add
226 SVSHAPE0
.lims
= [xdim
, 0, 0]
227 SVSHAPE0
.order
= [0, 1, 2]
230 SVSHAPE0
.offset
= 0 # experiment with different offset, here
231 SVSHAPE0
.invxyz
= [0, 0, 0] # inversion if desired
234 SVSHAPE1
.lims
= [xdim
, 0, 0]
235 SVSHAPE1
.order
= [0, 1, 2]
238 SVSHAPE1
.offset
= 0 # experiment with different offset, here
239 SVSHAPE1
.invxyz
= [0, 0, 0] # inversion if desired
241 # enumerate over the iterator function, getting new indices
242 shapes
= list(iterate_indices(SVSHAPE0
)), list(iterate_indices(SVSHAPE1
))
243 for idx
in range(len(shapes
[0])):
248 res
[l_idx
] = operation(res
[l_idx
], res
[r_idx
])
253 if __name__
== '__main__':
258 print("reduction results:")
259 vec
= [1, 2, 3, 4, 9, 5, 6]