From: Luke Kenneth Casson Leighton Date: Wed, 13 Oct 2021 11:25:25 +0000 (+0100) Subject: waaay too big a patch set (957 lines). X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=5a9485ae7cbdc3c6a801bd91d87a9b06f8e48b40;p=ieee754fpu.git waaay too big a patch set (957 lines). https://bugs.libre-soc.org/show_bug.cgi?id=713#c98 Revert "refactor layout to use SimdScope and XLEN" This reverts commit 48c4d358fafb59fc7fdc414874c5288b81ec462d. --- diff --git a/src/ieee754/part/layout_experiment.py b/src/ieee754/part/layout_experiment.py index 26801a10..6ddea00d 100644 --- a/src/ieee754/part/layout_experiment.py +++ b/src/ieee754/part/layout_experiment.py @@ -22,7 +22,7 @@ from functools import reduce import operator from collections import defaultdict import dataclasses -from ieee754.part.util import XLEN, FpElWid, IntElWid, SimdMap, SimdScope + from ieee754.part_mul_add.partpoints import PartitionPoints @@ -49,220 +49,189 @@ class LayoutResult: return f"LayoutResult({fields})" -class SimdLayout(Shape): - def __init__(self, lane_shapes=None, signed=None, *, fixed_width=None, - width_follows_hint=True, scope=None): - """calculate a SIMD layout. - - Glossary: - * element: a single scalar value that is an element of a SIMD vector. - it has a width in bits, and a signedness. Every element is made of - 1 or more parts. An element optionally includes the padding - associated with it. - * lane: an element. An element optionally includes the padding - associated with it. - * ElWid: the element-width (really the element type) of an instruction. - Either an integer or a FP type. Integer `ElWid`s are sign-agnostic. - In Python, `ElWid` is either an enum type or is `int`. - Example `ElWid` definition for integers: - - class ElWid(Enum): - I8 = ... - I16 = ... - I32 = ... - I64 = ... - - Example `ElWid` definition for floats: - - class ElWid(Enum): - F16 = ... - BF16 = ... - F32 = ... - F64 = ... - * part: (not to be confused with a partition) A piece of a SIMD vector, - every SIMD vector is made of a non-negative integer of parts. - Elements are made of a power-of-two number of parts. A part is a - fixed number of bits wide for each different SIMD layout, it - doesn't vary when `elwid` changes. A part can have a bit width of - any non-negative integer, it is not restricted to power-of-two. - - - Arguments: - * lane_shapes: int or Mapping[ElWid, int] or SimdMap (optional) - the bit-width of all elements in this SIMD layout. - * signed: bool - the signedness of all elements in this SIMD layout - * fixed_width: int (optional) - the total width of a SIMD vector. One of lane_shapes and fixed_width - must be provided. - * width_follows_hint: bool - if fixed_width defaults to SimdScope.get().simd_full_width_hint - - Values used from SimdScope: - * elwid: ElWid or nmigen Value with ElWid as the shape - the current ElWid value - * part_counts: SimdMap - a map from `ElWid` values `k` to the number of parts in an element - when `elwid == k`. Values should be minimized, since higher values - often create bigger circuits. - - Example: - # here, an I8 element is 1 part wide - part_counts = SimdMap({ - IntElWid.I8: 1, - IntElWid.I16: 2, - IntElWid.I32: 4, - IntElWid.I64: 8, - }) - - Another Example: - # here, an F16 element is 1 part wide - part_counts = SimdMap({ - FpElWid.F16: 1, - FpElWid.BF16: 1, - FpElWid.F32: 2, - FpElWid.F64: 4, - }) - """ - if scope is None: - scope = SimdScope.get() - assert isinstance(scope, SimdScope) - self.scope = scope - elwid = self.scope.elwid - part_counts = self.scope.part_counts - assert isinstance(part_counts, SimdMap) - simd_full_width_hint = self.scope.simd_full_width_hint - full_part_count = self.scope.full_part_count - print(f"layout(elwid={elwid},\n" - f" signed={signed},\n" - f" part_counts={part_counts},\n" - f" lane_shapes={lane_shapes},\n" - f" fixed_width={fixed_width},\n" - f" simd_full_width_hint={simd_full_width_hint},\n" - f" width_follows_hint={width_follows_hint})") - - # when there are no lane_shapes specified, this indicates a - # desire to use the maximum available space based on the fixed width - # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67 - if lane_shapes is None: - assert fixed_width is not None, \ - "both fixed_width and lane_shapes cannot be None" - lane_shapes = {} - for k, cur_part_count in part_counts.items(): - cur_element_count = full_part_count // cur_part_count - assert fixed_width % cur_element_count == 0, ( - f"fixed_width ({fixed_width}) can't be split evenly into " - f"{cur_element_count} elements") - lane_shapes[k] = fixed_width // cur_element_count - print("lane_shapes", fixed_width, lane_shapes) - # convert lane_shapes to a Mapping[ElWid, Any] - lane_shapes = SimdMap(lane_shapes).mapping - # filter out unsupported elwidths - lane_shapes = {i: lane_shapes[i] for i in part_counts.keys()} - self.lane_shapes = lane_shapes - # calculate the minimum possible bit-width of a part. - # we divide each element's width by the number of parts in an element, - # giving the number of bits needed per part. - min_part_wid = 0 - for i, c in part_counts.items(): - # double negate to get ceil division - needed = -(-lane_shapes[i] // c) - min_part_wid = max(min_part_wid, needed) - # calculate the minimum bit-width required - min_width = min_part_wid * full_part_count - print("width", min_width, min_part_wid, full_part_count) - if width_follows_hint \ - and min_width <= simd_full_width_hint \ - and fixed_width is None: - fixed_width = simd_full_width_hint - - if fixed_width is not None: # override the width and part_wid - assert min_width <= fixed_width, \ - "not enough space to fit partitions" - self.part_wid = fixed_width // full_part_count - assert fixed_width % full_part_count == 0, \ - "fixed_width must be a multiple of full_part_count" - width = fixed_width - print("part_wid", self.part_wid, "count", full_part_count) - else: - # go with computed width - width = min_width - self.part_wid = min_part_wid - super().__init__(width, signed) - # create the breakpoints dictionary. - # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34 - # https://stackoverflow.com/questions/26367812/ - # dpoints: dict from bit-index to dict[ElWid, None] - # we use a dict from ElWid to None as the values of dpoints in order to - # get an ordered set - dpoints = defaultdict(dict) # if empty key, create a (empty) dict - for i, cur_part_count in part_counts.items(): - def add_p(bit_index): - # auto-creates dict if key non-existent - dpoints[bit_index][i] = None - # go through all elements for elwid `i`, each element starts at - # part index `start_part`, and goes for `cur_part_count` parts - for start_part in range(0, full_part_count, cur_part_count): - start_bit = start_part * self.part_wid - add_p(start_bit) # start of lane - add_p(start_bit + lane_shapes[i]) # start of padding - # do not need the breakpoints at the very start or the very end - dpoints.pop(0, None) - dpoints.pop(self.width, None) - plist = list(dpoints.keys()) - plist.sort() - dpoints = {k: dpoints[k].keys() for k in plist} - self.dpoints = dpoints - print("dpoints") - for k in plist: - print(f"{k}: {list(dpoints[k])}") - # second stage, add (map to) the elwidth==i expressions. - # TODO: use nmutil.treereduce? - points = {} - for p in plist: - it = map(lambda i: elwid == i, dpoints[p]) - points[p] = reduce(operator.or_, it) - # third stage, create the binary values which *if* elwidth is set to i - # *would* result in the mask at that elwidth being set to this value - # these can easily be double-checked through Assertion - self.bitp = {} - for i in part_counts.keys(): - self.bitp[i] = 0 - for p, elwidths in dpoints.items(): - if i in elwidths: - bitpos = plist.index(p) - self.bitp[i] |= 1 << bitpos - # fourth stage: determine which partitions are 100% unused. - # these can then be "blanked out" - self.bmask = (1 << len(plist)) - 1 - for p in self.bitp.values(): - self.bmask &= ~p - self.ppoints = PartitionPoints(points) - - def __repr__(self): - bitp = ", ".join(f"{k}: {bin(v)}" for k, v in self.bitp.items()) - dpoints = [] - for k, v in self.dpoints.items(): - dpoints.append(f"{k}: {list(v)}") - dpoints = ",\n ".join(dpoints) - ppoints = [] - for k, v in self.ppoints.items(): - ppoints.append(f"{k}: {list(v)}") - ppoints = ",\n ".join(ppoints) - return (f"SimdLayout(lane_shapes={self.lane_shapes},\n" - f" signed={self.signed},\n" - f" fixed_width={self.width},\n" - f" scope={self.scope},\n" - f" bitp={{{bitp}}},\n" - f" bmask={bin(self.bmask)},\n" - f" dpoints={{\n" - f" {dpoints}}},\n" - f" part_wid={self.part_wid},\n" - f" ppoints=PartitionPoints({{\n" - f" {ppoints}}}))") +# main fn, which started out here in the bugtracker: +# https://bugs.libre-soc.org/show_bug.cgi?id=713#c20 +def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None): + """calculate a SIMD layout. + + Glossary: + * element: a single scalar value that is an element of a SIMD vector. + it has a width in bits, and a signedness. Every element is made of 1 or + more parts. An element optionally includes the padding associated with + it. + * lane: an element. An element optionally includes the padding associated + with it. + * ElWid: the element-width (really the element type) of an instruction. + Either an integer or a FP type. Integer `ElWid`s are sign-agnostic. + In Python, `ElWid` is either an enum type or is `int`. + Example `ElWid` definition for integers: + + class ElWid(Enum): + I8 = ... + I16 = ... + I32 = ... + I64 = ... + + Example `ElWid` definition for floats: + + class ElWid(Enum): + F16 = ... + BF16 = ... + F32 = ... + F64 = ... + * part: (not to be confused with a partition) A piece of a SIMD vector, + every SIMD vector is made of a non-negative integer of parts. Elements + are made of a power-of-two number of parts. A part is a fixed number + of bits wide for each different SIMD layout, it doesn't vary when + `elwid` changes. A part can have a bit width of any non-negative + integer, it is not restricted to power-of-two. SIMD vectors should + have as few parts as necessary, since some circuits have size + proportional to the number of parts. + + + * elwid: ElWid or nmigen Value with ElWid as the shape + the current element-width + * signed: bool + the signedness of all elements in a SIMD layout + * part_counts: dict[ElWid, int] + a map from `ElWid` values `k` to the number of parts in an element + when `elwid == k`. Values should be minimized, since higher values + often create bigger circuits. + + Example: + # here, an I8 element is 1 part wide + part_counts = {ElWid.I8: 1, ElWid.I16: 2, ElWid.I32: 4, ElWid.I64: 8} + + Another Example: + # here, an F16 element is 1 part wide + part_counts = {ElWid.F16: 1, ElWid.BF16: 1, ElWid.F32: 2, ElWid.F64: 4} + * lane_shapes: int or Mapping[ElWid, int] (optional) + the bit-width of all elements in a SIMD layout. + * fixed_width: int (optional) + the total width of a SIMD vector. One of lane_shapes and fixed_width + must be provided. + """ + print(f"layout(elwid={elwid},\n" + f" signed={signed},\n" + f" part_counts={part_counts},\n" + f" lane_shapes={lane_shapes},\n" + f" fixed_width={fixed_width})") + assert isinstance(part_counts, Mapping) + # assert all part_counts are powers of two + assert all(v != 0 and (v & (v - 1)) == 0 for v in part_counts.values()),\ + "part_counts values must all be powers of two" + + full_part_count = max(part_counts.values()) + + # when there are no lane_shapes specified, this indicates a + # desire to use the maximum available space based on the fixed width + # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67 + if lane_shapes is None: + assert fixed_width is not None, \ + "both fixed_width and lane_shapes cannot be None" + lane_shapes = {} + for k, cur_part_count in part_counts.items(): + cur_element_count = full_part_count // cur_part_count + assert fixed_width % cur_element_count == 0, ( + f"fixed_width ({fixed_width}) can't be split evenly into " + f"{cur_element_count} elements") + lane_shapes[k] = fixed_width // cur_element_count + print("lane_shapes", fixed_width, lane_shapes) + # identify if the lane_shapes is a mapping (dict, etc.) + # if not, then assume that it is an integer (width) that + # needs to be requested across all partitions + if not isinstance(lane_shapes, Mapping): + lane_shapes = {i: lane_shapes for i in part_counts} + # calculate the minimum possible bit-width of a part. + # we divide each element's width by the number of parts in an element, + # giving the number of bits needed per part. + # we use `-min(-a // b for ...)` to get `max(ceil(a / b) for ...)`, + # but using integers. + min_part_wid = -min(-lane_shapes[i] // c for i, c in part_counts.items()) + # calculate the minimum bit-width required + min_width = min_part_wid * full_part_count + print("width", min_width, min_part_wid, full_part_count) + if fixed_width is not None: # override the width and part_wid + assert min_width <= fixed_width, "not enough space to fit partitions" + part_wid = fixed_width // full_part_count + assert fixed_width % full_part_count == 0, \ + "fixed_width must be a multiple of full_part_count" + width = fixed_width + print("part_wid", part_wid, "count", full_part_count) + else: + # go with computed width + width = min_width + part_wid = min_part_wid + # create the breakpoints dictionary. + # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34 + # https://stackoverflow.com/questions/26367812/ + # dpoints: dict from bit-index to dict[ElWid, None] + # we use a dict from ElWid to None as the values of dpoints in order to + # get an ordered set + dpoints = defaultdict(dict) # if empty key, create a (empty) dict + for i, cur_part_count in part_counts.items(): + def add_p(bit_index): + # auto-creates dict if key non-existent + dpoints[bit_index][i] = None + # go through all elements for elwid `i`, each element starts at + # part index `start_part`, and goes for `cur_part_count` parts + for start_part in range(0, full_part_count, cur_part_count): + start_bit = start_part * part_wid + add_p(start_bit) # start of lane + add_p(start_bit + lane_shapes[i]) # start of padding + # do not need the breakpoints at the very start or the very end + dpoints.pop(0, None) + dpoints.pop(width, None) + plist = list(dpoints.keys()) + plist.sort() + print("dpoints") + for k in plist: + print(f"{k}: {list(dpoints[k].keys())}") + # second stage, add (map to) the elwidth==i expressions. + # TODO: use nmutil.treereduce? + points = {} + for p in plist: + it = map(lambda i: elwid == i, dpoints[p]) + points[p] = reduce(operator.or_, it) + # third stage, create the binary values which *if* elwidth is set to i + # *would* result in the mask at that elwidth being set to this value + # these can easily be double-checked through Assertion + bitp = {} + for i in part_counts.keys(): + bitp[i] = 0 + for p, elwidths in dpoints.items(): + if i in elwidths: + bitpos = plist.index(p) + bitp[i] |= 1 << bitpos + # fourth stage: determine which partitions are 100% unused. + # these can then be "blanked out" + bmask = (1 << len(plist)) - 1 + for p in bitp.values(): + bmask &= ~p + return LayoutResult(PartitionPoints(points), bitp, bmask, width, + lane_shapes, part_wid, full_part_count) if __name__ == '__main__': + + class FpElWid(Enum): + F64 = 0 + F32 = 1 + F16 = 2 + BF16 = 3 + + def __repr__(self): + return super().__str__() + + class IntElWid(Enum): + I64 = 0 + I32 = 1 + I16 = 2 + I8 = 3 + + def __repr__(self): + return super().__str__() + # for each element-width (elwidth 0-3) the number of parts in an element # is given: # | part0 | part1 | part2 | part3 | @@ -286,8 +255,7 @@ if __name__ == '__main__': width_for_all_els = 3 for i in FpElWid: - with SimdScope(elwid=i, part_counts=part_counts): - print(i, SimdLayout(width_for_all_els, True, width_follows_hint=False)) + print(i, layout(i, True, part_counts, width_for_all_els)) # fixed_width=32 and no lane_widths says "allocate maximum" # elwid=F64 1x 32-bit |<-------i32------->| @@ -297,8 +265,7 @@ if __name__ == '__main__': print("maximum allocation from fixed_width=32") for i in FpElWid: - with SimdScope(elwid=i, part_counts=part_counts): - print(i, SimdLayout(signed=True, fixed_width=32)) + print(i, layout(i, True, part_counts, fixed_width=32)) # specify that the length is to be *different* at each of the elwidths. # combined with part_counts we have: @@ -314,18 +281,18 @@ if __name__ == '__main__': } for i in FpElWid: - with SimdScope(elwid=i, part_counts=part_counts): - print(i, SimdLayout(widths_at_elwidth, - False, width_follows_hint=False)) + print(i, layout(i, False, part_counts, widths_at_elwidth)) # this tests elwidth as an actual Signal. layout is allowed to # determine arbitrarily the overall length # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30 - with SimdScope(elwid_type=FpElWid, part_counts=part_counts) as scope: - l = SimdLayout(widths_at_elwidth, False, width_follows_hint=False) - elwid = scope.elwid - print(l) + elwid = Signal(FpElWid) + lr = layout(elwid, False, part_counts, widths_at_elwidth) + print(lr) + for k, v in lr.bitp.items(): + print(f"bitp elwidth={k}", bin(v)) + print("bmask", bin(lr.bmask)) m = Module() @@ -334,7 +301,7 @@ if __name__ == '__main__': yield elwid.eq(i) yield Settle() ppt = [] - for pval in l.ppoints.values(): + for pval in lr.ppoints.values(): val = yield pval # get nmigen to evaluate pp ppt.append(val) print(i, ppt) @@ -342,36 +309,7 @@ if __name__ == '__main__': # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47 # https://stackoverflow.com/a/27165694 ival = int(''.join(map(str, ppt[::-1])), 2) - assert ival == l.bitp[i] - - sim = Simulator(m) - sim.add_process(process) - sim.run() - - # this tests elwidth as an actual Signal. layout uses the width hint - # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30 - - with SimdScope(elwid_type=FpElWid, part_counts=part_counts) as scope: - l = SimdLayout(widths_at_elwidth, False) - elwid = scope.elwid - print(l) - - m = Module() - - def process(): - for i in FpElWid: - yield elwid.eq(i) - yield Settle() - ppt = [] - for pval in l.ppoints.values(): - val = yield pval # get nmigen to evaluate pp - ppt.append(val) - print(i, ppt) - # check the results against bitp static-expected partition points - # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47 - # https://stackoverflow.com/a/27165694 - ival = int(''.join(map(str, ppt[::-1])), 2) - assert ival == l.bitp[i] + assert ival == lr.bitp[i] sim = Simulator(m) sim.add_process(process) @@ -381,10 +319,12 @@ if __name__ == '__main__': # determine arbitrarily the overall length, it is fixed to 64 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22 - with SimdScope(elwid_type=FpElWid, part_counts=part_counts) as scope: - l = SimdLayout(widths_at_elwidth, False, fixed_width=64) - elwid = scope.elwid - print(l) + elwid = Signal(FpElWid) + lr = layout(elwid, False, part_counts, widths_at_elwidth, fixed_width=64) + print(lr) + for k, v in lr.bitp.items(): + print(f"bitp elwidth={k}", bin(v)) + print("bmask", bin(lr.bmask)) m = Module() @@ -393,7 +333,7 @@ if __name__ == '__main__': yield elwid.eq(i) yield Settle() ppt = [] - for pval in list(l.ppoints.values()): + for pval in list(lr.ppoints.values()): val = yield pval # get nmigen to evaluate pp ppt.append(val) print(f"test elwidth={i}") @@ -402,18 +342,9 @@ if __name__ == '__main__': # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47 # https://stackoverflow.com/a/27165694 ival = int(''.join(map(str, ppt[::-1])), 2) - assert ival == l.bitp[i], \ - f"ival {bin(ival)} actual {bin(l.bitp[i])}" + assert ival == lr.bitp[i], \ + f"ival {bin(ival)} actual {bin(lr.bitp[i])}" sim = Simulator(m) sim.add_process(process) sim.run() - - # test XLEN - with SimdScope(elwid_type=IntElWid): - print("\nSimdLayout(XLEN):") - l1 = SimdLayout(XLEN) - print(l1) - print("\nSimdLayout(XLEN // 2):") - l2 = SimdLayout(XLEN // 2) - print(l2)