From 6da0664c185dae796e3ba9a599bfe0b5371327c3 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Wed, 13 Oct 2021 12:28:13 +0100
Subject: [PATCH] unfortunately this is a multi-purpose commit.

it:

1) adds some types AND
2) adds some classes AND
3) fixes some whitespace issues AND
4) (finally) fixes some bugs

we strictly require *single* purpose commits
https://libre-soc.org/HDL_workflow/

Keep commits single-purpose

    edit files making minimal single purpose modifications
    (even if it involves multiple files. Good extreme example:
    globally changing a function name across an entire codebase
    is one purpose, one commit, yet hundreds of files.
    miss out one of those files, requiring multiple commits,
    and it actually becomes a nuisance).

https://bugs.libre-soc.org/show_bug.cgi?id=713#c98

Revert "fix layout bugs"

This reverts commit 9a318256b74054b8d592efe7be298764d0de415a.
---
 src/ieee754/part/layout_experiment.py | 266 ++++++++++----------------
 1 file changed, 96 insertions(+), 170 deletions(-)

diff --git a/src/ieee754/part/layout_experiment.py b/src/ieee754/part/layout_experiment.py
index 6ddea00d..346b8595 100644
--- a/src/ieee754/part/layout_experiment.py
+++ b/src/ieee754/part/layout_experiment.py
@@ -13,42 +13,18 @@ Links:
 """
 
 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
-from nmigen.sim import Simulator, Delay, Settle
+from nmigen.back.pysim import Simulator, Delay, Settle
 from nmigen.cli import rtlil
-from enum import Enum
 
 from collections.abc import Mapping
 from functools import reduce
 import operator
 from collections import defaultdict
-import dataclasses
+from pprint import pprint
 
 from ieee754.part_mul_add.partpoints import PartitionPoints
 
 
-@dataclasses.dataclass
-class LayoutResult:
-    ppoints: PartitionPoints
-    bitp: dict
-    bmask: int
-    width: int
-    lane_shapes: dict
-    part_wid: int
-    full_part_count: int
-
-    def __repr__(self):
-        fields = []
-        for field in dataclasses.fields(LayoutResult):
-            field_v = getattr(self, field.name)
-            if isinstance(field_v, PartitionPoints):
-                field_v = ',\n        '.join(
-                    f"{k}: {v}" for k, v in field_v.items())
-                field_v = f"{{{field_v}}}"
-            fields.append(f"{field.name}={field_v}")
-        fields = ",\n    ".join(fields)
-        return f"LayoutResult({fields})"
-
-
 # main fn, which started out here in the bugtracker:
 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
 def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None):
@@ -57,10 +33,7 @@ def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None):
     Glossary:
     * element: a single scalar value that is an element of a SIMD vector.
         it has a width in bits, and a signedness. Every element is made of 1 or
-        more parts. An element optionally includes the padding associated with
-        it.
-    * lane: an element. An element optionally includes the padding associated
-        with it.
+        more parts.
     * ElWid: the element-width (really the element type) of an instruction.
         Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
         In Python, `ElWid` is either an enum type or is `int`.
@@ -79,14 +52,13 @@ def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None):
             BF16 = ...
             F32 = ...
             F64 = ...
-    * part: (not to be confused with a partition) A piece of a SIMD vector,
-        every SIMD vector is made of a non-negative integer of parts. Elements
-        are made of a power-of-two number of parts. A part is a fixed number
-        of bits wide for each different SIMD layout, it doesn't vary when
-        `elwid` changes. A part can have a bit width of any non-negative
-        integer, it is not restricted to power-of-two. SIMD vectors should
-        have as few parts as necessary, since some circuits have size
-        proportional to the number of parts.
+    * part: A piece of a SIMD vector, every SIMD vector is made of a
+        non-negative integer of parts. Elements are made of a power-of-two
+        number of parts. A part is a fixed number of bits wide for each
+        different SIMD layout, it doesn't vary when `elwid` changes. A part
+        can have a bit width of any non-negative integer, it is not restricted
+        to power-of-two. SIMD vectors should have as few parts as necessary,
+        since some circuits have size proportional to the number of parts.
 
 
     * elwid: ElWid or nmigen Value with ElWid as the shape
@@ -111,88 +83,60 @@ def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None):
         the total width of a SIMD vector. One of lane_shapes and fixed_width
         must be provided.
     """
-    print(f"layout(elwid={elwid},\n"
-          f"    signed={signed},\n"
-          f"    part_counts={part_counts},\n"
-          f"    lane_shapes={lane_shapes},\n"
-          f"    fixed_width={fixed_width})")
-    assert isinstance(part_counts, Mapping)
-    # assert all part_counts are powers of two
-    assert all(v != 0 and (v & (v - 1)) == 0 for v in part_counts.values()),\
-        "part_counts values must all be powers of two"
-
-    full_part_count = max(part_counts.values())
-
     # when there are no lane_shapes specified, this indicates a
     # desire to use the maximum available space based on the fixed width
     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
     if lane_shapes is None:
         assert fixed_width is not None, \
             "both fixed_width and lane_shapes cannot be None"
-        lane_shapes = {}
-        for k, cur_part_count in part_counts.items():
-            cur_element_count = full_part_count // cur_part_count
-            assert fixed_width % cur_element_count == 0, (
-                f"fixed_width ({fixed_width}) can't be split evenly into "
-                f"{cur_element_count} elements")
-            lane_shapes[k] = fixed_width // cur_element_count
+        lane_shapes = {i: fixed_width // part_counts[i] for i in part_counts}
         print("lane_shapes", fixed_width, lane_shapes)
     # identify if the lane_shapes is a mapping (dict, etc.)
     # if not, then assume that it is an integer (width) that
     # needs to be requested across all partitions
     if not isinstance(lane_shapes, Mapping):
         lane_shapes = {i: lane_shapes for i in part_counts}
-    # calculate the minimum possible bit-width of a part.
-    # we divide each element's width by the number of parts in an element,
-    # giving the number of bits needed per part.
-    # we use `-min(-a // b for ...)` to get `max(ceil(a / b) for ...)`,
-    # but using integers.
-    min_part_wid = -min(-lane_shapes[i] // c for i, c in part_counts.items())
-    # calculate the minimum bit-width required
-    min_width = min_part_wid * full_part_count
-    print("width", min_width, min_part_wid, full_part_count)
+    # compute a set of partition widths
+    cpart_wid = [-lane_shapes[i] // c for i, c in part_counts.items()]
+    print("cpart_wid", cpart_wid, "part_counts", part_counts)
+    cpart_wid = -min(cpart_wid)
+    part_count = max(part_counts.values())
+    # calculate the minumum width required
+    width = cpart_wid * part_count
+    print("width", width, cpart_wid, part_count)
     if fixed_width is not None:  # override the width and part_wid
-        assert min_width <= fixed_width, "not enough space to fit partitions"
-        part_wid = fixed_width // full_part_count
-        assert fixed_width % full_part_count == 0, \
-            "fixed_width must be a multiple of full_part_count"
+        assert width < fixed_width, "not enough space to fit partitions"
+        part_wid = fixed_width // part_count
+        assert part_wid * part_count == fixed_width, \
+            "calculated width not aligned multiples"
         width = fixed_width
-        print("part_wid", part_wid, "count", full_part_count)
+        print("part_wid", part_wid, "count", part_count)
     else:
         # go with computed width
-        width = min_width
-        part_wid = min_part_wid
+        part_wid = cpart_wid
     # create the breakpoints dictionary.
     # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
     # https://stackoverflow.com/questions/26367812/
-    # dpoints: dict from bit-index to dict[ElWid, None]
-    # we use a dict from ElWid to None as the values of dpoints in order to
-    # get an ordered set
-    dpoints = defaultdict(dict)  # if empty key, create a (empty) dict
-    for i, cur_part_count in part_counts.items():
-        def add_p(bit_index):
-            # auto-creates dict if key non-existent
-            dpoints[bit_index][i] = None
-        # go through all elements for elwid `i`, each element starts at
-        # part index `start_part`, and goes for `cur_part_count` parts
-        for start_part in range(0, full_part_count, cur_part_count):
-            start_bit = start_part * part_wid
-            add_p(start_bit)  # start of lane
-            add_p(start_bit + lane_shapes[i])  # start of padding
+    dpoints = defaultdict(list)  # if empty key, create a (empty) list
+    for i, c in part_counts.items():
+        def add_p(p):
+            dpoints[p].append(i)  # auto-creates list if key non-existent
+        for start in range(0, part_count, c):
+            add_p(start * part_wid)  # start of lane
+            add_p(start * part_wid + lane_shapes[i])  # start of padding
     # do not need the breakpoints at the very start or the very end
     dpoints.pop(0, None)
     dpoints.pop(width, None)
     plist = list(dpoints.keys())
     plist.sort()
     print("dpoints")
-    for k in plist:
-        print(f"{k}: {list(dpoints[k].keys())}")
+    pprint(dict(dpoints))
     # second stage, add (map to) the elwidth==i expressions.
     # TODO: use nmutil.treereduce?
     points = {}
     for p in plist:
-        it = map(lambda i: elwid == i, dpoints[p])
-        points[p] = reduce(operator.or_, it)
+        points[p] = map(lambda i: elwid == i, dpoints[p])
+        points[p] = reduce(operator.or_, points[p])
     # third stage, create the binary values which *if* elwidth is set to i
     # *would* result in the mask at that elwidth being set to this value
     # these can easily be double-checked through Assertion
@@ -205,111 +149,92 @@ def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None):
                 bitp[i] |= 1 << bitpos
     # fourth stage: determine which partitions are 100% unused.
     # these can then be "blanked out"
-    bmask = (1 << len(plist)) - 1
+    bmask = (1 << len(plist))-1
     for p in bitp.values():
         bmask &= ~p
-    return LayoutResult(PartitionPoints(points), bitp, bmask, width,
-                        lane_shapes, part_wid, full_part_count)
+    return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
+            part_wid, part_count)
 
 
 if __name__ == '__main__':
 
-    class FpElWid(Enum):
-        F64 = 0
-        F32 = 1
-        F16 = 2
-        BF16 = 3
-
-        def __repr__(self):
-            return super().__str__()
-
-    class IntElWid(Enum):
-        I64 = 0
-        I32 = 1
-        I16 = 2
-        I8 = 3
-
-        def __repr__(self):
-            return super().__str__()
-
-    # for each element-width (elwidth 0-3) the number of parts in an element
-    # is given:
-    #                                | part0 | part1 | part2 | part3 |
-    # elwid=F64 4 parts per element: |<-------------F64------------->|
-    # elwid=F32 2 parts per element: |<-----F32----->|<-----F32----->|
-    # elwid=F16 1 part per element:  |<-F16->|<-F16->|<-F16->|<-F16->|
-    # elwid=BF16 1 part per element: |<BF16->|<BF16->|<BF16->|<BF16->|
+    # for each element-width (elwidth 0-3) the number of partitions is given
+    # elwidth=0b00 QTY 1 partitions:   |          ?          |
+    # elwidth=0b01 QTY 1 partitions:   |          ?          |
+    # elwidth=0b10 QTY 2 partitions:   |    ?     |     ?    |
+    # elwidth=0b11 QTY 4 partitions:   | ?  |  ?  |  ?  | ?  |
     # actual widths of Signals *within* those partitions is given separately
     part_counts = {
-        FpElWid.F64: 4,
-        FpElWid.F32: 2,
-        FpElWid.F16: 1,
-        FpElWid.BF16: 1,
+        0: 1,
+        1: 1,
+        2: 2,
+        3: 4,
     }
 
-    # width=3 indicates "we want the same element bit-width (3) at all elwids"
-    # elwid=F64 1x 3-bit     |<--------i3------->|
-    # elwid=F32 2x 3-bit     |<---i3-->|<---i3-->|
-    # elwid=F16 4x 3-bit     |<i3>|<i3>|<i3>|<i3>|
-    # elwid=BF16 4x 3-bit    |<i3>|<i3>|<i3>|<i3>|
-    width_for_all_els = 3
+    # width=3 indicates "we want the same width (3) at all elwidths"
+    # elwidth=0b00 1x 5-bit     |                 ..3 |
+    # elwidth=0b01 1x 6-bit     |                 ..3 |
+    # elwidth=0b10 2x 12-bit    |      ..3 |      ..3 |
+    # elwidth=0b11 3x 24-bit    | ..3| ..3 | ..3 |..3 |
+    width_in_all_parts = 3
 
-    for i in FpElWid:
-        print(i, layout(i, True, part_counts, width_for_all_els))
+    for i in range(4):
+        pprint((i, layout(i, True, part_counts, width_in_all_parts)))
 
     # fixed_width=32 and no lane_widths says "allocate maximum"
-    # elwid=F64 1x 32-bit    |<-------i32------->|
-    # elwid=F32 2x 16-bit    |<--i16-->|<--i16-->|
-    # elwid=F16 4x 8-bit     |<i8>|<i8>|<i8>|<i8>|
-    # elwid=BF16 4x 8-bit    |<i8>|<i8>|<i8>|<i8>|
+    # elwidth=0b00 1x 32-bit    | .................32 |
+    # elwidth=0b01 1x 32-bit    | .................32 |
+    # elwidth=0b10 2x 12-bit    | ......16 | ......16 |
+    # elwidth=0b11 3x 24-bit    | ..8| ..8 | ..8 |..8 |
 
-    print("maximum allocation from fixed_width=32")
-    for i in FpElWid:
-        print(i, layout(i, True, part_counts, fixed_width=32))
+    #print ("maximum allocation from fixed_width=32")
+    # for i in range(4):
+    #    pprint((i, layout(i, True, part_counts, fixed_width=32)))
 
     # specify that the length is to be *different* at each of the elwidths.
     # combined with part_counts we have:
-    # elwid=F64 1x 24-bit    |<-------i24------->|
-    # elwid=F32 2x 12-bit    |<--i12-->|<--i12-->|
-    # elwid=F16 4x 6-bit     |<i6>|<i6>|<i6>|<i6>|
-    # elwid=BF16 4x 5-bit    |<i5>|<i5>|<i5>|<i5>|
+    # elwidth=0b00 1x 5-bit     |               ....5 |
+    # elwidth=0b01 1x 6-bit     |              .....6 |
+    # elwidth=0b10 2x 12-bit    |   ....12 |  .....12 |
+    # elwidth=0b11 3x 24-bit    | 24 |  24 |  24 | 24 |
     widths_at_elwidth = {
-        FpElWid.F64: 24,
-        FpElWid.F32: 12,
-        FpElWid.F16: 6,
-        FpElWid.BF16: 5,
+        0: 5,
+        1: 6,
+        2: 12,
+        3: 24
     }
 
-    for i in FpElWid:
-        print(i, layout(i, False, part_counts, widths_at_elwidth))
+    for i in range(4):
+        pprint((i, layout(i, False, part_counts, widths_at_elwidth)))
 
     # this tests elwidth as an actual Signal. layout is allowed to
     # determine arbitrarily the overall length
     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
 
-    elwid = Signal(FpElWid)
-    lr = layout(elwid, False, part_counts, widths_at_elwidth)
-    print(lr)
-    for k, v in lr.bitp.items():
-        print(f"bitp elwidth={k}", bin(v))
-    print("bmask", bin(lr.bmask))
+    elwid = Signal(2)
+    pp, bitp, bm, b, c, d, e = layout(
+        elwid, False, part_counts, widths_at_elwidth)
+    pprint((pp, b, c, d, e))
+    for k, v in bitp.items():
+        print("bitp elwidth=%d" % k, bin(v))
+    print("bmask", bin(bm))
 
     m = Module()
 
     def process():
-        for i in FpElWid:
+        for i in range(4):
             yield elwid.eq(i)
             yield Settle()
             ppt = []
-            for pval in lr.ppoints.values():
+            for pval in list(pp.values()):
                 val = yield pval  # get nmigen to evaluate pp
                 ppt.append(val)
-            print(i, ppt)
+            pprint((i, (ppt, b, c, d, e)))
             # check the results against bitp static-expected partition points
             # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
             # https://stackoverflow.com/a/27165694
             ival = int(''.join(map(str, ppt[::-1])), 2)
-            assert ival == lr.bitp[i]
+            assert ival == bitp[i]
 
     sim = Simulator(m)
     sim.add_process(process)
@@ -319,31 +244,32 @@ if __name__ == '__main__':
     # determine arbitrarily the overall length, it is fixed to 64
     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
 
-    elwid = Signal(FpElWid)
-    lr = layout(elwid, False, part_counts, widths_at_elwidth, fixed_width=64)
-    print(lr)
-    for k, v in lr.bitp.items():
-        print(f"bitp elwidth={k}", bin(v))
-    print("bmask", bin(lr.bmask))
+    elwid = Signal(2)
+    pp, bitp, bm, b, c, d, e = layout(elwid, False, part_counts, widths_at_elwidth,
+                                      fixed_width=64)
+    pprint((pp, b, c, d, e))
+    for k, v in bitp.items():
+        print("bitp elwidth=%d" % k, bin(v))
+    print("bmask", bin(bm))
 
     m = Module()
 
     def process():
-        for i in FpElWid:
+        for i in range(4):
             yield elwid.eq(i)
             yield Settle()
             ppt = []
-            for pval in list(lr.ppoints.values()):
+            for pval in list(pp.values()):
                 val = yield pval  # get nmigen to evaluate pp
                 ppt.append(val)
-            print(f"test elwidth={i}")
-            print(i, ppt)
+            print("test elwidth=%d" % i)
+            pprint((i, (ppt, b, c, d, e)))
             # check the results against bitp static-expected partition points
             # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
             # https://stackoverflow.com/a/27165694
             ival = int(''.join(map(str, ppt[::-1])), 2)
-            assert ival == lr.bitp[i], \
-                f"ival {bin(ival)} actual {bin(lr.bitp[i])}"
+            assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
+                                                           bin(bitp[i]))
 
     sim = Simulator(m)
     sim.add_process(process)
-- 
2.30.2