src/ieee754/part/layout_experiment.py

   1 #!/usr/bin/env python3
   2 # SPDX-License-Identifier: LGPL-3-or-later
   3 # See Notices.txt for copyright information
   4 """
   5 Links:
   6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
   7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
   8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
   9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
  10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
  11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
  12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
  13 """
  14
  15 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
  16 from nmigen.back.pysim import Simulator, Delay, Settle
  17 from nmigen.cli import rtlil
  18
  19 from collections.abc import Mapping
  20 from functools import reduce
  21 import operator
  22 from collections import defaultdict
  23 from pprint import pprint
  24
  25 from ieee754.part_mul_add.partpoints import PartitionPoints
  26
  27
  28 # main fn, which started out here in the bugtracker:
  29 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
  30 def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None):
  31     """calculate a SIMD layout.
  32
  33     Glossary:
  34     * element: a single scalar value that is an element of a SIMD vector.
  35         it has a width in bits, and a signedness. Every element is made of 1 or
  36         more parts.
  37     * ElWid: the element-width (really the element type) of an instruction.
  38         Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
  39         In Python, `ElWid` is either an enum type or is `int`.
  40         Example `ElWid` definition for integers:
  41
  42         class ElWid(Enum):
  43             I8 = ...
  44             I16 = ...
  45             I32 = ...
  46             I64 = ...
  47
  48         Example `ElWid` definition for floats:
  49
  50         class ElWid(Enum):
  51             F16 = ...
  52             BF16 = ...
  53             F32 = ...
  54             F64 = ...
  55     * part: A piece of a SIMD vector, every SIMD vector is made of a
  56         non-negative integer of parts. Elements are made of a power-of-two
  57         number of parts. A part is a fixed number of bits wide for each
  58         different SIMD layout, it doesn't vary when `elwid` changes. A part
  59         can have a bit width of any non-negative integer, it is not restricted
  60         to power-of-two. SIMD vectors should have as few parts as necessary,
  61         since some circuits have size proportional to the number of parts.
  62
  63
  64     * elwid: ElWid or nmigen Value with ElWid as the shape
  65         the current element-width
  66     * signed: bool
  67         the signedness of all elements in a SIMD layout
  68     * part_counts: dict[ElWid, int]
  69         a map from `ElWid` values `k` to the number of parts in an element
  70         when `elwid == k`. Values should be minimized, since higher values
  71         often create bigger circuits.
  72
  73         Example:
  74         # here, an I8 element is 1 part wide
  75         part_counts = {ElWid.I8: 1, ElWid.I16: 2, ElWid.I32: 4, ElWid.I64: 8}
  76
  77         Another Example:
  78         # here, an F16 element is 1 part wide
  79         part_counts = {ElWid.F16: 1, ElWid.BF16: 1, ElWid.F32: 2, ElWid.F64: 4}
  80     * lane_shapes: int or Mapping[ElWid, int] (optional)
  81         the bit-width of all elements in a SIMD layout.
  82     * fixed_width: int (optional)
  83         the total width of a SIMD vector. One of lane_shapes and fixed_width
  84         must be provided.
  85     """
  86     # when there are no lane_shapes specified, this indicates a
  87     # desire to use the maximum available space based on the fixed width
  88     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
  89     if lane_shapes is None:
  90         assert fixed_width is not None, \
  91             "both fixed_width and lane_shapes cannot be None"
  92         lane_shapes = {i: fixed_width // part_counts[i] for i in part_counts}
  93         print("lane_shapes", fixed_width, lane_shapes)
  94     # identify if the lane_shapes is a mapping (dict, etc.)
  95     # if not, then assume that it is an integer (width) that
  96     # needs to be requested across all partitions
  97     if not isinstance(lane_shapes, Mapping):
  98         lane_shapes = {i: lane_shapes for i in part_counts}
  99     # compute a set of partition widths
 100     cpart_wid = [-lane_shapes[i] // c for i, c in part_counts.items()]
 101     print("cpart_wid", cpart_wid, "part_counts", part_counts)
 102     cpart_wid = -min(cpart_wid)
 103     part_count = max(part_counts.values())
 104     # calculate the minumum width required
 105     width = cpart_wid * part_count
 106     print("width", width, cpart_wid, part_count)
 107     if fixed_width is not None:  # override the width and part_wid
 108         assert width < fixed_width, "not enough space to fit partitions"
 109         part_wid = fixed_width // part_count
 110         assert part_wid * part_count == fixed_width, \
 111             "calculated width not aligned multiples"
 112         width = fixed_width
 113         print("part_wid", part_wid, "count", part_count)
 114     else:
 115         # go with computed width
 116         part_wid = cpart_wid
 117     # create the breakpoints dictionary.
 118     # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
 119     # https://stackoverflow.com/questions/26367812/
 120     dpoints = defaultdict(list)  # if empty key, create a (empty) list
 121     for i, c in part_counts.items():
 122         def add_p(p):
 123             dpoints[p].append(i)  # auto-creates list if key non-existent
 124         for start in range(0, part_count, c):
 125             add_p(start * part_wid)  # start of lane
 126             add_p(start * part_wid + lane_shapes[i])  # start of padding
 127     # do not need the breakpoints at the very start or the very end
 128     dpoints.pop(0, None)
 129     dpoints.pop(width, None)
 130     plist = list(dpoints.keys())
 131     plist.sort()
 132     print("dpoints")
 133     pprint(dict(dpoints))
 134     # second stage, add (map to) the elwidth==i expressions.
 135     # TODO: use nmutil.treereduce?
 136     points = {}
 137     for p in plist:
 138         points[p] = map(lambda i: elwid == i, dpoints[p])
 139         points[p] = reduce(operator.or_, points[p])
 140     # third stage, create the binary values which *if* elwidth is set to i
 141     # *would* result in the mask at that elwidth being set to this value
 142     # these can easily be double-checked through Assertion
 143     bitp = {}
 144     for i in part_counts.keys():
 145         bitp[i] = 0
 146         for p, elwidths in dpoints.items():
 147             if i in elwidths:
 148                 bitpos = plist.index(p)
 149                 bitp[i] |= 1 << bitpos
 150     # fourth stage: determine which partitions are 100% unused.
 151     # these can then be "blanked out"
 152     bmask = (1 << len(plist))-1
 153     for p in bitp.values():
 154         bmask &= ~p
 155     return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
 156             part_wid, part_count)
 157
 158
 159 if __name__ == '__main__':
 160
 161     # for each element-width (elwidth 0-3) the number of partitions is given
 162     # elwidth=0b00 QTY 1 partitions:   |          ?          |
 163     # elwidth=0b01 QTY 1 partitions:   |          ?          |
 164     # elwidth=0b10 QTY 2 partitions:   |    ?     |     ?    |
 165     # elwidth=0b11 QTY 4 partitions:   | ?  |  ?  |  ?  | ?  |
 166     # actual widths of Signals *within* those partitions is given separately
 167     part_counts = {
 168         0: 1,
 169         1: 1,
 170         2: 2,
 171         3: 4,
 172     }
 173
 174     # width=3 indicates "we want the same width (3) at all elwidths"
 175     # elwidth=0b00 1x 5-bit     |                 ..3 |
 176     # elwidth=0b01 1x 6-bit     |                 ..3 |
 177     # elwidth=0b10 2x 12-bit    |      ..3 |      ..3 |
 178     # elwidth=0b11 3x 24-bit    | ..3| ..3 | ..3 |..3 |
 179     width_in_all_parts = 3
 180
 181     for i in range(4):
 182         pprint((i, layout(i, True, part_counts, width_in_all_parts)))
 183
 184     # fixed_width=32 and no lane_widths says "allocate maximum"
 185     # elwidth=0b00 1x 32-bit    | .................32 |
 186     # elwidth=0b01 1x 32-bit    | .................32 |
 187     # elwidth=0b10 2x 12-bit    | ......16 | ......16 |
 188     # elwidth=0b11 3x 24-bit    | ..8| ..8 | ..8 |..8 |
 189
 190     #print ("maximum allocation from fixed_width=32")
 191     # for i in range(4):
 192     #    pprint((i, layout(i, True, part_counts, fixed_width=32)))
 193
 194     # specify that the length is to be *different* at each of the elwidths.
 195     # combined with part_counts we have:
 196     # elwidth=0b00 1x 5-bit     |               ....5 |
 197     # elwidth=0b01 1x 6-bit     |              .....6 |
 198     # elwidth=0b10 2x 12-bit    |   ....12 |  .....12 |
 199     # elwidth=0b11 3x 24-bit    | 24 |  24 |  24 | 24 |
 200     widths_at_elwidth = {
 201         0: 5,
 202         1: 6,
 203         2: 12,
 204         3: 24
 205     }
 206
 207     for i in range(4):
 208         pprint((i, layout(i, False, part_counts, widths_at_elwidth)))
 209
 210     # this tests elwidth as an actual Signal. layout is allowed to
 211     # determine arbitrarily the overall length
 212     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
 213
 214     elwid = Signal(2)
 215     pp, bitp, bm, b, c, d, e = layout(
 216         elwid, False, part_counts, widths_at_elwidth)
 217     pprint((pp, b, c, d, e))
 218     for k, v in bitp.items():
 219         print("bitp elwidth=%d" % k, bin(v))
 220     print("bmask", bin(bm))
 221
 222     m = Module()
 223
 224     def process():
 225         for i in range(4):
 226             yield elwid.eq(i)
 227             yield Settle()
 228             ppt = []
 229             for pval in list(pp.values()):
 230                 val = yield pval  # get nmigen to evaluate pp
 231                 ppt.append(val)
 232             pprint((i, (ppt, b, c, d, e)))
 233             # check the results against bitp static-expected partition points
 234             # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
 235             # https://stackoverflow.com/a/27165694
 236             ival = int(''.join(map(str, ppt[::-1])), 2)
 237             assert ival == bitp[i]
 238
 239     sim = Simulator(m)
 240     sim.add_process(process)
 241     sim.run()
 242
 243     # this tests elwidth as an actual Signal. layout is *not* allowed to
 244     # determine arbitrarily the overall length, it is fixed to 64
 245     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
 246
 247     elwid = Signal(2)
 248     pp, bitp, bm, b, c, d, e = layout(elwid, False, part_counts, widths_at_elwidth,
 249                                       fixed_width=64)
 250     pprint((pp, b, c, d, e))
 251     for k, v in bitp.items():
 252         print("bitp elwidth=%d" % k, bin(v))
 253     print("bmask", bin(bm))
 254
 255     m = Module()
 256
 257     def process():
 258         for i in range(4):
 259             yield elwid.eq(i)
 260             yield Settle()
 261             ppt = []
 262             for pval in list(pp.values()):
 263                 val = yield pval  # get nmigen to evaluate pp
 264                 ppt.append(val)
 265             print("test elwidth=%d" % i)
 266             pprint((i, (ppt, b, c, d, e)))
 267             # check the results against bitp static-expected partition points
 268             # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
 269             # https://stackoverflow.com/a/27165694
 270             ival = int(''.join(map(str, ppt[::-1])), 2)
 271             assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
 272                                                            bin(bitp[i]))
 273
 274     sim = Simulator(m)
 275     sim.add_process(process)
 276     sim.run()