src/ieee754/part/layout_experiment.py

   1 #!/usr/bin/env python3
   2 # SPDX-License-Identifier: LGPL-3-or-later
   3 # See Notices.txt for copyright information
   4 """
   5 Links:
   6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
   7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
   8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
   9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
  10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
  11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
  12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
  13 """
  14
  15 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
  16 from nmigen.back.pysim import Simulator, Delay, Settle
  17 from nmigen.cli import rtlil
  18
  19 from collections.abc import Mapping
  20 from functools import reduce
  21 import operator
  22 from collections import defaultdict
  23 from pprint import pprint
  24
  25 from ieee754.part_mul_add.partpoints import PartitionPoints
  26
  27
  28 # XXX MAKE SURE TO PRESERVE ALL THESE COMMENTS XXX
  29
  30 # main fn, which started out here in the bugtracker:
  31 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
  32 # note that signed is **NOT** part of the layout, and will NOT
  33 # be added (because it is not relevant or appropriate).
  34 # sign belongs in ast.Shape and is the only appropriate location.
  35 # there is absolutely nothing within this function that in any
  36 # way requires a sign.  it is *purely* performing numerical width
  37 # computations that have absolutely nothing to do with whether the
  38 # actual data is signed or unsigned.
  39 #
  40 # context for parameters:
  41 # http://lists.libre-soc.org/pipermail/libre-soc-dev/2021-October/003921.html
  42 # XXX tempted to suggest that this function remain as a function, because
  43 # it takes all the context it needs as parameters.  its usefulness goes
  44 # beyond a single class, and there is actually nothing realistically
# that it needs which is context-sensitive.  therefore, on balance,
  46 # it should remain a function
  47 def layout(elwid,            # comes from SimdScope constructor
  48            vec_el_counts,    # comes from SimdScope constructor
  49            lane_shapes=None,   # from SimdScope.Signal via a SimdShape
  50            fixed_width=None):  # from SimdScope.Signal via a SimdShape
  51     """calculate a SIMD layout.
  52
  53     Glossary:
  54     * element: a single scalar value that is an element of a SIMD vector.
  55         it has a width in bits. Every element is made of 1 or
  56         more parts.
  57     * ElWid: the element-width (really the element type) of an instruction.
  58         Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
  59         In Python, `ElWid` is either an enum type or is `int`.
  60         Example `ElWid` definition for integers:
  61
  62         class ElWid(Enum):
  63             I64 = ...       # SVP64 value 0b00
  64             I32 = ...       # SVP64 value 0b01
  65             I16 = ...       # SVP64 value 0b10
  66             I8 = ...        # SVP64 value 0b11
  67
  68         Example `ElWid` definition for floats:
  69
  70         class ElWid(Enum):
  71             F64 = ...    # SVP64 value 0b00
  72             F32 = ...    # SVP64 value 0b01
  73             F16 = ...    # SVP64 value 0b10
  74             BF16 = ...   # SVP64 value 0b11
  75
  76     * elwid: ElWid or nmigen Value with ElWid as the shape
  77         the current element-width
  78
  79     * vec_el_counts: dict[ElWid, int]
  80         a map from `ElWid` values `k` to the number of vector elements
  81         required within a partition when `elwid == k`.
  82
  83         Example:
  84         vec_el_counts = {ElWid.I8(==0b11): 8, # 8 vector elements
  85                        ElWid.I16(==0b10): 4,  # 4 vector elements
  86                        ElWid.I32(==0b01): 2,  # 2 vector elements
  87                        ElWid.I64(==0b00): 1}  # 1 vector (aka scalar) element
  88
  89         Another Example:
  90         vec_el_counts = {ElWid.BF16(==0b11): 4, # 4 vector elements
  91                          ElWid.F16(==0b10): 4,  # 4 vector elements
  92                          ElWid.F32(==0b01): 2,  # 2 vector elements
  93                          ElWid.F64(==0b00): 1}  # 1 (aka scalar) vector element
  94
  95     * lane_shapes: int or Mapping[ElWid, int] (optional)
  96         the bit-width of all elements in a SIMD layout.
  97         if not provided, the lane_shapes are computed from fixed_width
  98         and vec_el_counts at each elwidth.
  99
 100     * fixed_width: int (optional)
 101         the total width of a SIMD vector. One or both of lane_shapes or
 102         fixed_width may be provided.  Both may not be left out.
 103     """
 104     # when there are no lane_shapes specified, this indicates a
 105     # desire to use the maximum available space based on the fixed width
 106     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
 107     if lane_shapes is None:
 108         assert fixed_width is not None, \
 109             "both fixed_width and lane_shapes cannot be None"
 110         lane_shapes = {i: fixed_width // vec_el_counts[i]
 111                        for i in vec_el_counts}
 112         print("lane_shapes", fixed_width, lane_shapes)
 113
 114     # identify if the lane_shapes is a mapping (dict, etc.)
 115     # if not, then assume that it is an integer (width) that
 116     # needs to be requested across all partitions
 117     if not isinstance(lane_shapes, Mapping):
 118         lane_shapes = {i: lane_shapes for i in vec_el_counts}
 119
 120     # compute a set of partition widths
 121     print("lane_shapes", lane_shapes, "vec_el_counts", vec_el_counts)
 122     cpart_wid = 0
 123     width = 0
 124     for i, lwid in lane_shapes.items():
 125         required_width = lwid * vec_el_counts[i]
 126         print("     required width", cpart_wid, i, lwid, required_width)
 127         if required_width > width:
 128             cpart_wid = lwid
 129             width = required_width
 130
 131     # calculate the minumum width required if fixed_width specified
 132     part_count = max(vec_el_counts.values())
 133     print("width", width, cpart_wid, part_count)
 134     if fixed_width is not None:  # override the width and part_wid
 135         assert width <= fixed_width, "not enough space to fit partitions"
 136         part_wid = fixed_width // part_count
 137         assert part_wid * part_count == fixed_width, \
 138             "calculated width not aligned multiples"
 139         width = fixed_width
 140         print("part_wid", part_wid, "count", part_count, "width", width)
 141
 142     # create the breakpoints dictionary.
 143     # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
 144     # https://stackoverflow.com/questions/26367812/
 145     dpoints = defaultdict(list)  # if empty key, create a (empty) list
 146     lpoints = defaultdict(list)  # dict of list of start-end points
 147     padding_masks = {}
 148     always_padding_mask = (1 << width) - 1  # start with all bits padding
 149     for i, c in vec_el_counts.items():
 150         print("dpoints", i, "count", c)
 151         # calculate part_wid based on overall width divided by number
 152         # of elements.
 153         part_wid = width // c
 154
 155         padding_mask = (1 << width) - 1  # start with all bits padding
 156
 157         def add_p(msg, start, p):
 158             print("    adding dpoint", msg, start, part_wid, i, c, p)
 159             dpoints[p].append(i)  # auto-creates list if key non-existent
 160         # for each elwidth, create the required number of vector elements
 161         for start in range(c):
 162             start_bit = start * part_wid
 163             end_bit = start_bit + lane_shapes[i]
 164             element_mask = (1 << end_bit) - (1 << start_bit)
 165             padding_mask &= ~element_mask  # remove element from padding_mask
 166             lpoints[i].append(range(start_bit, end_bit))
 167             add_p("start", start, start_bit)  # start of lane
 168             add_p("end  ", start, end_bit)  # end lane
 169         padding_masks[i] = padding_mask
 170         always_padding_mask &= padding_mask
 171
 172     # deduplicate dpoints lists
 173     for k in dpoints.keys():
 174         dpoints[k] = list({i: None for i in dpoints[k]}.keys())
 175
 176     # do not need the breakpoints at the very start or the very end
 177     dpoints.pop(0, None)
 178     dpoints.pop(width, None)
 179
 180     # sort dpoints keys
 181     dpoints = dict(sorted(dpoints.items(), key=lambda i: i[0]))
 182
 183     print("dpoints")
 184     pprint(dpoints)
 185
 186     # second stage, add (map to) the elwidth==i expressions.
 187     # TODO: use nmutil.treereduce?
 188     points = {}
 189     for p in dpoints.keys():
 190         points[p] = map(lambda i: elwid == i, dpoints[p])
 191         points[p] = reduce(operator.or_, points[p])
 192
 193     # third stage, create the binary values which *if* elwidth is set to i
 194     # *would* result in the mask at that elwidth being set to this value
 195     # these can easily be double-checked through Assertion
 196     bitp = {}
 197     for i in vec_el_counts.keys():
 198         bitp[i] = 0
 199         for bit_index, (p, elwidths) in enumerate(dpoints.items()):
 200             if i in elwidths:
 201                 bitp[i] |= 1 << bit_index
 202
 203     # fourth stage: determine which partitions are 100% unused.
 204     # these can then be "blanked out"
 205
 206     # points are the partition separators, not partition indexes
 207     partition_ends = [*dpoints.keys(), width]
 208     bmask = 0
 209     partition_start = 0
 210     for bit_index, partition_end in enumerate(partition_ends):
 211         pmask = (1 << partition_end) - (1 << partition_start)
 212         always_padding = (always_padding_mask & pmask) == pmask
 213         if always_padding:
 214             bmask |= 1 << bit_index
 215         partition_start = partition_end
 216     return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
 217             part_wid)
 218
 219 # XXX XXX XXX XXX quick tests TODO convert to proper ones but kinda good
 220 # enough for now.  if adding new tests do not alter or delete the old ones
 221 # XXX XXX XXX XXX
 222
if __name__ == '__main__':
    # Quick self-tests, run only when this file is executed as a script.
    # Each section calls layout() with a different combination of
    # vec_el_counts / element widths / fixed_width, prints the results
    # and, where an expected answer is known, asserts on it.
    # Statement order matters: later sections reuse (and rebind) names
    # such as vec_el_counts, widths_at_elwidth, elwid, pp and bitp.

    # for each element-width (elwidth 0-3) the number of Vector Elements is:
    # elwidth=0b00 QTY 1 partitions:   |          ?          |
    # elwidth=0b01 QTY 1 partitions:   |          ?          |
    # elwidth=0b10 QTY 2 partitions:   |    ?     |     ?    |
    # elwidth=0b11 QTY 4 partitions:   | ?  |  ?  |  ?  | ?  |
    # actual widths of Signals *within* those partitions is given separately
    vec_el_counts = {
        0: 1,
        1: 1,
        2: 2,
        3: 4,
    }

    # width=3 indicates "same width Vector Elements (3) at all elwidths"
    # elwidth=0b00 1x 3-bit     |  unused xx      ..3 |
    # elwidth=0b01 1x 3-bit     |  unused xx      ..3 |
    # elwidth=0b10 2x 3-bit     | xxx  ..3 | xxx  ..3 |
    # elwidth=0b11 4x 3-bit     | ..3| ..3 | ..3 |..3 |
    # expected partitions      (^)   |     |     |   (^)
    # to be at these points:   (|)   |     |     |    |
    width_in_all_parts = 3

    # elwid is passed as a plain int here, so the `elwid == i` comparisons
    # inside layout() are ordinary Python comparisons.  output is only
    # printed, nothing is asserted.
    for i in range(4):
        pprint((i, layout(i, vec_el_counts, width_in_all_parts)))

    # specify that the Vector Element lengths are to be *different* at
    # each of the elwidths.
    # combined with vec_el_counts we have:
    # elwidth=0b00 1x 5-bit    |<----unused---------->....5|
    # elwidth=0b01 1x 6-bit    |<----unused--------->.....6|
    # elwidth=0b10 2x 6-bit    |unused>.....6|unused>.....6|
    # elwidth=0b11 4x 6-bit    |.....6|.....6|.....6|.....6|
    # expected partitions     (^)     ^      ^      ^^    (^)
    # to be at these points:  (|)     |      |      ||    (|)
    #                         (24)   18     12      65    (0)
    widths_at_elwidth = {
        0: 5,
        1: 6,
        2: 6,
        3: 6
    }

    print("5,6,6,6 elements", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth)
        pprint((i, (pp, bitp, bm, b, c, d)))
    # now check that the expected partition points occur.
    # pp and bm are the values left over from the last loop iteration
    # (i == 3); the breakpoint positions and bm are computed by layout()
    # without reference to elwid, so any iteration would do.
    print("5,6,6,6 ppt keys", pp.keys())
    assert list(pp.keys()) == [5, 6, 12, 18]
    assert bm == 0  # no unused partitions

    # this example was probably what the 5,6,6,6 one was supposed to be.
    # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
    # elwidth=0b00 1x 24-bit    |.........................24|
    # elwidth=0b01 1x 12-bit    |<--unused--->|...........12|
    # elwidth=0b10 2x 5 -bit    |unused>|....5|unused>|....5|
    # elwidth=0b11 4x 6 -bit    |.....6|.....6|.....6|.....6|
    # expected partitions      (^)     ^^     ^       ^^    (^)
    # to be at these points:   (|)     ||     |       ||    (|)
    #                          (24)   1817   12       65    (0)
    widths_at_elwidth = {
        0: 24,  # QTY 1x 24
        1: 12,  # QTY 1x 12
        2: 5,   # QTY 2x 5
        3: 6    # QTY 4x 6
    }

    print("24,12,5,6 elements", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth)
        pprint((i, (pp, bitp, bm, b, c, d)))
    # now check that the expected partition points occur
    # (again using pp / bm from the final loop iteration)
    print("24,12,5,6 ppt keys", pp.keys())
    assert list(pp.keys()) == [5, 6, 12, 17, 18]
    print("bmask", bin(bm))
    assert bm == 0  # no unused partitions

    # this tests elwidth as an actual Signal. layout is allowed to
    # determine arbitrarily the overall length
    # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30

    # 2-bit Signal: the simulation below drives it with the values 0..3
    elwid = Signal(2)
    pp, bitp, bm, b, c, d = layout(
        elwid, vec_el_counts, widths_at_elwidth)
    pprint((pp, b, c, d))
    for k, v in bitp.items():
        print("bitp elwidth=%d" % k, bin(v))
    print("bmask", bin(bm))
    assert bm == 0  # no unused partitions

    # nothing is added to the Module: the simulation process only drives
    # elwid and reads back the partition-point expressions held in pp
    m = Module()

    # simulation process: for every elwidth value, evaluate each partition
    # point and compare the combined result against the statically
    # computed bitp[i].  reads elwid, pp, bitp, b, c, d from the enclosing
    # scope at the time sim.run() is called.
    def process():
        for i in range(4):
            yield elwid.eq(i)
            yield Settle()
            ppt = []
            for pval in list(pp.values()):
                val = yield pval  # get nmigen to evaluate pp
                ppt.append(val)
            pprint((i, (ppt, b, c, d)))
            # check the results against bitp static-expected partition points
            # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
            # https://stackoverflow.com/a/27165694
            # ppt is reversed so that its first entry ends up as the
            # least-significant bit, matching how layout() builds bitp
            # (bit 0 == first breakpoint).  presumably pp.values() yields
            # the points in ascending order -- verify against
            # PartitionPoints.
            ival = int(''.join(map(str, ppt[::-1])), 2)
            assert ival == bitp[i]

    sim = Simulator(m)
    sim.add_process(process)
    sim.run()

    # this tests elwidth as an actual Signal. layout is *not* allowed to
    # determine arbitrarily the overall length, it is fixed to 64
    # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22

    # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
    # elwidth=0b00 1x 24-bit
    # elwidth=0b01 1x 12-bit
    # elwidth=0b10 2x 5-bit
    # elwidth=0b11 4x 6-bit
    #
    # bmask<--------1<----0<---------10<---0<-------1<0<----0<---0<----00<---0
    # always unused:|     |     |    ||    |    |   | |     |    |     ||    |
    #      1111111111000000 1111111111000000 1111111100000000 0000000000000000
    #               |     |     |    ||    |    |   | |     |    |     ||    |
    # 0b00 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxx........ ..............24|
    # 0b01 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxx..........12|
    # 0b10 xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|
    # 0b11 xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|
    #               ^     ^          ^^    ^        ^ ^     ^    ^     ^^
    #     ppoints:  |     |          ||    |        | |     |    |     ||
    #               |  bit-48        /\    | bit-24-/ |     | bit-12   /\-bit-5
    #            bit-54      bit-38-/  \ bit-32       |   bit-16      /
    #                                 bit-37       bit-22          bit-6

    # a fresh Signal: the process defined below picks up this new elwid
    # (and the new pp / bitp) because it looks the names up when it runs
    elwid = Signal(2)
    pp, bitp, bm, b, c, d = layout(elwid, vec_el_counts,
                                   widths_at_elwidth,
                                   fixed_width=64)
    pprint((pp, b, c, d))
    for k, v in bitp.items():
        print("bitp elwidth=%d" % k, bin(v))
    print("bmask", bin(bm))
    # one bit per partition, bit 0 being the partition that starts at
    # bit 0 of the vector (see the bmask row of the diagram above)
    assert bm == 0b101001000000

    m = Module()

    # same check as the previous process(), but additionally prints the
    # elwidth under test and reports both values if the assertion fails
    def process():
        for i in range(4):
            yield elwid.eq(i)
            yield Settle()
            ppt = []
            for pval in list(pp.values()):
                val = yield pval  # get nmigen to evaluate pp
                ppt.append(val)
            print("test elwidth=%d" % i)
            pprint((i, (ppt, b, c, d)))
            # check the results against bitp static-expected partition points
            # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
            # https://stackoverflow.com/a/27165694
            ival = int(''.join(map(str, ppt[::-1])), 2)
            assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
                                                           bin(bitp[i]))

    sim = Simulator(m)
    sim.add_process(process)
    sim.run()

    # fixed_width=32 and no lane_widths says "allocate maximum"
    # i.e. Vector Element Widths are auto-allocated
    # (layout() computes them as fixed_width // vec_el_counts[elwidth])
    # elwidth=0b00 1x 32-bit    | .................32 |
    # elwidth=0b01 1x 32-bit    | .................32 |
    # elwidth=0b10 2x 16-bit    | ......16 | ......16 |
    # elwidth=0b11 4x 8-bit     | ..8| ..8 | ..8 |..8 |
    # expected partitions      (^)   |     |     |   (^)
    # to be at these points:   (|)   |     |     |    |

    # TODO, fix this so that it is correct.  put it at the end so it
    # shows that things break and doesn't stop the other tests.
    # NOTE(review): only prints the results; there is no assertion here.
    print("maximum allocation from fixed_width=32")
    for i in range(4):
        pprint((i, layout(i, vec_el_counts, fixed_width=32)))

    # example "exponent"
    #  https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
    # 1xFP64: 11 bits, one exponent
    # 2xFP32: 8 bits, two exponents
    # 4xFP16: 5 bits, four exponents
    # 4xBF16: 8 bits, four exponents
    vec_el_counts = {
        0: 1,  # QTY 1x FP64
        1: 2,  # QTY 2x FP32
        2: 4,  # QTY 4x FP16
        3: 4,  # QTY 4x BF16
    }
    widths_at_elwidth = {
        0: 11,  # FP64 ew=0b00
        1: 8,  # FP32 ew=0b01
        2: 5,  # FP16 ew=0b10
        3: 8   # BF16 ew=0b11
    }

    # expected results:
    #
    #        |31|  |  |24|     16|15  |  |   8|7     0 |
    #        |31|28|26|24| |20|16|  12|  |10|8|5|4   0 |
    #  32bit | x| x| x|  |      x|   x| x|10 ....    0 |
    #  16bit | x| x|26    ... 16 |   x| x|10 ....    0 |
    #  8bit  | x|28 .. 24|  20.16|   x|11 .. 8|x|4.. 0 |
    #  unused  x                     x

    print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth,
                   fixed_width=32)
        pprint((i, (pp, bitp, bin(bm), b, c, d)))
    # now check that the expected partition points occur
    print("11,8,5,8 pp keys", pp.keys())
    # NOTE(review): the check below is disabled, and the key list in it is
    # the one from the 5,6,6,6 test above, not this layout.  working
    # through layout() by hand for these inputs gives
    # [5, 8, 11, 13, 16, 21, 24, 29] -- TODO confirm and enable.
    #assert list(pp.keys()) == [5,6,12,18]

    ######                                                           ######
    ###### 2nd test, different from the above, elwid=0b10 ==> 11 bit ######
    ######                                                           ######
    # NOTE(review): the banner says elwid=0b10, but the entry that differs
    # from the previous test is key 1 (ew=0b01, FP32), which is 11 instead
    # of 8; key 2 (ew=0b10) is still 5.  confirm which was intended.

    # example "exponent"
    vec_el_counts = {
        0: 1,  # QTY 1x FP64
        1: 2,  # QTY 2x FP32
        2: 4,  # QTY 4x FP16
        3: 4,  # QTY 4x BF16
    }
    widths_at_elwidth = {
        0: 11,  # FP64 ew=0b00
        1: 11,  # FP32 ew=0b01
        2: 5,  # FP16 ew=0b10
        3: 8   # BF16 ew=0b11
    }

    # expected results:
    # NOTE(review): this diagram is identical to the one in the previous
    # test and has presumably not been updated for the 11-bit FP32 width.
    #
    #        |31|  |  |24|     16|15  |  |   8|7     0 |
    #        |31|28|26|24| |20|16|  12|  |10|8|5|4   0 |
    #  32bit | x| x| x|  |      x|   x| x|10 ....    0 |
    #  16bit | x| x|26    ... 16 |   x| x|10 ....    0 |
    #  8bit  | x|28 .. 24|  20.16|   x|11 .. 8|x|4.. 0 |
    #  unused  x                     x

    # NOTE(review): the "11,8,5,8" labels printed below are carried over
    # from the previous test; the widths used here are 11,11,5,8.
    print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth,
                   fixed_width=32)
        pprint((i, (pp, bitp, bin(bm), b, c, d)))
    # now check that the expected partition points occur
    print("11,8,5,8 pp keys", pp.keys())
    # NOTE(review): disabled check with a key list from the 5,6,6,6 test.
    # working through layout() by hand for these inputs gives
    # [5, 8, 11, 13, 16, 21, 24, 27, 29] -- TODO confirm and enable.
    #assert list(pp.keys()) == [5,6,12,18]