src/ieee754/part/layout_experiment.py

   1 #!/usr/bin/env python3
   2 # SPDX-License-Identifier: LGPL-3-or-later
   3 # See Notices.txt for copyright information
   4 """
   5 Links:
   6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
   7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
   8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
   9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
  10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
  11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
  12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
  13 """
  14
  15 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
  16 from nmigen.back.pysim import Simulator, Delay, Settle
  17 from nmigen.cli import rtlil
  18
  19 from collections.abc import Mapping
  20 from functools import reduce
  21 import operator
  22 from collections import defaultdict
  23 from pprint import pprint
  24
  25 from ieee754.part_mul_add.partpoints import PartitionPoints
  26
  27
  28 # XXX MAKE SURE TO PRESERVE ALL THESE COMMENTS XXX
  29
  30 # main fn, which started out here in the bugtracker:
  31 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
  32 # note that signed is **NOT** part of the layout, and will NOT
  33 # be added (because it is not relevant or appropriate).
  34 # sign belongs in ast.Shape and is the only appropriate location.
  35 # there is absolutely nothing within this function that in any
  36 # way requires a sign.  it is *purely* performing numerical width
  37 # computations that have absolutely nothing to do with whether the
  38 # actual data is signed or unsigned.
  39 #
  40 # context for parameters:
  41 # http://lists.libre-soc.org/pipermail/libre-soc-dev/2021-October/003921.html
# XXX tempted to suggest that this function remain as a function, because
# it takes all the context it needs as parameters.  its usefulness goes
# beyond a single class, and there is actually nothing realistically
# that it needs which is context-sensitive.  therefore, on balance,
# it should remain a function
  47 def layout(elwid,            # comes from SimdScope constructor
  48            vec_el_counts,    # comes from SimdScope constructor
  49            lane_shapes=None,   # from SimdScope.Signal via a SimdShape
  50            fixed_width=None):  # from SimdScope.Signal via a SimdShape
  51     """calculate a SIMD layout.
  52
  53     Glossary:
  54     * element: a single scalar value that is an element of a SIMD vector.
  55         it has a width in bits. Every element is made of 1 or
  56         more parts.
  57     * ElWid: the element-width (really the element type) of an instruction.
  58         Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
  59         In Python, `ElWid` is either an enum type or is `int`.
  60         Example `ElWid` definition for integers:
  61
  62         class ElWid(Enum):
  63             I64 = ...       # SVP64 value 0b00
  64             I32 = ...       # SVP64 value 0b01
  65             I16 = ...       # SVP64 value 0b10
  66             I8 = ...        # SVP64 value 0b11
  67
  68         Example `ElWid` definition for floats:
  69
  70         class ElWid(Enum):
  71             F64 = ...    # SVP64 value 0b00
  72             F32 = ...    # SVP64 value 0b01
  73             F16 = ...    # SVP64 value 0b10
  74             BF16 = ...   # SVP64 value 0b11
  75
  76     * elwid: ElWid or nmigen Value with ElWid as the shape
  77         the current element-width
  78
  79     * vec_el_counts: dict[ElWid, int]
  80         a map from `ElWid` values `k` to the number of vector elements
  81         required within a partition when `elwid == k`.
  82
  83         Example:
  84         vec_el_counts = {ElWid.I8(==0b11): 8, # 8 vector elements
  85                        ElWid.I16(==0b10): 4,  # 4 vector elements
  86                        ElWid.I32(==0b01): 2,  # 2 vector elements
  87                        ElWid.I64(==0b00): 1}  # 1 vector (aka scalar) element
  88
  89         Another Example:
  90         vec_el_counts = {ElWid.BF16(==0b11): 4, # 4 vector elements
  91                          ElWid.F16(==0b10): 4,  # 4 vector elements
  92                          ElWid.F32(==0b01): 2,  # 2 vector elements
  93                          ElWid.F64(==0b00): 1}  # 1 (aka scalar) vector element
  94
  95     * lane_shapes: int or Mapping[ElWid, int] (optional)
  96         the bit-width of all elements in a SIMD layout.
  97         if not provided, the lane_shapes are computed from fixed_width
  98         and vec_el_counts at each elwidth.
  99
 100     * fixed_width: int (optional)
 101         the total width of a SIMD vector. One or both of lane_shapes or
 102         fixed_width may be provided.  Both may not be left out.
 103     """
 104     # when there are no lane_shapes specified, this indicates a
 105     # desire to use the maximum available space based on the fixed width
 106     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
 107     if lane_shapes is None:
 108         assert fixed_width is not None, \
 109             "both fixed_width and lane_shapes cannot be None"
 110         lane_shapes = {i: fixed_width // vec_el_counts[i]
 111                        for i in vec_el_counts}
 112         print("lane_shapes", fixed_width, lane_shapes)
 113
 114     # identify if the lane_shapes is a mapping (dict, etc.)
 115     # if not, then assume that it is an integer (width) that
 116     # needs to be requested across all partitions
 117     if not isinstance(lane_shapes, Mapping):
 118         lane_shapes = {i: lane_shapes for i in vec_el_counts}
 119
 120     # compute a set of partition widths
 121     print("lane_shapes", lane_shapes, "vec_el_counts", vec_el_counts)
 122     cpart_wid = 0
 123     width = 0
 124     for i, lwid in lane_shapes.items():
 125         required_width = lwid * vec_el_counts[i]
 126         print("     required width", cpart_wid, i, lwid, required_width)
 127         if required_width > width:
 128             cpart_wid = lwid
 129             width = required_width
 130
 131     # calculate the minumum width required if fixed_width specified
 132     part_count = max(vec_el_counts.values())
 133     print("width", width, cpart_wid, part_count)
 134     if fixed_width is not None:  # override the width and part_wid
 135         assert width <= fixed_width, "not enough space to fit partitions"
 136         part_wid = fixed_width // part_count
 137         assert part_wid * part_count == fixed_width, \
 138             "calculated width not aligned multiples"
 139         width = fixed_width
 140         print("part_wid", part_wid, "count", part_count, "width", width)
 141
 142     # create the breakpoints dictionary.
 143     # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
 144     # https://stackoverflow.com/questions/26367812/
 145     dpoints = defaultdict(list)  # if empty key, create a (empty) list
 146     padding_masks = {}
 147     always_padding_mask = (1 << width) - 1  # start with all bits padding
 148     for i, c in vec_el_counts.items():
 149         print("dpoints", i, "count", c)
 150         # calculate part_wid based on overall width divided by number
 151         # of elements.
 152         part_wid = width // c
 153
 154         padding_mask = (1 << width) - 1  # start with all bits padding
 155
 156         def add_p(msg, start, p):
 157             print("    adding dpoint", msg, start, part_wid, i, c, p)
 158             dpoints[p].append(i)  # auto-creates list if key non-existent
 159         # for each elwidth, create the required number of vector elements
 160         for start in range(c):
 161             start_bit = start * part_wid
 162             end_bit = start_bit + lane_shapes[i]
 163             element_mask = (1 << end_bit) - (1 << start_bit)
 164             padding_mask &= ~element_mask  # remove element from padding_mask
 165             add_p("start", start, start_bit)  # start of lane
 166             add_p("end  ", start, end_bit)  # end lane
 167         padding_masks[i] = padding_mask
 168         always_padding_mask &= padding_mask
 169
 170     # deduplicate dpoints lists
 171     for k in dpoints.keys():
 172         dpoints[k] = list({i: None for i in dpoints[k]}.keys())
 173
 174     # do not need the breakpoints at the very start or the very end
 175     dpoints.pop(0, None)
 176     dpoints.pop(width, None)
 177
 178     # sort dpoints keys
 179     dpoints = dict(sorted(dpoints.items(), key=lambda i: i[0]))
 180
 181     print("dpoints")
 182     pprint(dpoints)
 183
 184     # second stage, add (map to) the elwidth==i expressions.
 185     # TODO: use nmutil.treereduce?
 186     points = {}
 187     for p in dpoints.keys():
 188         points[p] = map(lambda i: elwid == i, dpoints[p])
 189         points[p] = reduce(operator.or_, points[p])
 190
 191     # third stage, create the binary values which *if* elwidth is set to i
 192     # *would* result in the mask at that elwidth being set to this value
 193     # these can easily be double-checked through Assertion
 194     bitp = {}
 195     for i in vec_el_counts.keys():
 196         bitp[i] = 0
 197         for bit_index, (p, elwidths) in enumerate(dpoints.items()):
 198             if i in elwidths:
 199                 bitp[i] |= 1 << bit_index
 200
 201     # fourth stage: determine which partitions are 100% unused.
 202     # these can then be "blanked out"
 203
 204     # points are the partition separators, not partition indexes
 205     partition_ends = [*dpoints.keys(), width]
 206     bmask = 0
 207     partition_start = 0
 208     for bit_index, partition_end in enumerate(partition_ends):
 209         pmask = (1 << partition_end) - (1 << partition_start)
 210         always_padding = (always_padding_mask & pmask) == pmask
 211         if always_padding:
 212             bmask |= 1 << bit_index
 213         partition_start = partition_end
 214     return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
 215             part_wid)
 216
 217 # XXX XXX XXX XXX quick tests TODO convert to proper ones but kinda good
 218 # enough for now.  if adding new tests do not alter or delete the old ones
 219 # XXX XXX XXX XXX
 220
if __name__ == '__main__':
    # ad-hoc self-tests, run when this file is executed as a script.
    # each section calls layout() with a different combination of
    # vec_el_counts / lane widths / fixed_width, pretty-prints the result
    # and (for some sections) asserts on the breakpoint positions and the
    # "always unused" partition mask.

    # for each element-width (elwidth 0-3) the number of Vector Elements is:
    # elwidth=0b00 QTY 1 partitions:   |          ?          |
    # elwidth=0b01 QTY 1 partitions:   |          ?          |
    # elwidth=0b10 QTY 2 partitions:   |    ?     |     ?    |
    # elwidth=0b11 QTY 4 partitions:   | ?  |  ?  |  ?  | ?  |
    # actual widths of Signals *within* those partitions is given separately
    vec_el_counts = {
        0: 1,
        1: 1,
        2: 2,
        3: 4,
    }

    # width=3 indicates "same width Vector Elements (3) at all elwidths"
    # elwidth=0b00 1x 5-bit     |  unused xx      ..3 |
    # elwidth=0b01 1x 6-bit     |  unused xx      ..3 |
    # elwidth=0b10 2x 12-bit    | xxx  ..3 | xxx  ..3 |
    # elwidth=0b11 3x 24-bit    | ..3| ..3 | ..3 |..3 |
    # expected partitions      (^)   |     |     |   (^)
    # to be at these points:   (|)   |     |     |    |
    # NOTE(review): the "1x 5-bit" / "1x 6-bit" / "2x 12-bit" / "3x 24-bit"
    # labels above do not match this test (every element is 3 bits wide and
    # vec_el_counts has 4 elements at elwidth=0b11) - they look like they
    # were carried over from another example.  TODO confirm and correct.
    width_in_all_parts = 3

    # elwid is passed as a plain int here, and the result is only printed
    for i in range(4):
        pprint((i, layout(i, vec_el_counts, width_in_all_parts)))

    # specify that the Vector Element lengths are to be *different* at
    # each of the elwidths.
    # combined with vec_el_counts we have:
    # elwidth=0b00 1x 5-bit    |<----unused---------->....5|
    # elwidth=0b01 1x 6-bit    |<----unused--------->.....6|
    # elwidth=0b10 2x 6-bit    |unused>.....6|unused>.....6|
    # elwidth=0b11 4x 6-bit    |.....6|.....6|.....6|.....6|
    # expected partitions     (^)     ^      ^      ^^    (^)
    # to be at these points:  (|)     |      |      ||    (|)
    #                         (24)   18     12      65    (0)
    widths_at_elwidth = {
        0: 5,
        1: 6,
        2: 6,
        3: 6
    }

    print("5,6,6,6 elements", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth)
        pprint((i, (pp, bitp, bm, b, c, d)))
    # now check that the expected partition points occur
    # (pp and bm here are the values left over from the last loop
    # iteration, i.e. elwid=3; the breakpoint positions and bmask are
    # computed without reference to elwid)
    print("5,6,6,6 ppt keys", pp.keys())
    assert list(pp.keys()) == [5, 6, 12, 18]
    assert bm == 0  # no unused partitions

    # this example was probably what the 5,6,6,6 one was supposed to be.
    # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
    # elwidth=0b00 1x 24-bit    |.........................24|
    # elwidth=0b01 1x 12-bit    |<--unused--->|...........12|
    # elwidth=0b10 2x 5 -bit    |unused>|....5|unused>|....5|
    # elwidth=0b11 4x 6 -bit    |.....6|.....6|.....6|.....6|
    # expected partitions      (^)     ^^     ^       ^^    (^)
    # to be at these points:   (|)     ||     |       ||    (|)
    #                          (24)   1817   12       65    (0)
    widths_at_elwidth = {
        0: 24,  # QTY 1x 24
        1: 12,  # QTY 1x 12
        2: 5,   # QTY 2x 5
        3: 6    # QTY 4x 6
    }

    print("24,12,5,6 elements", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth)
        pprint((i, (pp, bitp, bm, b, c, d)))
    # now check that the expected partition points occur
    print("24,12,5,6 ppt keys", pp.keys())
    assert list(pp.keys()) == [5, 6, 12, 17, 18]
    print("bmask", bin(bm))
    assert bm == 0  # no unused partitions

    # this tests elwidth as an actual Signal. layout is allowed to
    # determine arbitrarily the overall length
    # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30

    elwid = Signal(2)
    pp, bitp, bm, b, c, d = layout(
        elwid, vec_el_counts, widths_at_elwidth)
    pprint((pp, b, c, d))
    for k, v in bitp.items():
        print("bitp elwidth=%d" % k, bin(v))
    print("bmask", bin(bm))
    assert bm == 0  # no unused partitions

    # nothing is added to this Module: the simulation below only
    # evaluates the expressions held in pp, which depend on elwid
    m = Module()

    def process():
        # simulator process: drive elwid through 0..3 and, at each value,
        # compare the simulated partition points against bitp
        for i in range(4):
            yield elwid.eq(i)
            yield Settle()
            ppt = []
            for pval in list(pp.values()):
                val = yield pval  # get nmigen to evaluate pp
                ppt.append(val)
            pprint((i, (ppt, b, c, d)))
            # check the results against bitp static-expected partition points
            # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
            # https://stackoverflow.com/a/27165694
            # ppt[0] is the lowest breakpoint, so the list is reversed to
            # put it in the least-significant position of the binary string
            ival = int(''.join(map(str, ppt[::-1])), 2)
            assert ival == bitp[i]

    sim = Simulator(m)
    sim.add_process(process)
    sim.run()

    # this tests elwidth as an actual Signal. layout is *not* allowed to
    # determine arbitrarily the overall length, it is fixed to 64
    # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22

    # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
    # elwidth=0b00 1x 24-bit
    # elwidth=0b01 1x 12-bit
    # elwidth=0b10 2x 5-bit
    # elwidth=0b11 4x 6-bit
    #
    # bmask<--------1<----0<---------10<---0<-------1<0<----0<---0<----00<---0
    # always unused:|     |     |    ||    |    |   | |     |    |     ||    |
    #      1111111111000000 1111111111000000 1111111100000000 0000000000000000
    #               |     |     |    ||    |    |   | |     |    |     ||    |
    # 0b00 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxx........ ..............24|
    # 0b01 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxx..........12|
    # 0b10 xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|
    # 0b11 xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|
    #               ^     ^          ^^    ^        ^ ^     ^    ^     ^^
    #     ppoints:  |     |          ||    |        | |     |    |     ||
    #               |  bit-48        /\    | bit-24-/ |     | bit-12   /\-bit-5
    #            bit-54      bit-38-/  \ bit-32       |   bit-16      /
    #                                 bit-37       bit-22          bit-6

    elwid = Signal(2)
    pp, bitp, bm, b, c, d = layout(elwid, vec_el_counts,
                                   widths_at_elwidth,
                                   fixed_width=64)
    pprint((pp, b, c, d))
    for k, v in bitp.items():
        print("bitp elwidth=%d" % k, bin(v))
    print("bmask", bin(bm))
    # bits 6, 9 and 11 set: the partitions covering bits 24-31, 38-47 and
    # 54-63, which the diagram above shows as unused at every elwidth
    assert bm == 0b101001000000

    m = Module()

    # same check as the previous process(), with the elwidth printed as
    # well and a message on the assertion
    def process():
        for i in range(4):
            yield elwid.eq(i)
            yield Settle()
            ppt = []
            for pval in list(pp.values()):
                val = yield pval  # get nmigen to evaluate pp
                ppt.append(val)
            print("test elwidth=%d" % i)
            pprint((i, (ppt, b, c, d)))
            # check the results against bitp static-expected partition points
            # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
            # https://stackoverflow.com/a/27165694
            ival = int(''.join(map(str, ppt[::-1])), 2)
            assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
                                                           bin(bitp[i]))

    sim = Simulator(m)
    sim.add_process(process)
    sim.run()

    # fixed_width=32 and no lane_widths says "allocate maximum"
    # i.e. Vector Element Widths are auto-allocated
    # elwidth=0b00 1x 32-bit    | .................32 |
    # elwidth=0b01 1x 32-bit    | .................32 |
    # elwidth=0b10 2x 12-bit    | ......16 | ......16 |
    # elwidth=0b11 3x 24-bit    | ..8| ..8 | ..8 |..8 |
    # expected partitions      (^)   |     |     |   (^)
    # to be at these points:   (|)   |     |     |    |
    # NOTE(review): "2x 12-bit" and "3x 24-bit" disagree with the diagram
    # to their right (2x 16 and 4x 8, which is what fixed_width=32 divided
    # by vec_el_counts gives) - TODO confirm and correct the labels.

    # TODO, fix this so that it is correct.  put it at the end so it
    # shows that things break and doesn't stop the other tests.
    # NOTE(review): this section is not actually at the end of the script,
    # further tests follow it.
    print("maximum allocation from fixed_width=32")
    for i in range(4):
        pprint((i, layout(i, vec_el_counts, fixed_width=32)))

    # example "exponent"
    #  https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
    # 1xFP64: 11 bits, one exponent
    # 2xFP32: 8 bits, two exponents
    # 4xFP16: 5 bits, four exponents
    # 4xBF16: 8 bits, four exponents
    vec_el_counts = {
        0: 1,  # QTY 1x FP64
        1: 2,  # QTY 2x FP32
        2: 4,  # QTY 4x FP16
        3: 4,  # QTY 4x BF16
    }
    widths_at_elwidth = {
        0: 11,  # FP64 ew=0b00
        1: 8,  # FP32 ew=0b01
        2: 5,  # FP16 ew=0b10
        3: 8   # BF16 ew=0b11
    }

    # expected results:
    #
    #        |31|  |  |24|     16|15  |  |   8|7     0 |
    #        |31|28|26|24| |20|16|  12|  |10|8|5|4   0 |
    #  32bit | x| x| x|  |      x|   x| x|10 ....    0 |
    #  16bit | x| x|26    ... 16 |   x| x|10 ....    0 |
    #  8bit  | x|28 .. 24|  20.16|   x|11 .. 8|x|4.. 0 |
    #  unused  x                     x

    print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth,
                   fixed_width=32)
        pprint((i, (pp, bitp, bin(bm), b, c, d)))
    # now check that the expected partition points occur
    print("11,8,5,8 pp keys", pp.keys())
    # NOTE(review): the disabled assertion below carries the expected keys
    # of the 5,6,6,6 test above, so nothing is actually checked for this
    # configuration yet.
    #assert list(pp.keys()) == [5,6,12,18]

    ######                                                           ######
    ###### 2nd test, different from the above, elwid=0b10 ==> 11 bit ######
    ######                                                           ######
    # NOTE(review): the only value that differs from the previous test is
    # key 1 (ew=0b01), which becomes 11; key 2 (ew=0b10) is still 5.  the
    # banner above and the "11,8,5,8" strings printed below therefore look
    # stale - TODO confirm which was intended.

    # example "exponent"
    vec_el_counts = {
        0: 1,  # QTY 1x FP64
        1: 2,  # QTY 2x FP32
        2: 4,  # QTY 4x FP16
        3: 4,  # QTY 4x BF16
    }
    widths_at_elwidth = {
        0: 11,  # FP64 ew=0b00
        1: 11,  # FP32 ew=0b01
        2: 5,  # FP16 ew=0b10
        3: 8   # BF16 ew=0b11
    }

    # expected results:
    #
    #        |31|  |  |24|     16|15  |  |   8|7     0 |
    #        |31|28|26|24| |20|16|  12|  |10|8|5|4   0 |
    #  32bit | x| x| x|  |      x|   x| x|10 ....    0 |
    #  16bit | x| x|26    ... 16 |   x| x|10 ....    0 |
    #  8bit  | x|28 .. 24|  20.16|   x|11 .. 8|x|4.. 0 |
    #  unused  x                     x

    print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
    for i in range(4):
        pp, bitp, bm, b, c, d = \
            layout(i, vec_el_counts, widths_at_elwidth,
                   fixed_width=32)
        pprint((i, (pp, bitp, bin(bm), b, c, d)))
    # now check that the expected partition points occur
    print("11,8,5,8 pp keys", pp.keys())
    # NOTE(review): disabled, and the expected keys are again those of the
    # 5,6,6,6 test rather than of this configuration.
    #assert list(pp.keys()) == [5,6,12,18]