switch to exact version of cython
[ieee754fpu.git] / src / ieee754 / part / layout_experiment.py
1 #!/usr/bin/env python3
2 # SPDX-License-Identifier: LGPL-3-or-later
3 # See Notices.txt for copyright information
4 """
5 Links:
6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
13 """
14
15 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
16 from nmigen.back.pysim import Simulator, Delay, Settle
17 from nmigen.cli import rtlil
18
19 from collections.abc import Mapping
20 from functools import reduce
21 import operator
22 from collections import defaultdict
23 from pprint import pprint
24
25 from ieee754.part_mul_add.partpoints import PartitionPoints
26
27
28 # XXX MAKE SURE TO PRESERVE ALL THESE COMMENTS XXX
29
30 # main fn, which started out here in the bugtracker:
31 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
32 # note that signed is **NOT** part of the layout, and will NOT
33 # be added (because it is not relevant or appropriate).
34 # sign belongs in ast.Shape and is the only appropriate location.
35 # there is absolutely nothing within this function that in any
36 # way requires a sign. it is *purely* performing numerical width
37 # computations that have absolutely nothing to do with whether the
38 # actual data is signed or unsigned.
39 #
40 # context for parameters:
41 # http://lists.libre-soc.org/pipermail/libre-soc-dev/2021-October/003921.html
42 # XXX tempted to suggest that this function remain as a function, because
43 # it takes all the context it needs as parameters. its usefulness goes
44 # beyond a single class, and there is actually nothing realistically
45 # that it needs whixh is context-sensitive. therefore, on balance,
46 # it should remain a function
47 def layout(elwid, # comes from SimdScope constructor
48 vec_el_counts, # comes from SimdScope constructor
49 lane_shapes=None, # from SimdScope.Signal via a SimdShape
50 fixed_width=None): # from SimdScope.Signal via a SimdShape
51 """calculate a SIMD layout.
52
53 Glossary:
54 * element: a single scalar value that is an element of a SIMD vector.
55 it has a width in bits. Every element is made of 1 or
56 more parts.
57 * ElWid: the element-width (really the element type) of an instruction.
58 Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
59 In Python, `ElWid` is either an enum type or is `int`.
60 Example `ElWid` definition for integers:
61
62 class ElWid(Enum):
63 I64 = ... # SVP64 value 0b00
64 I32 = ... # SVP64 value 0b01
65 I16 = ... # SVP64 value 0b10
66 I8 = ... # SVP64 value 0b11
67
68 Example `ElWid` definition for floats:
69
70 class ElWid(Enum):
71 F64 = ... # SVP64 value 0b00
72 F32 = ... # SVP64 value 0b01
73 F16 = ... # SVP64 value 0b10
74 BF16 = ... # SVP64 value 0b11
75
76 * elwid: ElWid or nmigen Value with ElWid as the shape
77 the current element-width
78
79 * vec_el_counts: dict[ElWid, int]
80 a map from `ElWid` values `k` to the number of vector elements
81 required within a partition when `elwid == k`.
82
83 Example:
84 vec_el_counts = {ElWid.I8(==0b11): 8, # 8 vector elements
85 ElWid.I16(==0b10): 4, # 4 vector elements
86 ElWid.I32(==0b01): 2, # 2 vector elements
87 ElWid.I64(==0b00): 1} # 1 vector (aka scalar) element
88
89 Another Example:
90 vec_el_counts = {ElWid.BF16(==0b11): 4, # 4 vector elements
91 ElWid.F16(==0b10): 4, # 4 vector elements
92 ElWid.F32(==0b01): 2, # 2 vector elements
93 ElWid.F64(==0b00): 1} # 1 (aka scalar) vector element
94
95 * lane_shapes: int or Mapping[ElWid, int] (optional)
96 the bit-width of all elements in a SIMD layout.
97 if not provided, the lane_shapes are computed from fixed_width
98 and vec_el_counts at each elwidth.
99
100 * fixed_width: int (optional)
101 the total width of a SIMD vector. One or both of lane_shapes or
102 fixed_width may be provided. Both may not be left out.
103 """
104 # when there are no lane_shapes specified, this indicates a
105 # desire to use the maximum available space based on the fixed width
106 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
107 if lane_shapes is None:
108 assert fixed_width is not None, \
109 "both fixed_width and lane_shapes cannot be None"
110 lane_shapes = {i: fixed_width // vec_el_counts[i]
111 for i in vec_el_counts}
112 print("lane_shapes", fixed_width, lane_shapes)
113
114 # identify if the lane_shapes is a mapping (dict, etc.)
115 # if not, then assume that it is an integer (width) that
116 # needs to be requested across all partitions.
117 # Note: back in SimdScope.Shape(), this is how code that was
118 # formerly scalar can be "converted" to "look like" it is
119 # still scalar, by not altering the width at any of the elwids.
120 if not isinstance(lane_shapes, Mapping):
121 lane_shapes = {i: lane_shapes for i in vec_el_counts}
122
123 # compute a set of partition widths
124 print("lane_shapes", lane_shapes, "vec_el_counts", vec_el_counts)
125 cpart_wid = 0
126 width = 0
127 for i, lwid in lane_shapes.items():
128 required_width = lwid * vec_el_counts[i]
129 print(" required width", cpart_wid, i, lwid, required_width)
130 if required_width > width:
131 cpart_wid = lwid
132 width = required_width
133
134 # calculate the minumum width required if fixed_width specified
135 part_count = max(vec_el_counts.values())
136 print("width", width, cpart_wid, part_count)
137 if fixed_width is not None: # override the width and part_wid
138 assert width <= fixed_width, "not enough space to fit partitions"
139 part_wid = fixed_width // part_count
140 assert part_wid * part_count == fixed_width, \
141 "calculated width not aligned multiples"
142 width = fixed_width
143 print("part_wid", part_wid, "count", part_count, "width", width)
144
145 # create the breakpoints dictionary.
146 # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
147 # https://stackoverflow.com/questions/26367812/
148
149 # first phase: create a dictionary of partition points (dpoints) and
150 # also keep a record of where each element starts and ends (lpoints)
151 dpoints = defaultdict(list) # if empty key, create a (empty) list
152 lpoints = defaultdict(list) # dict of list of start-end points
153 padding_masks = {}
154 always_padding_mask = (1 << width) - 1 # start with all bits padding
155 for i, c in vec_el_counts.items():
156 print("dpoints", i, "count", c)
157 # calculate part_wid based on overall width divided by number
158 # of elements.
159 part_wid = width // c
160
161 padding_mask = (1 << width) - 1 # start with all bits padding
162
163 def add_p(msg, start, p):
164 print(" adding dpoint", msg, start, part_wid, i, c, p)
165 dpoints[p].append(i) # auto-creates list if key non-existent
166 # for each elwidth, create the required number of vector elements
167 for start in range(c):
168 start_bit = start * part_wid
169 end_bit = start_bit + lane_shapes[i]
170 element_mask = (1 << end_bit) - (1 << start_bit)
171 padding_mask &= ~element_mask # remove element from padding_mask
172 lpoints[i].append(range(start_bit, end_bit))
173 add_p("start", start, start_bit) # start of lane
174 add_p("end ", start, end_bit) # end lane
175 padding_masks[i] = padding_mask
176 always_padding_mask &= padding_mask
177
178 # deduplicate dpoints lists
179 for k in dpoints.keys():
180 dpoints[k] = list({i: None for i in dpoints[k]}.keys())
181
182 # do not need the breakpoints at the very start or the very end
183 dpoints.pop(0, None)
184 dpoints.pop(width, None)
185
186 # sort dpoints keys
187 dpoints = dict(sorted(dpoints.items(), key=lambda i: i[0]))
188
189 print("dpoints")
190 pprint(dpoints)
191
192 # second stage, add (map to) the elwidth==i expressions.
193 # TODO: use nmutil.treereduce?
194 points = {}
195 for p in dpoints.keys():
196 points[p] = map(lambda i: elwid == i, dpoints[p])
197 points[p] = reduce(operator.or_, points[p])
198
199 # third stage, create the binary values which *if* elwidth is set to i
200 # *would* result in the mask at that elwidth being set to this value
201 # these can easily be double-checked through Assertion
202 bitp = {}
203 for i in vec_el_counts.keys():
204 bitp[i] = 0
205 for bit_index, (p, elwidths) in enumerate(dpoints.items()):
206 if i in elwidths:
207 bitp[i] |= 1 << bit_index
208
209 # fourth stage: determine which partitions are 100% unused.
210 # these can then be "blanked out"
211
212 # points are the partition separators, not partition indexes
213 partition_ends = [*dpoints.keys(), width]
214 bmask = 0
215 partition_start = 0
216 for bit_index, partition_end in enumerate(partition_ends):
217 pmask = (1 << partition_end) - (1 << partition_start)
218 always_padding = (always_padding_mask & pmask) == pmask
219 if always_padding:
220 bmask |= 1 << bit_index
221 partition_start = partition_end
222 return (PartitionPoints(points), bitp, lpoints, bmask, width, lane_shapes,
223 part_wid)
224
225 # XXX XXX XXX XXX quick tests TODO convert to proper ones but kinda good
226 # enough for now. if adding new tests do not alter or delete the old ones
227 # XXX XXX XXX XXX
228
229 if __name__ == '__main__':
230
231 # for each element-width (elwidth 0-3) the number of Vector Elements is:
232 # elwidth=0b00 QTY 1 partitions: | ? |
233 # elwidth=0b01 QTY 1 partitions: | ? |
234 # elwidth=0b10 QTY 2 partitions: | ? | ? |
235 # elwidth=0b11 QTY 4 partitions: | ? | ? | ? | ? |
236 # actual widths of Signals *within* those partitions is given separately
237 vec_el_counts = {
238 0: 1,
239 1: 1,
240 2: 2,
241 3: 4,
242 }
243
244 # width=3 indicates "same width Vector Elements (3) at all elwidths"
245 # elwidth=0b00 1x 5-bit | unused xx ..3 |
246 # elwidth=0b01 1x 6-bit | unused xx ..3 |
247 # elwidth=0b10 2x 12-bit | xxx ..3 | xxx ..3 |
248 # elwidth=0b11 3x 24-bit | ..3| ..3 | ..3 |..3 |
249 # expected partitions (^) | | | (^)
250 # to be at these points: (|) | | | |
251 width_in_all_parts = 3
252
253 for i in range(4):
254 pprint((i, layout(i, vec_el_counts, width_in_all_parts)))
255
256 # specify that the Vector Element lengths are to be *different* at
257 # each of the elwidths.
258 # combined with vec_el_counts we have:
259 # elwidth=0b00 1x 5-bit |<----unused---------->....5|
260 # elwidth=0b01 1x 6-bit |<----unused--------->.....6|
261 # elwidth=0b10 2x 6-bit |unused>.....6|unused>.....6|
262 # elwidth=0b11 4x 6-bit |.....6|.....6|.....6|.....6|
263 # expected partitions (^) ^ ^ ^^ (^)
264 # to be at these points: (|) | | || (|)
265 # (24) 18 12 65 (0)
266 widths_at_elwidth = {
267 0: 5,
268 1: 6,
269 2: 6,
270 3: 6
271 }
272
273 print("5,6,6,6 elements", widths_at_elwidth)
274 for i in range(4):
275 pp, bitp, lp, bm, b, c, d = \
276 layout(i, vec_el_counts, widths_at_elwidth)
277 pprint((i, (pp, bitp, lp, bm, b, c, d)))
278 # now check that the expected partition points occur
279 print("5,6,6,6 ppt keys", pp.keys())
280 assert list(pp.keys()) == [5, 6, 12, 18]
281 assert bm == 0 # no unused partitions
282
283 # this example was probably what the 5,6,6,6 one was supposed to be.
284 # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
285 # elwidth=0b00 1x 24-bit |.........................24|
286 # elwidth=0b01 1x 12-bit |<--unused--->|...........12|
287 # elwidth=0b10 2x 5 -bit |unused>|....5|unused>|....5|
288 # elwidth=0b11 4x 6 -bit |.....6|.....6|.....6|.....6|
289 # expected partitions (^) ^^ ^ ^^ (^)
290 # to be at these points: (|) || | || (|)
291 # (24) 1817 12 65 (0)
292 widths_at_elwidth = {
293 0: 24, # QTY 1x 24
294 1: 12, # QTY 1x 12
295 2: 5, # QTY 2x 5
296 3: 6 # QTY 4x 6
297 }
298
299 print("24,12,5,6 elements", widths_at_elwidth)
300 for i in range(4):
301 pp, bitp, lp, bm, b, c, d = \
302 layout(i, vec_el_counts, widths_at_elwidth)
303 pprint((i, (pp, bitp, lp, bm, b, c, d)))
304 # now check that the expected partition points occur
305 print("24,12,5,6 ppt keys", pp.keys())
306 assert list(pp.keys()) == [5, 6, 12, 17, 18]
307 print("bmask", bin(bm))
308 assert bm == 0 # no unused partitions
309
310 # this tests elwidth as an actual Signal. layout is allowed to
311 # determine arbitrarily the overall length
312 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
313
314 elwid = Signal(2)
315 pp, bitp, lp, bm, b, c, d = layout(
316 elwid, vec_el_counts, widths_at_elwidth)
317 pprint((pp, lp, b, c, d))
318 for k, v in bitp.items():
319 print("bitp elwidth=%d" % k, bin(v))
320 print("bmask", bin(bm))
321 assert bm == 0 # no unused partitions
322
323 m = Module()
324
325 def process():
326 for i in range(4):
327 yield elwid.eq(i)
328 yield Settle()
329 ppt = []
330 for pval in list(pp.values()):
331 val = yield pval # get nmigen to evaluate pp
332 ppt.append(val)
333 pprint((i, (ppt, b, c, d)))
334 # check the results against bitp static-expected partition points
335 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
336 # https://stackoverflow.com/a/27165694
337 ival = int(''.join(map(str, ppt[::-1])), 2)
338 assert ival == bitp[i]
339
340 sim = Simulator(m)
341 sim.add_process(process)
342 sim.run()
343
344 # this tests elwidth as an actual Signal. layout is *not* allowed to
345 # determine arbitrarily the overall length, it is fixed to 64
346 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
347
348 # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
349 # elwidth=0b00 1x 24-bit
350 # elwidth=0b01 1x 12-bit
351 # elwidth=0b10 2x 5-bit
352 # elwidth=0b11 4x 6-bit
353 #
354 # bmask<--------1<----0<---------10<---0<-------1<0<----0<---0<----00<---0
355 # always unused:| | | || | | | | | | || |
356 # 1111111111000000 1111111111000000 1111111100000000 0000000000000000
357 # | | | || | | | | | | || |
358 # 0b00 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxx........ ..............24|
359 # 0b01 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxx..........12|
360 # 0b10 xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|
361 # 0b11 xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|
362 # ^ ^ ^^ ^ ^ ^ ^ ^ ^^
363 # ppoints: | | || | | | | | ||
364 # | bit-48 /\ | bit-24-/ | | bit-12 /\-bit-5
365 # bit-54 bit-38-/ \ bit-32 | bit-16 /
366 # bit-37 bit-22 bit-6
367
368 elwid = Signal(2)
369 pp, bitp, lp, bm, b, c, d = layout(elwid, vec_el_counts,
370 widths_at_elwidth,
371 fixed_width=64)
372 pprint((pp, lp, b, c, d))
373 for k, v in bitp.items():
374 print("bitp elwidth=%d" % k, bin(v))
375 print("bmask", bin(bm))
376 assert bm == 0b101001000000
377
378 m = Module()
379
380 def process():
381 for i in range(4):
382 yield elwid.eq(i)
383 yield Settle()
384 ppt = []
385 for pval in list(pp.values()):
386 val = yield pval # get nmigen to evaluate pp
387 ppt.append(val)
388 print("test elwidth=%d" % i)
389 pprint((i, (ppt, b, c, d)))
390 # check the results against bitp static-expected partition points
391 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
392 # https://stackoverflow.com/a/27165694
393 ival = int(''.join(map(str, ppt[::-1])), 2)
394 assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
395 bin(bitp[i]))
396
397 sim = Simulator(m)
398 sim.add_process(process)
399 sim.run()
400
401 # fixed_width=32 and no lane_widths says "allocate maximum"
402 # i.e. Vector Element Widths are auto-allocated
403 # elwidth=0b00 1x 32-bit | .................32 |
404 # elwidth=0b01 1x 32-bit | .................32 |
405 # elwidth=0b10 2x 12-bit | ......16 | ......16 |
406 # elwidth=0b11 3x 24-bit | ..8| ..8 | ..8 |..8 |
407 # expected partitions (^) | | | (^)
408 # to be at these points: (|) | | | |
409
410 # TODO, fix this so that it is correct. put it at the end so it
411 # shows that things break and doesn't stop the other tests.
412 print("maximum allocation from fixed_width=32")
413 for i in range(4):
414 pprint((i, layout(i, vec_el_counts, fixed_width=32)))
415
416 # example "exponent"
417 # https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
418 # 1xFP64: 11 bits, one exponent
419 # 2xFP32: 8 bits, two exponents
420 # 4xFP16: 5 bits, four exponents
421 # 4xBF16: 8 bits, four exponents
422 vec_el_counts = {
423 0: 1, # QTY 1x FP64
424 1: 2, # QTY 2x FP32
425 2: 4, # QTY 4x FP16
426 3: 4, # QTY 4x BF16
427 }
428 widths_at_elwidth = {
429 0: 11, # FP64 ew=0b00
430 1: 8, # FP32 ew=0b01
431 2: 5, # FP16 ew=0b10
432 3: 8 # BF16 ew=0b11
433 }
434
435 # expected results:
436 #
437 # |31| | |24| 16|15 | | 8|7 0 |
438 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
439 # 32bit | x| x| x| | x| x| x|10 .... 0 |
440 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
441 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
442 # unused x x
443
444 print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
445 for i in range(4):
446 pp, bitp, lp, bm, b, c, d = \
447 layout(i, vec_el_counts, widths_at_elwidth,
448 fixed_width=32)
449 pprint((i, (pp, lp, bitp, bin(bm), b, c, d)))
450 # now check that the expected partition points occur
451 print("11,8,5,8 pp keys", pp.keys())
452 #assert list(pp.keys()) == [5,6,12,18]
453
454 ###### ######
455 ###### 2nd test, different from the above, elwid=0b10 ==> 11 bit ######
456 ###### ######
457
458 # example "exponent"
459 vec_el_counts = {
460 0: 1, # QTY 1x FP64
461 1: 2, # QTY 2x FP32
462 2: 4, # QTY 4x FP16
463 3: 4, # QTY 4x BF16
464 }
465 widths_at_elwidth = {
466 0: 11, # FP64 ew=0b00
467 1: 11, # FP32 ew=0b01
468 2: 5, # FP16 ew=0b10
469 3: 8 # BF16 ew=0b11
470 }
471
472 # expected results:
473 #
474 # |31| | |24| 16|15 | | 8|7 0 |
475 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
476 # 32bit | x| x| x| | x| x| x|10 .... 0 |
477 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
478 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
479 # unused x x
480
481 print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
482 for i in range(4):
483 pp, bitp, lp, bm, b, c, d = \
484 layout(i, vec_el_counts, widths_at_elwidth,
485 fixed_width=32)
486 pprint((i, (pp, lp, bitp, bin(bm), b, c, d)))
487 # now check that the expected partition points occur
488 print("11,8,5,8 pp keys", pp.keys())
489 #assert list(pp.keys()) == [5,6,12,18]