whitespace for clarity. comments
[ieee754fpu.git] / src / ieee754 / part / layout_experiment.py
1 #!/usr/bin/env python3
2 # SPDX-License-Identifier: LGPL-3-or-later
3 # See Notices.txt for copyright information
4 """
5 Links:
6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
13 """
14
15 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
16 from nmigen.back.pysim import Simulator, Delay, Settle
17 from nmigen.cli import rtlil
18
19 from collections.abc import Mapping
20 from functools import reduce
21 import operator
22 from collections import defaultdict
23 from pprint import pprint
24
25 from ieee754.part_mul_add.partpoints import PartitionPoints
26
27
28 # main fn, which started out here in the bugtracker:
29 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
30 def layout(elwid, signed, vec_el_counts, lane_shapes=None, fixed_width=None):
31 """calculate a SIMD layout.
32
33 Glossary:
34 * element: a single scalar value that is an element of a SIMD vector.
35 it has a width in bits, and a signedness. Every element is made of 1 or
36 more parts.
37 * ElWid: the element-width (really the element type) of an instruction.
38 Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
39 In Python, `ElWid` is either an enum type or is `int`.
40 Example `ElWid` definition for integers:
41
42 class ElWid(Enum):
43 I64 = ... # SVP64 value 0b00
44 I32 = ... # SVP64 value 0b01
45 I16 = ... # SVP64 value 0b10
46 I8 = ... # SVP64 value 0b11
47
48 Example `ElWid` definition for floats:
49
50 class ElWid(Enum):
51 F64 = ... # SVP64 value 0b00
52 F32 = ... # SVP64 value 0b01
53 F16 = ... # SVP64 value 0b10
54 BF16 = ... # SVP64 value 0b11
55
56 # XXX this is redundant and out-of-date with respect to the
57 # clarification that the input is in counts of *elements*
58 # *NOT* "fixed width parts".
59 # fixed-width parts results in 14 such parts being created
60 # when 5 will do, for a simple example 5-6-6-6
61 * part: A piece of a SIMD vector, every SIMD vector is made of a
62 non-negative integer of parts. Elements are made of a power-of-two
63 number of parts. A part is a fixed number of bits wide for each
64 different SIMD layout, it doesn't vary when `elwid` changes. A part
65 can have a bit width of any non-negative integer, it is not restricted
66 to power-of-two. SIMD vectors should have as few parts as necessary,
67 since some circuits have size proportional to the number of parts.
68
69 * elwid: ElWid or nmigen Value with ElWid as the shape
70 the current element-width
71
72 * signed: bool
73 the signedness of all elements in a SIMD layout
74 * vec_el_counts: dict[ElWid, int]
75 a map from `ElWid` values `k` to the number of vector elements
76 required within a partition when `elwid == k`.
77
78 Example:
79 vec_el_counts = {ElWid.I8(==0b11): 8, # 8 vector elements
80 ElWid.I16(==0b10): 4, # 4 vector elements
81 ElWid.I32(==0b01): 2, # 2 vector elements
82 ElWid.I64(==0b00): 1} # 1 vector (aka scalar) element
83
84 Another Example:
85 vec_el_counts = {ElWid.BF16(==0b11): 4, # 4 vector elements
86 ElWid.F16(==0b10): 4, # 4 vector elements
87 ElWid.F32(==0b01): 2, # 2 vector elements
88 ElWid.F64(==0b00): 1} # 1 (aka scalar) vector element
89
90 * lane_shapes: int or Mapping[ElWid, int] (optional)
91 the bit-width of all elements in a SIMD layout.
92 if not provided, the lane_shapes are computed from fixed_width
93 and vec_el_counts at each elwidth.
94
95 * fixed_width: int (optional)
96 the total width of a SIMD vector. One or both of lane_shapes or
97 fixed_width may be provided. Both may not be left out.
98 """
99 # when there are no lane_shapes specified, this indicates a
100 # desire to use the maximum available space based on the fixed width
101 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
102 if lane_shapes is None:
103 assert fixed_width is not None, \
104 "both fixed_width and lane_shapes cannot be None"
105 lane_shapes = {i: fixed_width // vec_el_counts[i]
106 for i in vec_el_counts}
107 print("lane_shapes", fixed_width, lane_shapes)
108
109 # identify if the lane_shapes is a mapping (dict, etc.)
110 # if not, then assume that it is an integer (width) that
111 # needs to be requested across all partitions
112 if not isinstance(lane_shapes, Mapping):
113 lane_shapes = {i: lane_shapes for i in vec_el_counts}
114
115 # compute a set of partition widths
116 print("lane_shapes", lane_shapes, "vec_el_counts", vec_el_counts)
117 cpart_wid = max(lane_shapes.values())
118 part_count = max(vec_el_counts.values())
119 # calculate the minumum width required
120 width = cpart_wid * part_count
121 print("width", width, cpart_wid, part_count)
122 if fixed_width is not None: # override the width and part_wid
123 assert width < fixed_width, "not enough space to fit partitions"
124 part_wid = fixed_width // part_count
125 assert part_wid * part_count == fixed_width, \
126 "calculated width not aligned multiples"
127 width = fixed_width
128 print("part_wid", part_wid, "count", part_count)
129
130 # create the breakpoints dictionary.
131 # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
132 # https://stackoverflow.com/questions/26367812/
133 dpoints = defaultdict(list) # if empty key, create a (empty) list
134 for i, c in vec_el_counts.items():
135 # calculate part_wid based on overall width divided by number
136 # of elements.
137 part_wid = width // c
138 def add_p(p):
139 dpoints[p].append(i) # auto-creates list if key non-existent
140 # for each elwidth, create the required number of vector elements
141 for start in range(c):
142 add_p(start * part_wid) # start of lane
143 add_p(start * part_wid + lane_shapes[i]) # start of padding
144
145 # do not need the breakpoints at the very start or the very end
146 dpoints.pop(0, None)
147 dpoints.pop(width, None)
148 plist = list(dpoints.keys())
149 plist.sort()
150 print("dpoints")
151 pprint(dict(dpoints))
152
153 # second stage, add (map to) the elwidth==i expressions.
154 # TODO: use nmutil.treereduce?
155 points = {}
156 for p in plist:
157 points[p] = map(lambda i: elwid == i, dpoints[p])
158 points[p] = reduce(operator.or_, points[p])
159
160 # third stage, create the binary values which *if* elwidth is set to i
161 # *would* result in the mask at that elwidth being set to this value
162 # these can easily be double-checked through Assertion
163 bitp = {}
164 for i in vec_el_counts.keys():
165 bitp[i] = 0
166 for p, elwidths in dpoints.items():
167 if i in elwidths:
168 bitpos = plist.index(p)
169 bitp[i] |= 1 << bitpos
170
171 # fourth stage: determine which partitions are 100% unused.
172 # these can then be "blanked out"
173 bmask = (1 << len(plist))-1
174 for p in bitp.values():
175 bmask &= ~p
176 return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
177 part_wid, part_count)
178
179
180 if __name__ == '__main__':
181
182 # for each element-width (elwidth 0-3) the number of Vector Elements is:
183 # elwidth=0b00 QTY 1 partitions: | ? |
184 # elwidth=0b01 QTY 1 partitions: | ? |
185 # elwidth=0b10 QTY 2 partitions: | ? | ? |
186 # elwidth=0b11 QTY 4 partitions: | ? | ? | ? | ? |
187 # actual widths of Signals *within* those partitions is given separately
188 vec_el_counts = {
189 0: 1,
190 1: 1,
191 2: 2,
192 3: 4,
193 }
194
195 # width=3 indicates "same width Vector Elements (3) at all elwidths"
196 # elwidth=0b00 1x 5-bit | unused xx ..3 |
197 # elwidth=0b01 1x 6-bit | unused xx ..3 |
198 # elwidth=0b10 2x 12-bit | xxx ..3 | xxx ..3 |
199 # elwidth=0b11 3x 24-bit | ..3| ..3 | ..3 |..3 |
200 # expected partitions (^) | | | (^)
201 # to be at these points: (|) | | | |
202 width_in_all_parts = 3
203
204 for i in range(4):
205 pprint((i, layout(i, True, vec_el_counts, width_in_all_parts)))
206
207 # specify that the Vector Element lengths are to be *different* at
208 # each of the elwidths.
209 # combined with vec_el_counts we have:
210 # elwidth=0b00 1x 5-bit |<----unused----------->....5|
211 # elwidth=0b01 1x 6-bit |<----unused---------->.....6|
212 # elwidth=0b10 2x 12-bit |unused>.....6|unused->.....6|
213 # elwidth=0b11 3x 24-bit |.....6|.....6| .....6|.....6|
214 # expected partitions (^) ^ ^ ^^ (^)
215 # to be at these points: (|) | | || (|)
216 # (24) 18 12 65 (0)
217 widths_at_elwidth = {
218 0: 5,
219 1: 6,
220 2: 6,
221 3: 6
222 }
223
224 print ("5,6,6,6 elements", widths_at_elwidth)
225 for i in range(4):
226 pp, bitp, bm, b, c, d, e = \
227 layout(i, False, vec_el_counts, widths_at_elwidth)
228 pprint((i, (pp, bitp, bm, b, c, d, e)))
229 # now check that the expected partition points occur
230 print("5,6,6,6 ppt keys", pp.keys())
231 assert list(pp.keys()) == [5,6,12,18]
232
233
234 # this tests elwidth as an actual Signal. layout is allowed to
235 # determine arbitrarily the overall length
236 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
237
238 elwid = Signal(2)
239 pp, bitp, bm, b, c, d, e = layout(
240 elwid, False, vec_el_counts, widths_at_elwidth)
241 pprint((pp, b, c, d, e))
242 for k, v in bitp.items():
243 print("bitp elwidth=%d" % k, bin(v))
244 print("bmask", bin(bm))
245
246 m = Module()
247
248 def process():
249 for i in range(4):
250 yield elwid.eq(i)
251 yield Settle()
252 ppt = []
253 for pval in list(pp.values()):
254 val = yield pval # get nmigen to evaluate pp
255 ppt.append(val)
256 pprint((i, (ppt, b, c, d, e)))
257 # check the results against bitp static-expected partition points
258 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
259 # https://stackoverflow.com/a/27165694
260 ival = int(''.join(map(str, ppt[::-1])), 2)
261 assert ival == bitp[i]
262
263 sim = Simulator(m)
264 sim.add_process(process)
265 sim.run()
266
267 # this tests elwidth as an actual Signal. layout is *not* allowed to
268 # determine arbitrarily the overall length, it is fixed to 64
269 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
270
271 elwid = Signal(2)
272 pp, bitp, bm, b, c, d, e = layout(elwid, False, vec_el_counts,
273 widths_at_elwidth,
274 fixed_width=64)
275 pprint((pp, b, c, d, e))
276 for k, v in bitp.items():
277 print("bitp elwidth=%d" % k, bin(v))
278 print("bmask", bin(bm))
279
280 m = Module()
281
282 def process():
283 for i in range(4):
284 yield elwid.eq(i)
285 yield Settle()
286 ppt = []
287 for pval in list(pp.values()):
288 val = yield pval # get nmigen to evaluate pp
289 ppt.append(val)
290 print("test elwidth=%d" % i)
291 pprint((i, (ppt, b, c, d, e)))
292 # check the results against bitp static-expected partition points
293 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
294 # https://stackoverflow.com/a/27165694
295 ival = int(''.join(map(str, ppt[::-1])), 2)
296 assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
297 bin(bitp[i]))
298
299 sim = Simulator(m)
300 sim.add_process(process)
301 sim.run()
302
303 # fixed_width=32 and no lane_widths says "allocate maximum"
304 # i.e. Vector Element Widths are auto-allocated
305 # elwidth=0b00 1x 32-bit | .................32 |
306 # elwidth=0b01 1x 32-bit | .................32 |
307 # elwidth=0b10 2x 12-bit | ......16 | ......16 |
308 # elwidth=0b11 3x 24-bit | ..8| ..8 | ..8 |..8 |
309 # expected partitions (^) | | | (^)
310 # to be at these points: (|) | | | |
311
312 # TODO, fix this so that it is correct. put it at the end so it
313 # shows that things break and doesn't stop the other tests.
314 print ("maximum allocation from fixed_width=32")
315 for i in range(4):
316 pprint((i, layout(i, True, vec_el_counts, fixed_width=32)))
317