unfortunately this is a multi-purpose commit.
[ieee754fpu.git] / src / ieee754 / part / layout_experiment.py
1 #!/usr/bin/env python3
2 # SPDX-License-Identifier: LGPL-3-or-later
3 # See Notices.txt for copyright information
4 """
5 Links:
6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
13 """
14
15 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
16 from nmigen.back.pysim import Simulator, Delay, Settle
17 from nmigen.cli import rtlil
18
19 from collections.abc import Mapping
20 from functools import reduce
21 import operator
22 from collections import defaultdict
23 from pprint import pprint
24
25 from ieee754.part_mul_add.partpoints import PartitionPoints
26
27
28 # main fn, which started out here in the bugtracker:
29 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
30 def layout(elwid, signed, part_counts, lane_shapes=None, fixed_width=None):
31 """calculate a SIMD layout.
32
33 Glossary:
34 * element: a single scalar value that is an element of a SIMD vector.
35 it has a width in bits, and a signedness. Every element is made of 1 or
36 more parts.
37 * ElWid: the element-width (really the element type) of an instruction.
38 Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
39 In Python, `ElWid` is either an enum type or is `int`.
40 Example `ElWid` definition for integers:
41
42 class ElWid(Enum):
43 I8 = ...
44 I16 = ...
45 I32 = ...
46 I64 = ...
47
48 Example `ElWid` definition for floats:
49
50 class ElWid(Enum):
51 F16 = ...
52 BF16 = ...
53 F32 = ...
54 F64 = ...
55 * part: A piece of a SIMD vector, every SIMD vector is made of a
56 non-negative integer of parts. Elements are made of a power-of-two
57 number of parts. A part is a fixed number of bits wide for each
58 different SIMD layout, it doesn't vary when `elwid` changes. A part
59 can have a bit width of any non-negative integer, it is not restricted
60 to power-of-two. SIMD vectors should have as few parts as necessary,
61 since some circuits have size proportional to the number of parts.
62
63
64 * elwid: ElWid or nmigen Value with ElWid as the shape
65 the current element-width
66 * signed: bool
67 the signedness of all elements in a SIMD layout
68 * part_counts: dict[ElWid, int]
69 a map from `ElWid` values `k` to the number of parts in an element
70 when `elwid == k`. Values should be minimized, since higher values
71 often create bigger circuits.
72
73 Example:
74 # here, an I8 element is 1 part wide
75 part_counts = {ElWid.I8: 1, ElWid.I16: 2, ElWid.I32: 4, ElWid.I64: 8}
76
77 Another Example:
78 # here, an F16 element is 1 part wide
79 part_counts = {ElWid.F16: 1, ElWid.BF16: 1, ElWid.F32: 2, ElWid.F64: 4}
80 * lane_shapes: int or Mapping[ElWid, int] (optional)
81 the bit-width of all elements in a SIMD layout.
82 * fixed_width: int (optional)
83 the total width of a SIMD vector. One of lane_shapes and fixed_width
84 must be provided.
85 """
86 # when there are no lane_shapes specified, this indicates a
87 # desire to use the maximum available space based on the fixed width
88 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
89 if lane_shapes is None:
90 assert fixed_width is not None, \
91 "both fixed_width and lane_shapes cannot be None"
92 lane_shapes = {i: fixed_width // part_counts[i] for i in part_counts}
93 print("lane_shapes", fixed_width, lane_shapes)
94 # identify if the lane_shapes is a mapping (dict, etc.)
95 # if not, then assume that it is an integer (width) that
96 # needs to be requested across all partitions
97 if not isinstance(lane_shapes, Mapping):
98 lane_shapes = {i: lane_shapes for i in part_counts}
99 # compute a set of partition widths
100 cpart_wid = [-lane_shapes[i] // c for i, c in part_counts.items()]
101 print("cpart_wid", cpart_wid, "part_counts", part_counts)
102 cpart_wid = -min(cpart_wid)
103 part_count = max(part_counts.values())
104 # calculate the minumum width required
105 width = cpart_wid * part_count
106 print("width", width, cpart_wid, part_count)
107 if fixed_width is not None: # override the width and part_wid
108 assert width < fixed_width, "not enough space to fit partitions"
109 part_wid = fixed_width // part_count
110 assert part_wid * part_count == fixed_width, \
111 "calculated width not aligned multiples"
112 width = fixed_width
113 print("part_wid", part_wid, "count", part_count)
114 else:
115 # go with computed width
116 part_wid = cpart_wid
117 # create the breakpoints dictionary.
118 # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
119 # https://stackoverflow.com/questions/26367812/
120 dpoints = defaultdict(list) # if empty key, create a (empty) list
121 for i, c in part_counts.items():
122 def add_p(p):
123 dpoints[p].append(i) # auto-creates list if key non-existent
124 for start in range(0, part_count, c):
125 add_p(start * part_wid) # start of lane
126 add_p(start * part_wid + lane_shapes[i]) # start of padding
127 # do not need the breakpoints at the very start or the very end
128 dpoints.pop(0, None)
129 dpoints.pop(width, None)
130 plist = list(dpoints.keys())
131 plist.sort()
132 print("dpoints")
133 pprint(dict(dpoints))
134 # second stage, add (map to) the elwidth==i expressions.
135 # TODO: use nmutil.treereduce?
136 points = {}
137 for p in plist:
138 points[p] = map(lambda i: elwid == i, dpoints[p])
139 points[p] = reduce(operator.or_, points[p])
140 # third stage, create the binary values which *if* elwidth is set to i
141 # *would* result in the mask at that elwidth being set to this value
142 # these can easily be double-checked through Assertion
143 bitp = {}
144 for i in part_counts.keys():
145 bitp[i] = 0
146 for p, elwidths in dpoints.items():
147 if i in elwidths:
148 bitpos = plist.index(p)
149 bitp[i] |= 1 << bitpos
150 # fourth stage: determine which partitions are 100% unused.
151 # these can then be "blanked out"
152 bmask = (1 << len(plist))-1
153 for p in bitp.values():
154 bmask &= ~p
155 return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
156 part_wid, part_count)
157
158
159 if __name__ == '__main__':
160
161 # for each element-width (elwidth 0-3) the number of partitions is given
162 # elwidth=0b00 QTY 1 partitions: | ? |
163 # elwidth=0b01 QTY 1 partitions: | ? |
164 # elwidth=0b10 QTY 2 partitions: | ? | ? |
165 # elwidth=0b11 QTY 4 partitions: | ? | ? | ? | ? |
166 # actual widths of Signals *within* those partitions is given separately
167 part_counts = {
168 0: 1,
169 1: 1,
170 2: 2,
171 3: 4,
172 }
173
174 # width=3 indicates "we want the same width (3) at all elwidths"
175 # elwidth=0b00 1x 5-bit | ..3 |
176 # elwidth=0b01 1x 6-bit | ..3 |
177 # elwidth=0b10 2x 12-bit | ..3 | ..3 |
178 # elwidth=0b11 3x 24-bit | ..3| ..3 | ..3 |..3 |
179 width_in_all_parts = 3
180
181 for i in range(4):
182 pprint((i, layout(i, True, part_counts, width_in_all_parts)))
183
184 # fixed_width=32 and no lane_widths says "allocate maximum"
185 # elwidth=0b00 1x 32-bit | .................32 |
186 # elwidth=0b01 1x 32-bit | .................32 |
187 # elwidth=0b10 2x 12-bit | ......16 | ......16 |
188 # elwidth=0b11 3x 24-bit | ..8| ..8 | ..8 |..8 |
189
190 #print ("maximum allocation from fixed_width=32")
191 # for i in range(4):
192 # pprint((i, layout(i, True, part_counts, fixed_width=32)))
193
194 # specify that the length is to be *different* at each of the elwidths.
195 # combined with part_counts we have:
196 # elwidth=0b00 1x 5-bit | ....5 |
197 # elwidth=0b01 1x 6-bit | .....6 |
198 # elwidth=0b10 2x 12-bit | ....12 | .....12 |
199 # elwidth=0b11 3x 24-bit | 24 | 24 | 24 | 24 |
200 widths_at_elwidth = {
201 0: 5,
202 1: 6,
203 2: 12,
204 3: 24
205 }
206
207 for i in range(4):
208 pprint((i, layout(i, False, part_counts, widths_at_elwidth)))
209
210 # this tests elwidth as an actual Signal. layout is allowed to
211 # determine arbitrarily the overall length
212 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
213
214 elwid = Signal(2)
215 pp, bitp, bm, b, c, d, e = layout(
216 elwid, False, part_counts, widths_at_elwidth)
217 pprint((pp, b, c, d, e))
218 for k, v in bitp.items():
219 print("bitp elwidth=%d" % k, bin(v))
220 print("bmask", bin(bm))
221
222 m = Module()
223
224 def process():
225 for i in range(4):
226 yield elwid.eq(i)
227 yield Settle()
228 ppt = []
229 for pval in list(pp.values()):
230 val = yield pval # get nmigen to evaluate pp
231 ppt.append(val)
232 pprint((i, (ppt, b, c, d, e)))
233 # check the results against bitp static-expected partition points
234 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
235 # https://stackoverflow.com/a/27165694
236 ival = int(''.join(map(str, ppt[::-1])), 2)
237 assert ival == bitp[i]
238
239 sim = Simulator(m)
240 sim.add_process(process)
241 sim.run()
242
243 # this tests elwidth as an actual Signal. layout is *not* allowed to
244 # determine arbitrarily the overall length, it is fixed to 64
245 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
246
247 elwid = Signal(2)
248 pp, bitp, bm, b, c, d, e = layout(elwid, False, part_counts, widths_at_elwidth,
249 fixed_width=64)
250 pprint((pp, b, c, d, e))
251 for k, v in bitp.items():
252 print("bitp elwidth=%d" % k, bin(v))
253 print("bmask", bin(bm))
254
255 m = Module()
256
257 def process():
258 for i in range(4):
259 yield elwid.eq(i)
260 yield Settle()
261 ppt = []
262 for pval in list(pp.values()):
263 val = yield pval # get nmigen to evaluate pp
264 ppt.append(val)
265 print("test elwidth=%d" % i)
266 pprint((i, (ppt, b, c, d, e)))
267 # check the results against bitp static-expected partition points
268 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
269 # https://stackoverflow.com/a/27165694
270 ival = int(''.join(map(str, ppt[::-1])), 2)
271 assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
272 bin(bitp[i]))
273
274 sim = Simulator(m)
275 sim.add_process(process)
276 sim.run()