2 # SPDX-License-Identifier: LGPL-3-or-later
3 # See Notices.txt for copyright information
"""
Links:
* https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
* https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
* https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
* https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
* https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
* https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
* https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
"""
from collections import defaultdict
from collections.abc import Mapping
from functools import reduce
import operator
from pprint import pprint

from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
from nmigen.back.pysim import Simulator, Delay, Settle
from nmigen.cli import rtlil

from ieee754.part_mul_add.partpoints import PartitionPoints
28 # XXX MAKE SURE TO PRESERVE ALL THESE COMMENTS XXX
30 # main fn, which started out here in the bugtracker:
31 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
32 # note that signed is **NOT** part of the layout, and will NOT
33 # be added (because it is not relevant or appropriate).
34 # sign belongs in ast.Shape and is the only appropriate location.
35 # there is absolutely nothing within this function that in any
36 # way requires a sign. it is *purely* performing numerical width
37 # computations that have absolutely nothing to do with whether the
38 # actual data is signed or unsigned.
40 # context for parameters:
41 # http://lists.libre-soc.org/pipermail/libre-soc-dev/2021-October/003921.html
42 # XXX tempted to suggest that this function remain as a function, because
43 # it takes all the context it needs as parameters. its usefulness goes
44 # beyond a single class, and there is actually nothing realistically
# that it needs which is context-sensitive.  therefore, on balance,
46 # it should remain a function
def layout(elwid,  # comes from SimdScope constructor
           vec_el_counts,  # comes from SimdScope constructor
           lane_shapes=None,  # from SimdScope.Signal via a SimdShape
           fixed_width=None):  # from SimdScope.Signal via a SimdShape
    """calculate a SIMD layout.

    Glossary:

    * element: a single scalar value that is an element of a SIMD vector.
        it has a width in bits.
    * ElWid: the element-width (really the element type) of an instruction.
        Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
        In Python, `ElWid` is either an enum type or is `int`.
        Example `ElWid` definition for integers:

        class ElWid(Enum):
            I64 = ...       # SVP64 value 0b00
            I32 = ...       # SVP64 value 0b01
            I16 = ...       # SVP64 value 0b10
            I8 = ...        # SVP64 value 0b11

        Example `ElWid` definition for floats:

        class ElWid(Enum):
            F64 = ...       # SVP64 value 0b00
            F32 = ...       # SVP64 value 0b01
            F16 = ...       # SVP64 value 0b10
            BF16 = ...      # SVP64 value 0b11

    Parameters:

    * elwid: ElWid or nmigen Value with ElWid as the shape
        the current element-width.  it is only ever compared (`==`)
        against the keys of vec_el_counts.

    * vec_el_counts: dict[ElWid, int]
        a map from `ElWid` values `k` to the number of vector elements
        required within a partition when `elwid == k`.

        Example for integers:

        vec_el_counts = {ElWid.I8(==0b11): 8,   # 8 vector elements
                         ElWid.I16(==0b10): 4,  # 4 vector elements
                         ElWid.I32(==0b01): 2,  # 2 vector elements
                         ElWid.I64(==0b00): 1}  # 1 vector (aka scalar) element

        Example for floats:

        vec_el_counts = {ElWid.BF16(==0b11): 4,  # 4 vector elements
                         ElWid.F16(==0b10): 4,   # 4 vector elements
                         ElWid.F32(==0b01): 2,   # 2 vector elements
                         ElWid.F64(==0b00): 1}   # 1 (aka scalar) vector element

    * lane_shapes: int or Mapping[ElWid, int] (optional)
        the bit-width of all elements in a SIMD layout.
        if not provided, the lane_shapes are computed from fixed_width
        and vec_el_counts at each elwidth.

    * fixed_width: int (optional)
        the total width of a SIMD vector. One or both of lane_shapes or
        fixed_width may be provided.  Both may not be left out.

    Returns a 6-tuple:

    * PartitionPoints(points): for every breakpoint bit-position, the OR
        of `elwid == k` over each k that has an element starting or
        ending at that position
    * bitp: dict mapping each key of vec_el_counts to an int in which bit
        N is set when the N-th breakpoint (ascending bit-position) is
        active at that element-width
    * bmask: int in which bit N is set when the N-th partition consists
        solely of bits that are padding at every element-width
    * width: the overall width in bits (fixed_width when that is given)
    * lane_shapes: the per-elwidth element widths, always as a mapping
    * part_wid: overall width divided by the element count of the last
        entry iterated from vec_el_counts

    Raises AssertionError when both lane_shapes and fixed_width are None,
    when the elements do not fit into fixed_width, or when fixed_width is
    not a multiple of the largest element count.
    """
    # when there are no lane_shapes specified, this indicates a
    # desire to use the maximum available space based on the fixed width
    # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
    if lane_shapes is None:
        assert fixed_width is not None, \
            "both fixed_width and lane_shapes cannot be None"
        lane_shapes = {i: fixed_width // vec_el_counts[i]
                       for i in vec_el_counts}
        print("lane_shapes", fixed_width, lane_shapes)

    # identify if the lane_shapes is a mapping (dict, etc.)
    # if not, then assume that it is an integer (width) that
    # needs to be requested across all partitions
    if not isinstance(lane_shapes, Mapping):
        lane_shapes = {i: lane_shapes for i in vec_el_counts}

    # compute a set of partition widths
    print("lane_shapes", lane_shapes, "vec_el_counts", vec_el_counts)
    cpart_wid = 0  # element width at the widest layout found so far
    width = 0      # widest (element width * element count) found so far
    for i, lwid in lane_shapes.items():
        required_width = lwid * vec_el_counts[i]
        print(" required width", cpart_wid, i, lwid, required_width)
        if required_width > width:
            cpart_wid = lwid
            width = required_width

    # calculate the minimum width required if fixed_width specified
    part_count = max(vec_el_counts.values())
    print("width", width, cpart_wid, part_count)
    if fixed_width is not None:  # override the width and part_wid
        assert width <= fixed_width, "not enough space to fit partitions"
        part_wid = fixed_width // part_count
        assert part_wid * part_count == fixed_width, \
            "calculated width not aligned multiples"
        width = fixed_width
        print("part_wid", part_wid, "count", part_count, "width", width)

    # create the breakpoints dictionary.
    # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
    # https://stackoverflow.com/questions/26367812/
    dpoints = defaultdict(list)  # if empty key, create a (empty) list
    lpoints = defaultdict(list)  # dict of list of start-end points
    padding_masks = {}           # per-elwidth mask of bits used by no element
    always_padding_mask = (1 << width) - 1  # start with all bits padding
    for i, c in vec_el_counts.items():
        print("dpoints", i, "count", c)
        # calculate part_wid based on overall width divided by number
        # of elements.
        part_wid = width // c

        padding_mask = (1 << width) - 1  # start with all bits padding

        # helper is only called within this same loop iteration, so the
        # loop variables it closes over (part_wid, i, c) are current
        def add_p(msg, start, p):
            print(" adding dpoint", msg, start, part_wid, i, c, p)
            dpoints[p].append(i)  # auto-creates list if key non-existent
        # for each elwidth, create the required number of vector elements
        for start in range(c):
            start_bit = start * part_wid
            end_bit = start_bit + lane_shapes[i]
            element_mask = (1 << end_bit) - (1 << start_bit)
            padding_mask &= ~element_mask  # remove element from padding_mask
            lpoints[i].append(range(start_bit, end_bit))
            add_p("start", start, start_bit)  # start of lane
            add_p("end ", start, end_bit)  # end lane
        padding_masks[i] = padding_mask
        always_padding_mask &= padding_mask

    # deduplicate dpoints lists (dict keys keep first-seen order)
    for k in dpoints:
        dpoints[k] = list(dict.fromkeys(dpoints[k]))

    # do not need the breakpoints at the very start or the very end
    dpoints.pop(0, None)
    dpoints.pop(width, None)

    # sort dpoints keys: breakpoint N must be the N-th lowest bit-position
    dpoints = dict(sorted(dpoints.items(), key=lambda i: i[0]))

    # second stage, add (map to) the elwidth==i expressions.
    # TODO: use nmutil.treereduce?
    points = {}
    for p, elwidths in dpoints.items():
        points[p] = reduce(operator.or_, map(lambda i: elwid == i, elwidths))

    # third stage, create the binary values which *if* elwidth is set to i
    # *would* result in the mask at that elwidth being set to this value
    # these can easily be double-checked through Assertion
    bitp = {}
    for i in vec_el_counts:
        bitp[i] = 0
        for bit_index, (p, elwidths) in enumerate(dpoints.items()):
            if i in elwidths:
                bitp[i] |= 1 << bit_index

    # fourth stage: determine which partitions are 100% unused.
    # these can then be "blanked out"
    bmask = 0
    # points are the partition separators, not partition indexes
    partition_ends = [*dpoints.keys(), width]
    partition_start = 0
    for bit_index, partition_end in enumerate(partition_ends):
        pmask = (1 << partition_end) - (1 << partition_start)
        always_padding = (always_padding_mask & pmask) == pmask
        if always_padding:
            bmask |= 1 << bit_index
        partition_start = partition_end
    return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
            part_wid)
219 # XXX XXX XXX XXX quick tests TODO convert to proper ones but kinda good
220 # enough for now. if adding new tests do not alter or delete the old ones
223 if __name__
== '__main__':
225 # for each element-width (elwidth 0-3) the number of Vector Elements is:
226 # elwidth=0b00 QTY 1 partitions: | ? |
227 # elwidth=0b01 QTY 1 partitions: | ? |
228 # elwidth=0b10 QTY 2 partitions: | ? | ? |
229 # elwidth=0b11 QTY 4 partitions: | ? | ? | ? | ? |
230 # actual widths of Signals *within* those partitions is given separately
238 # width=3 indicates "same width Vector Elements (3) at all elwidths"
    # elwidth=0b00 1x 3-bit | unused xx ..3 |
    # elwidth=0b01 1x 3-bit | unused xx ..3 |
    # elwidth=0b10 2x 3-bit | xxx ..3 | xxx ..3 |
    # elwidth=0b11 4x 3-bit | ..3| ..3 | ..3 |..3 |
243 # expected partitions (^) | | | (^)
244 # to be at these points: (|) | | | |
245 width_in_all_parts
= 3
248 pprint((i
, layout(i
, vec_el_counts
, width_in_all_parts
)))
250 # specify that the Vector Element lengths are to be *different* at
251 # each of the elwidths.
252 # combined with vec_el_counts we have:
253 # elwidth=0b00 1x 5-bit |<----unused---------->....5|
254 # elwidth=0b01 1x 6-bit |<----unused--------->.....6|
255 # elwidth=0b10 2x 6-bit |unused>.....6|unused>.....6|
256 # elwidth=0b11 4x 6-bit |.....6|.....6|.....6|.....6|
257 # expected partitions (^) ^ ^ ^^ (^)
258 # to be at these points: (|) | | || (|)
260 widths_at_elwidth
= {
267 print("5,6,6,6 elements", widths_at_elwidth
)
269 pp
, bitp
, bm
, b
, c
, d
= \
270 layout(i
, vec_el_counts
, widths_at_elwidth
)
271 pprint((i
, (pp
, bitp
, bm
, b
, c
, d
)))
272 # now check that the expected partition points occur
273 print("5,6,6,6 ppt keys", pp
.keys())
274 assert list(pp
.keys()) == [5, 6, 12, 18]
275 assert bm
== 0 # no unused partitions
277 # this example was probably what the 5,6,6,6 one was supposed to be.
278 # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
279 # elwidth=0b00 1x 24-bit |.........................24|
280 # elwidth=0b01 1x 12-bit |<--unused--->|...........12|
281 # elwidth=0b10 2x 5 -bit |unused>|....5|unused>|....5|
282 # elwidth=0b11 4x 6 -bit |.....6|.....6|.....6|.....6|
283 # expected partitions (^) ^^ ^ ^^ (^)
284 # to be at these points: (|) || | || (|)
285 # (24) 1817 12 65 (0)
286 widths_at_elwidth
= {
293 print("24,12,5,6 elements", widths_at_elwidth
)
295 pp
, bitp
, bm
, b
, c
, d
= \
296 layout(i
, vec_el_counts
, widths_at_elwidth
)
297 pprint((i
, (pp
, bitp
, bm
, b
, c
, d
)))
298 # now check that the expected partition points occur
299 print("24,12,5,6 ppt keys", pp
.keys())
300 assert list(pp
.keys()) == [5, 6, 12, 17, 18]
301 print("bmask", bin(bm
))
302 assert bm
== 0 # no unused partitions
304 # this tests elwidth as an actual Signal. layout is allowed to
305 # determine arbitrarily the overall length
306 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
309 pp
, bitp
, bm
, b
, c
, d
= layout(
310 elwid
, vec_el_counts
, widths_at_elwidth
)
311 pprint((pp
, b
, c
, d
))
312 for k
, v
in bitp
.items():
313 print("bitp elwidth=%d" % k
, bin(v
))
314 print("bmask", bin(bm
))
315 assert bm
== 0 # no unused partitions
324 for pval
in list(pp
.values()):
325 val
= yield pval
# get nmigen to evaluate pp
327 pprint((i
, (ppt
, b
, c
, d
)))
328 # check the results against bitp static-expected partition points
329 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
330 # https://stackoverflow.com/a/27165694
331 ival
= int(''.join(map(str, ppt
[::-1])), 2)
332 assert ival
== bitp
[i
]
335 sim
.add_process(process
)
338 # this tests elwidth as an actual Signal. layout is *not* allowed to
339 # determine arbitrarily the overall length, it is fixed to 64
340 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
342 # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
343 # elwidth=0b00 1x 24-bit
344 # elwidth=0b01 1x 12-bit
345 # elwidth=0b10 2x 5-bit
346 # elwidth=0b11 4x 6-bit
348 # bmask<--------1<----0<---------10<---0<-------1<0<----0<---0<----00<---0
349 # always unused:| | | || | | | | | | || |
350 # 1111111111000000 1111111111000000 1111111100000000 0000000000000000
351 # | | | || | | | | | | || |
352 # 0b00 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxx........ ..............24|
353 # 0b01 xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx xxxx..........12|
354 # 0b10 xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|xxxxxxxxxxxxxxxx xxxxxxxxxxx....5|
355 # 0b11 xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|xxxxxxxxxx.....6|
356 # ^ ^ ^^ ^ ^ ^ ^ ^ ^^
357 # ppoints: | | || | | | | | ||
358 # | bit-48 /\ | bit-24-/ | | bit-12 /\-bit-5
359 # bit-54 bit-38-/ \ bit-32 | bit-16 /
360 # bit-37 bit-22 bit-6
363 pp
, bitp
, bm
, b
, c
, d
= layout(elwid
, vec_el_counts
,
366 pprint((pp
, b
, c
, d
))
367 for k
, v
in bitp
.items():
368 print("bitp elwidth=%d" % k
, bin(v
))
369 print("bmask", bin(bm
))
370 assert bm
== 0b101001000000
379 for pval
in list(pp
.values()):
380 val
= yield pval
# get nmigen to evaluate pp
382 print("test elwidth=%d" % i
)
383 pprint((i
, (ppt
, b
, c
, d
)))
384 # check the results against bitp static-expected partition points
385 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
386 # https://stackoverflow.com/a/27165694
387 ival
= int(''.join(map(str, ppt
[::-1])), 2)
388 assert ival
== bitp
[i
], "ival %s actual %s" % (bin(ival
),
392 sim
.add_process(process
)
395 # fixed_width=32 and no lane_widths says "allocate maximum"
396 # i.e. Vector Element Widths are auto-allocated
397 # elwidth=0b00 1x 32-bit | .................32 |
398 # elwidth=0b01 1x 32-bit | .................32 |
    # elwidth=0b10 2x 16-bit | ......16 | ......16 |
    # elwidth=0b11 4x 8-bit | ..8| ..8 | ..8 |..8 |
401 # expected partitions (^) | | | (^)
402 # to be at these points: (|) | | | |
404 # TODO, fix this so that it is correct. put it at the end so it
405 # shows that things break and doesn't stop the other tests.
406 print("maximum allocation from fixed_width=32")
408 pprint((i
, layout(i
, vec_el_counts
, fixed_width
=32)))
411 # https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
412 # 1xFP64: 11 bits, one exponent
413 # 2xFP32: 8 bits, two exponents
414 # 4xFP16: 5 bits, four exponents
415 # 4xBF16: 8 bits, four exponents
422 widths_at_elwidth
= {
423 0: 11, # FP64 ew=0b00
431 # |31| | |24| 16|15 | | 8|7 0 |
432 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
433 # 32bit | x| x| x| | x| x| x|10 .... 0 |
434 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
435 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
438 print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth
)
440 pp
, bitp
, bm
, b
, c
, d
= \
441 layout(i
, vec_el_counts
, widths_at_elwidth
,
443 pprint((i
, (pp
, bitp
, bin(bm
), b
, c
, d
)))
444 # now check that the expected partition points occur
445 print("11,8,5,8 pp keys", pp
.keys())
446 #assert list(pp.keys()) == [5,6,12,18]
449 ###### 2nd test, different from the above, elwid=0b10 ==> 11 bit ######
459 widths_at_elwidth
= {
460 0: 11, # FP64 ew=0b00
461 1: 11, # FP32 ew=0b01
468 # |31| | |24| 16|15 | | 8|7 0 |
469 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
470 # 32bit | x| x| x| | x| x| x|10 .... 0 |
471 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
472 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
475 print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth
)
477 pp
, bitp
, bm
, b
, c
, d
= \
478 layout(i
, vec_el_counts
, widths_at_elwidth
,
480 pprint((i
, (pp
, bitp
, bin(bm
), b
, c
, d
)))
481 # now check that the expected partition points occur
482 print("11,8,5,8 pp keys", pp
.keys())
483 #assert list(pp.keys()) == [5,6,12,18]