1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
5 from nmigen
import Signal
, Module
, Value
, Elaboratable
, Cat
, C
, Mux
, Repl
6 from nmigen
.hdl
.ast
import Assign
7 from abc
import ABCMeta
, abstractmethod
8 from nmigen
.cli
import main
9 from functools
import reduce
10 from operator
import or_
class PartitionPoints(dict):
    """Partition points and corresponding ``Value``s.

    The points at where an ALU is partitioned along with ``Value``s that
    specify if the corresponding partition points are enabled.

    For example: ``{1: True, 5: True, 10: True}`` with
    ``width == 16`` specifies that the ALU is split into 4 sections:
    * bits 0 <= ``i`` < 1
    * bits 1 <= ``i`` < 5
    * bits 5 <= ``i`` < 10
    * bits 10 <= ``i`` < 16

    If the partition_points were instead ``{1: True, 5: a, 10: True}``
    where ``a`` is a 1-bit ``Signal``:
    * If ``a`` is asserted:
        * bits 0 <= ``i`` < 1
        * bits 1 <= ``i`` < 5
        * bits 5 <= ``i`` < 10
        * bits 10 <= ``i`` < 16
    * Otherwise
        * bits 0 <= ``i`` < 1
        * bits 1 <= ``i`` < 10
        * bits 10 <= ``i`` < 16
    """

    def __init__(self, partition_points=None):
        """Create a new ``PartitionPoints``.

        :param partition_points: the input partition points to values mapping.
        :raises TypeError: if a key is not an ``int``.
        :raises ValueError: if a key is negative.
        """
        super().__init__()
        if partition_points is not None:
            for point, enabled in partition_points.items():
                if not isinstance(point, int):
                    raise TypeError("point must be a non-negative integer")
                if point < 0:
                    raise ValueError("point must be a non-negative integer")
                # store every enable as an nmigen ``Value`` so that
                # ``.shape()`` / ``.eq()`` can be used on it later
                self[point] = Value.wrap(enabled)

    def like(self, name=None, src_loc_at=0):
        """Create a new ``PartitionPoints`` with ``Signal``s for all values.

        :param name: the base name for the new ``Signal``s.
        :param src_loc_at: extra stack depth used when the name is derived
            from the caller's source location.
        :returns: a new ``PartitionPoints`` with the same keys, each mapped
            to a fresh ``Signal`` named ``{name}_{point}``.
        """
        if name is None:
            # a throw-away Signal is created only to pick up the name of the
            # variable being assigned in the caller's frame
            name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
        retval = PartitionPoints()
        for point, enabled in self.items():
            retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
        return retval

    def eq(self, rhs):
        """Assign ``PartitionPoints`` using ``Signal.eq``.

        This is a generator: it yields one assignment per partition point.

        :param rhs: mapping with exactly the same set of points as ``self``.
        :raises ValueError: if the two point sets differ (raised when the
            generator is first advanced).
        """
        if set(self.keys()) != set(rhs.keys()):
            raise ValueError("incompatible point set")
        for point, enabled in self.items():
            yield enabled.eq(rhs[point])

    def as_mask(self, width):
        """Create a bit-mask from `self`.

        Each bit in the returned mask is clear only if the partition point at
        the same bit-index is enabled.

        :param width: the bit width of the resulting mask
        """
        bits = []
        for i in range(width):
            if i in self:
                bits.append(~self[i])
            else:
                # no partition point at this index: bit is always set
                bits.append(True)
        return Cat(*bits)

    def get_max_partition_count(self, width):
        """Get the maximum number of partitions.

        Gets the number of partitions when all partition points are enabled.

        :param width: only points strictly below ``width`` are counted.
        """
        return 1 + sum(1 for point in self if point < width)

    def fits_in_width(self, width):
        """Check if all partition points are smaller than `width`."""
        return all(point < width for point in self)
class FullAdder(Elaboratable):
    """Full Adder.

    Bitwise 3-input adder: every bit position is an independent full adder,
    so no carry propagates inside this module.

    :attribute in0: the first input
    :attribute in1: the second input
    :attribute in2: the third input
    :attribute sum: the sum output
    :attribute carry: the carry output
    """

    def __init__(self, width):
        """Create a ``FullAdder``.

        :param width: the bit width of the input and output
        """
        self.in0 = Signal(width)
        self.in1 = Signal(width)
        self.in2 = Signal(width)
        self.sum = Signal(width)
        self.carry = Signal(width)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        # per-bit sum is the XOR of the three inputs
        m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
        # per-bit carry is the majority function of the three inputs;
        # it is NOT shifted here - callers shift it into position
        m.d.comb += self.carry.eq((self.in0 & self.in1)
                                  | (self.in1 & self.in2)
                                  | (self.in2 & self.in0))
        return m
class PartitionedAdder(Elaboratable):
    """Partitioned Adder.

    :attribute width: the bit width of the input and output. Read-only.
    :attribute a: the first input to the adder
    :attribute b: the second input to the adder
    :attribute output: the sum output
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, width, partition_points):
        """Create a ``PartitionedAdder``.

        :param width: the bit width of the input and output
        :param partition_points: the input partition points
        :raises ValueError: if any partition point is >= ``width``.
        """
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.output = Signal(width)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(width):
            raise ValueError("partition_points doesn't fit in width")
        # one extra bit is inserted in the internal adder for every
        # partition point that lies inside the operand width
        expanded_width = 0
        for i in range(self.width):
            if i in self.partition_points:
                expanded_width += 1
            expanded_width += 1
        self._expanded_width = expanded_width

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        expanded_a = Signal(self._expanded_width)
        expanded_b = Signal(self._expanded_width)
        expanded_output = Signal(self._expanded_width)

        expanded_index = 0
        # store bits in a list, use Cat later.  graphviz is much cleaner
        al, bl, ol = [], [], []  # values driven from / onto the real ports
        ea, eb, eo = [], [], []  # matching bits of the expanded signals
        # partition points are "breaks" (extra zeros) in what would otherwise
        # be a massive long add.
        for i in range(self.width):
            if i in self.partition_points:
                # add extra bit set to 0 + 0 for enabled partition points
                # and 1 + 0 for disabled partition points
                ea.append(expanded_a[expanded_index])
                al.append(~self.partition_points[i])
                eb.append(expanded_b[expanded_index])
                bl.append(C(0))
                expanded_index += 1
            ea.append(expanded_a[expanded_index])
            al.append(self.a[i])
            eb.append(expanded_b[expanded_index])
            bl.append(self.b[i])
            eo.append(expanded_output[expanded_index])
            ol.append(self.output[i])
            expanded_index += 1
        # combine above using Cat
        m.d.comb += Cat(*ea).eq(Cat(*al))
        m.d.comb += Cat(*eb).eq(Cat(*bl))
        m.d.comb += Cat(*ol).eq(Cat(*eo))
        # use only one addition to take advantage of look-ahead carry and
        # special hardware on FPGAs
        m.d.comb += expanded_output.eq(expanded_a + expanded_b)
        return m
# Number of inputs consumed by one ``FullAdder`` (in0, in1, in2); used by
# ``AddReduce`` as the group size / stride when grouping terms.
FULL_ADDER_INPUT_COUNT = 3
class AddReduce(Elaboratable):
    """Add list of numbers together.

    Terms are reduced three-to-two with ``FullAdder``s, one layer per
    recursive ``AddReduce`` level, until at most two terms remain; those are
    summed by a single ``PartitionedAdder``.

    :attribute inputs: input ``Signal``s to be summed. Modification not
        supported, except for by ``Signal.eq``.
    :attribute register_levels: List of nesting levels that should have
        pipeline registers.
    :attribute output: output sum.
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, inputs, output_width, register_levels, partition_points):
        """Create an ``AddReduce``.

        :param inputs: input ``Signal``s to be summed.
        :param output_width: bit-width of ``output``.
        :param register_levels: List of nesting levels that should have
            pipeline registers.
        :param partition_points: the input partition points.
        :raises ValueError: if the partition points do not fit in
            ``output_width`` or a register level exceeds the adder depth.
        """
        self.inputs = list(inputs)
        self._resized_inputs = [
            Signal(output_width, name=f"resized_inputs[{i}]")
            for i in range(len(self.inputs))]
        self.register_levels = list(register_levels)
        self.output = Signal(output_width)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(output_width):
            raise ValueError("partition_points doesn't fit in output_width")
        # NOTE: must stay a direct assignment - ``like()`` derives the signal
        # base name from the variable being assigned here.
        self._reg_partition_points = self.partition_points.like()
        max_level = AddReduce.get_max_level(len(self.inputs))
        for level in self.register_levels:
            if level > max_level:
                raise ValueError(
                    "not enough adder levels for specified register levels")

    @staticmethod
    def get_max_level(input_count):
        """Get the maximum level.

        All ``register_levels`` must be less than or equal to the maximum
        level.

        :param input_count: number of terms fed into the top-level reducer.
        :returns: number of full-adder layers needed before no group of
            ``FULL_ADDER_INPUT_COUNT`` terms remains.
        """
        retval = 0
        while True:
            groups = AddReduce.full_adder_groups(input_count)
            if len(groups) == 0:
                return retval
            # each group of 3 becomes 2 terms; the remainder passes through
            input_count %= FULL_ADDER_INPUT_COUNT
            input_count += 2 * len(groups)
            retval += 1

    def next_register_levels(self):
        """``Iterable`` of ``register_levels`` for next recursive level."""
        for level in self.register_levels:
            if level > 0:
                yield level - 1

    @staticmethod
    def full_adder_groups(input_count):
        """Get ``inputs`` indices for which a full adder should be built."""
        return range(0,
                     input_count - FULL_ADDER_INPUT_COUNT + 1,
                     FULL_ADDER_INPUT_COUNT)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        # resize inputs to correct bit-width and optionally add in
        # pipeline registers
        resized_input_assignments = [
            resized.eq(original)
            for resized, original in zip(self._resized_inputs, self.inputs)]
        if 0 in self.register_levels:
            m.d.sync += resized_input_assignments
            m.d.sync += self._reg_partition_points.eq(self.partition_points)
        else:
            m.d.comb += resized_input_assignments
            m.d.comb += self._reg_partition_points.eq(self.partition_points)

        groups = AddReduce.full_adder_groups(len(self.inputs))
        # if there are no full adders to create, then we handle the base cases
        # and return, otherwise we go on to the recursive case
        if len(groups) == 0:
            if len(self.inputs) == 0:
                # use 0 as the default output value
                m.d.comb += self.output.eq(0)
            elif len(self.inputs) == 1:
                # handle single input
                m.d.comb += self.output.eq(self._resized_inputs[0])
            else:
                # base case for adding 2 or more inputs, which get recursively
                # reduced to 2 inputs
                assert len(self.inputs) == 2
                adder = PartitionedAdder(len(self.output),
                                         self._reg_partition_points)
                m.submodules.final_adder = adder
                m.d.comb += adder.a.eq(self._resized_inputs[0])
                m.d.comb += adder.b.eq(self._resized_inputs[1])
                m.d.comb += self.output.eq(adder.output)
            return m

        # go on to handle recursive case
        intermediate_terms = []

        def add_intermediate_term(value):
            intermediate_term = Signal(
                len(self.output),
                name=f"intermediate_terms[{len(intermediate_terms)}]")
            intermediate_terms.append(intermediate_term)
            m.d.comb += intermediate_term.eq(value)

        # store mask in intermediary (simplifies graph)
        part_mask = Signal(len(self.output), reset_less=True)
        mask = self._reg_partition_points.as_mask(len(self.output))
        m.d.comb += part_mask.eq(mask)

        # create full adders for this recursive level.
        # this shrinks N terms to 2 * (N // 3) plus the remainder
        for i in groups:
            adder_i = FullAdder(len(self.output))
            setattr(m.submodules, f"adder_{i}", adder_i)
            m.d.comb += adder_i.in0.eq(self._resized_inputs[i])
            m.d.comb += adder_i.in1.eq(self._resized_inputs[i + 1])
            m.d.comb += adder_i.in2.eq(self._resized_inputs[i + 2])
            add_intermediate_term(adder_i.sum)
            shifted_carry = adder_i.carry << 1
            # mask out carry bits to prevent carries between partitions
            add_intermediate_term(shifted_carry & part_mask)
        # handle the remaining inputs.
        if len(self.inputs) % FULL_ADDER_INPUT_COUNT == 1:
            add_intermediate_term(self._resized_inputs[-1])
        elif len(self.inputs) % FULL_ADDER_INPUT_COUNT == 2:
            # Just pass the terms to the next layer, since we wouldn't gain
            # anything by using a half adder since there would still be 2 terms
            # and just passing the terms to the next layer saves gates.
            add_intermediate_term(self._resized_inputs[-2])
            add_intermediate_term(self._resized_inputs[-1])
        else:
            assert len(self.inputs) % FULL_ADDER_INPUT_COUNT == 0
        # recursive invocation of ``AddReduce``
        next_level = AddReduce(intermediate_terms,
                               len(self.output),
                               self.next_register_levels(),
                               self._reg_partition_points)
        m.submodules.next_level = next_level
        m.d.comb += self.output.eq(next_level.output)
        return m
# 2-bit operation codes selected per byte through ``part_ops``.
OP_MUL_LOW = 0  # low half of the product
OP_MUL_SIGNED_HIGH = 1  # high half, a and b both signed
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3  # high half, a and b both unsigned
def get_term(value, shift=0, enabled=None):
    """Build one addend: optionally gate ``value`` and shift it left.

    :param value: the value to turn into a term.
    :param shift: number of zero bits to place below ``value`` (must be
        non-negative). ``0`` leaves the value unshifted.
    :param enabled: optional enable; when given, the term is ``value`` if
        ``enabled`` is asserted and ``0`` otherwise. ``None`` means the term
        is always ``value``.
    :returns: the gated and shifted value; ``value`` itself when neither
        gating nor shifting is requested.
    :raises ValueError: if ``shift`` is negative.
    """
    if shift < 0:
        raise ValueError("shift must be a non-negative integer")
    if enabled is not None:
        value = Mux(enabled, value, 0)
    if shift > 0:
        # prepend ``shift`` constant-zero bits at the LSB end
        value = Cat(Repl(C(0, 1), shift), value)
    return value
class Term(Elaboratable):
    """Gate and shift an input into a wider term (see ``get_term``).

    :attribute ti: the term input, ``width`` bits.
    :attribute term: the term output, ``twidth`` bits.
    :attribute enabled: optional enable passed to ``get_term``, or ``None``.
    """

    def __init__(self, width, twidth, shift=0, enabled=None):
        """Create a ``Term``.

        :param width: bit width of the input ``ti``.
        :param twidth: bit width of the output ``term``.
        :param shift: left-shift (in bits) applied to ``ti``.
        :param enabled: optional enable for the term.
        """
        self.width = width
        self.shift = shift
        self.enabled = enabled
        self.ti = Signal(width, reset_less=True)
        self.term = Signal(twidth, reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))

        return m
class ProductTerm(Elaboratable):
    """One partial product: slice ``a_index`` of ``a`` times slice
    ``b_index`` of ``b``, gated and shifted into a ``twidth``-bit term.

    The ``Term`` initialiser and elaboration are reused by calling them
    directly on this instance.

    :attribute a: first multiplicand, ``twidth//2`` bits.
    :attribute b: second multiplicand, ``twidth//2`` bits.
    :attribute pb_en: partition-boundary enable bits, ``pbwid`` bits.
    :attribute term: the resulting term (created by ``Term.__init__``).
    """

    def __init__(self, width, twidth, pbwid, a_index, b_index):
        """Create a ``ProductTerm``.

        :param width: bit width of each slice taken from ``a`` and ``b``.
        :param twidth: bit width of the output term.
        :param pbwid: bit width of ``pb_en``.
        :param a_index: index of the slice of ``a``.
        :param b_index: index of the slice of ``b``.
        """
        self.a_index = a_index
        self.b_index = b_index
        shift = 8 * (self.a_index + self.b_index)
        self.pwidth = width
        self.a = Signal(twidth//2, reset_less=True)
        self.b = Signal(twidth//2, reset_less=True)
        self.pb_en = Signal(pbwid, reset_less=True)

        # boundary bits lying between the two slice indices; the term is
        # only enabled when none of them is set (see ``elaborate``)
        low, high = sorted((self.a_index, self.b_index))
        self.tl = tl = [self.pb_en[i] for i in range(low, high)]
        name = "te_%d_%d" % (self.a_index, self.b_index)
        if len(tl) > 0:
            term_enabled = Signal(name=name, reset_less=True)
        else:
            # same index for a and b: no boundary in between, always enabled
            term_enabled = None

        Term.__init__(self, width*2, twidth, shift, term_enabled)
        self.term.name = "term_%d_%d" % (a_index, b_index)  # rename

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Term.elaborate(self, platform)
        if self.enabled is not None:
            m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))

        # ``self.width`` was set by ``Term.__init__`` to twice the slice width
        bsa = Signal(self.width, reset_less=True)
        bsb = Signal(self.width, reset_less=True)
        a_index, b_index = self.a_index, self.b_index
        pwidth = self.pwidth
        m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
        m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
        m.d.comb += self.ti.eq(bsa * bsb)

        return m
class ProductTerms(Elaboratable):
    """A row of ``ProductTerm``s: one slice of ``a`` against every slice of
    ``b``.

    :attribute a: first multiplicand, ``twidth//2`` bits.
    :attribute b: second multiplicand, ``twidth//2`` bits.
    :attribute pb_en: partition-boundary enable bits, ``pbwid`` bits.
    :attribute terms: list of ``blen`` output terms, each ``twidth`` bits.
    """

    def __init__(self, width, twidth, pbwid, a_index, blen):
        """Create a ``ProductTerms``.

        :param width: bit width of each slice of ``a`` and ``b``.
        :param twidth: bit width of each output term.
        :param pbwid: bit width of ``pb_en``.
        :param a_index: index of the slice of ``a`` used by every term.
        :param blen: number of slices of ``b`` (and of output terms).
        """
        self.a_index = a_index
        self.blen = blen
        self.pwidth = width
        self.twidth = twidth
        self.pbwid = pbwid
        self.a = Signal(twidth//2, reset_less=True)
        self.b = Signal(twidth//2, reset_less=True)
        self.pb_en = Signal(pbwid, reset_less=True)
        self.terms = [Signal(twidth, name="term%d" % i, reset_less=True)
                      for i in range(blen)]

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        for b_index in range(self.blen):
            t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
                            self.a_index, b_index)
            setattr(m.submodules, "term_%d" % b_index, t)

            m.d.comb += t.a.eq(self.a)
            m.d.comb += t.b.eq(self.b)
            m.d.comb += t.pb_en.eq(self.pb_en)

            m.d.comb += self.terms[b_index].eq(t.term)

        return m
class Part(Elaboratable):
    """Per-partition-size helper.

    For one partition size (``n_parts`` equal partitions of the operands)
    this works out which partitions are active and produces the four
    correction terms (``not_*`` / ``neg_lsb_*``) for signed operands.

    :attribute a: first operand.
    :attribute b: second operand.
    :attribute a_signed: per-byte "a is signed" flags.
    :attribute b_signed: per-byte "b is signed" flags.
    :attribute pbs: partition-boundary bits.
    :attribute parts: per-partition "this partition is active" flags.
    :attribute delayed_parts: ``parts`` delayed by 0..n_levels-1 clocks.
    :attribute dplast: copy of the last entry of ``delayed_parts``.
    :attribute not_a_term: bitwise-not-of-``a`` correction term.
    :attribute neg_lsb_a_term: the "+1" companion of ``not_a_term``.
    :attribute not_b_term: bitwise-not-of-``b`` correction term.
    :attribute neg_lsb_b_term: the "+1" companion of ``not_b_term``.
    """

    def __init__(self, width, n_parts, n_levels, pbwid):
        """Create a ``Part``.

        :param width: bit width of the four output correction terms.
        :param n_parts: number of partitions the operands are split into.
        :param n_levels: number of delay stages kept in ``delayed_parts``.
        :param pbwid: bit width of ``pbs``.
        """
        # inputs
        # NOTE(review): operand width 64 matches the enclosing multiplier
        # and the highest bit index read in ``elaborate`` - confirm.
        self.a = Signal(64)
        self.b = Signal(64)
        self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
        self.b_signed = [Signal(name=f"_b_signed_{i}") for i in range(8)]
        self.pbs = Signal(pbwid, reset_less=True)

        # outputs
        self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
        self.delayed_parts = [
            [Signal(name=f"delayed_part_{delay}_{i}")
             for i in range(n_parts)]
            for delay in range(n_levels)]
        # XXX REALLY WEIRD BUG - have to take a copy of the last delayed_parts
        self.dplast = [Signal(name=f"dplast_{i}")
                       for i in range(n_parts)]

        self.not_a_term = Signal(width)
        self.neg_lsb_a_term = Signal(width)
        self.not_b_term = Signal(width)
        self.neg_lsb_b_term = Signal(width)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        pbs, parts, delayed_parts = self.pbs, self.parts, self.delayed_parts
        # number of bytes in one partition, and the same in bits
        byte_width = 8 // len(parts)
        bit_width = 8 * byte_width

        for i in range(len(parts)):
            # a partition is active when the boundary bits on both of its
            # sides are set and every boundary bit inside it is clear.
            # for i == 0 the lower index is -1, i.e. the top bit of pbs.
            pbl = [~pbs[i * byte_width - 1]]
            for j in range(i * byte_width, (i + 1) * byte_width - 1):
                pbl.append(pbs[j])
            pbl.append(~pbs[(i + 1) * byte_width - 1])
            value = Signal(len(pbl), reset_less=True)
            m.d.comb += value.eq(Cat(*pbl))
            m.d.comb += parts[i].eq(~(value).bool())
            m.d.comb += delayed_parts[0][i].eq(parts[i])
            m.d.sync += [delayed_parts[j + 1][i].eq(delayed_parts[j][i])
                         for j in range(len(delayed_parts)-1)]
            m.d.comb += self.dplast[i].eq(delayed_parts[-1][i])

        nat, nbt, nla, nlb = [], [], [], []
        for i in range(len(parts)):
            # the correction for one operand is enabled by the sign of the
            # OTHER operand: ``a_enabled`` is driven from b's sign bit and
            # ``b_enabled`` from a's sign bit.
            be = parts[i] & self.a[(i + 1) * bit_width - 1] \
                & self.a_signed[i * byte_width]
            ae = parts[i] & self.b[(i + 1) * bit_width - 1] \
                & self.b_signed[i * byte_width]
            a_enabled = Signal(name="a_en_%d" % i, reset_less=True)
            b_enabled = Signal(name="b_en_%d" % i, reset_less=True)
            m.d.comb += a_enabled.eq(ae)
            m.d.comb += b_enabled.eq(be)

            # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
            # negation operation is split into a bitwise not and a +1.
            # likewise for 16, 32, and 64-bit values.
            nat.append(Mux(a_enabled,
                           Cat(Repl(0, bit_width),
                               ~self.a.bit_select(bit_width * i, bit_width)),
                           0))

            nla.append(Cat(Repl(0, bit_width), a_enabled,
                           Repl(0, bit_width-1)))

            nbt.append(Mux(b_enabled,
                           Cat(Repl(0, bit_width),
                               ~self.b.bit_select(bit_width * i, bit_width)),
                           0))

            nlb.append(Cat(Repl(0, bit_width), b_enabled,
                           Repl(0, bit_width-1)))

        m.d.comb += [self.not_a_term.eq(Cat(*nat)),
                     self.not_b_term.eq(Cat(*nbt)),
                     self.neg_lsb_a_term.eq(Cat(*nla)),
                     self.neg_lsb_b_term.eq(Cat(*nlb)),
                     ]

        return m
class IntermediateOut(Elaboratable):
    """Select the low or high half of each partition's double-width product.

    :attribute delayed_part_ops: eight 2-bit operation codes, one per byte.
    :attribute intermed: the double-width intermediate result.
    :attribute output: the selected halves, concatenated; half the width of
        ``intermed``.
    """

    def __init__(self, width, out_wid, n_parts):
        """Create an ``IntermediateOut``.

        :param width: bit width of one partition of the output.
        :param out_wid: bit width of ``intermed``.
        :param n_parts: number of partitions.
        """
        self.width = width
        self.n_parts = n_parts
        self.delayed_part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
                                 for i in range(8)]
        self.intermed = Signal(out_wid, reset_less=True)
        self.output = Signal(out_wid//2, reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        ol = []
        w = self.width
        # the op code of a partition is read from its first byte
        sel = w // 8
        for i in range(self.n_parts):
            op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
            # OP_MUL_LOW picks the low ``w`` bits of this partition's
            # 2*w-bit product; every other op picks the high ``w`` bits
            m.d.comb += op.eq(
                Mux(self.delayed_part_ops[sel * i] == OP_MUL_LOW,
                    self.intermed.bit_select(i * w*2, w),
                    self.intermed.bit_select(i * w*2 + w, w)))
            ol.append(op)
        m.d.comb += self.output.eq(Cat(*ol))

        return m
class FinalOut(Elaboratable):
    """Byte-wise selection of the final result from the per-size results.

    :attribute d8: eight select flags, one per byte, for the ``i8`` result.
    :attribute d16: four select flags, one per byte pair, for ``i16``.
    :attribute d32: two select flags, one per four bytes, for ``i32``.
    :attribute i8: result computed for 8-bit partitions.
    :attribute i16: result computed for 16-bit partitions.
    :attribute i32: result computed for 32-bit partitions.
    :attribute i64: result computed for a single 64-bit partition.
    :attribute out: the assembled output.
    """

    def __init__(self, out_wid):
        """Create a ``FinalOut``.

        :param out_wid: bit width of the four inputs and of ``out``.
        """
        # inputs
        self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
        self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
        self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]

        self.i8 = Signal(out_wid, reset_less=True)
        self.i16 = Signal(out_wid, reset_less=True)
        self.i32 = Signal(out_wid, reset_less=True)
        self.i64 = Signal(out_wid, reset_less=True)

        # output
        self.out = Signal(out_wid, reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        ol = []
        for i in range(8):
            op = Signal(8, reset_less=True, name="op_%d" % i)
            # priority per byte: d8, then d16, then d32, otherwise i64
            m.d.comb += op.eq(
                Mux(self.d8[i] | self.d16[i // 2],
                    Mux(self.d8[i], self.i8.bit_select(i * 8, 8),
                        self.i16.bit_select(i * 8, 8)),
                    Mux(self.d32[i // 4], self.i32.bit_select(i * 8, 8),
                        self.i64.bit_select(i * 8, 8))))
            ol.append(op)
        m.d.comb += self.out.eq(Cat(*ol))
        return m
class OrMod(Elaboratable):
    """Bitwise OR of four equal-width inputs.

    :attribute orin: list of the four inputs.
    :attribute orout: the OR of all four inputs.
    """

    def __init__(self, wid):
        """Create an ``OrMod``.

        :param wid: bit width of every input and of the output.
        """
        self.wid = wid
        self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
                     for i in range(4)]
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        # two-level OR tree: (0 | 1) | (2 | 3)
        or1 = Signal(self.wid, reset_less=True)
        or2 = Signal(self.wid, reset_less=True)
        m.d.comb += or1.eq(self.orin[0] | self.orin[1])
        m.d.comb += or2.eq(self.orin[2] | self.orin[3])
        m.d.comb += self.orout.eq(or1 | or2)

        return m
class Signs(Elaboratable):
    """Decode an operation code into operand signedness.

    :attribute part_ops: 2-bit operation code (one of the ``OP_MUL_*``).
    :attribute a_signed: asserted when operand ``a`` is treated as signed.
    :attribute b_signed: asserted when operand ``b`` is treated as signed.
    """

    def __init__(self):
        """Create a ``Signs``."""
        self.part_ops = Signal(2, reset_less=True)
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        # a is signed for every op except the unsigned-high multiply
        asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
        # b is signed only for the low and the signed-high multiplies
        bsig = (self.part_ops == OP_MUL_LOW) \
            | (self.part_ops == OP_MUL_SIGNED_HIGH)
        m.d.comb += self.a_signed.eq(asig)
        m.d.comb += self.b_signed.eq(bsig)

        return m
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Supports partitioning into any combination of 8, 16, 32, and 64-bit
    partitions on naturally-aligned boundaries. Supports the operation being
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    :attribute a: the first 64-bit operand.
    :attribute b: the second 64-bit operand.
    :attribute output: the 64-bit result.
    """

    def __init__(self, register_levels=()):
        """Create a ``Mul8_16_32_64``.

        :param register_levels: adder-tree levels that get pipeline
            registers (passed on to ``AddReduce``).
        """
        # parameter(s)
        self.register_levels = list(register_levels)

        # inputs
        self.part_pts = PartitionPoints()
        for i in range(8, 64, 8):
            self.part_pts[i] = Signal(name=f"part_pts_{i}")
        self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
        self.a = Signal(64)
        self.b = Signal(64)

        # intermediates (needed for unit tests)
        self._intermediate_output = Signal(128)

        # output
        self.output = Signal(64)

    def _part_byte(self, index):
        """Return the partition-point enable at the top of byte ``index``.

        Indices ``-1`` and ``7`` are the outer edges of the 64-bit word and
        are always boundaries.
        """
        if index == -1 or index == 7:
            return C(True, 1)
        assert index >= 0 and index < 8
        return self.part_pts[index * 8 + 8]

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        # collect part-bytes: one boundary bit per byte
        pbs = Signal(8, reset_less=True)
        tl = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            m.d.comb += pb.eq(self._part_byte(i))
            tl.append(pb)
        m.d.comb += pbs.eq(Cat(*tl))

        # per-byte signedness decoders
        signs = []
        for i in range(8):
            s = Signs()
            signs.append(s)
            setattr(m.submodules, "signs%d" % i, s)
            m.d.comb += s.part_ops.eq(self.part_ops[i])

        # delay the op codes by as many clocks as the adder tree takes
        delayed_part_ops = [
            [Signal(2, name=f"_delayed_part_ops_{delay}_{i}")
             for i in range(8)]
            for delay in range(1 + len(self.register_levels))]
        for i in range(len(self.part_ops)):
            m.d.comb += delayed_part_ops[0][i].eq(self.part_ops[i])
            m.d.sync += [delayed_part_ops[j + 1][i].eq(delayed_part_ops[j][i])
                         for j in range(len(self.register_levels))]

        n_levels = len(self.register_levels)+1
        m.submodules.part_8 = part_8 = Part(128, 8, n_levels, 8)
        m.submodules.part_16 = part_16 = Part(128, 4, n_levels, 8)
        m.submodules.part_32 = part_32 = Part(128, 2, n_levels, 8)
        m.submodules.part_64 = part_64 = Part(128, 1, n_levels, 8)
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for mod in [part_8, part_16, part_32, part_64]:
            m.d.comb += mod.a.eq(self.a)
            m.d.comb += mod.b.eq(self.b)
            for i in range(len(signs)):
                m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
                m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
            m.d.comb += mod.pbs.eq(pbs)
            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # all 8x8 byte partial products
        terms = []

        for a_index in range(8):
            t = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, t)

            m.d.comb += t.a.eq(self.a)
            m.d.comb += t.b.eq(self.b)
            m.d.comb += t.pb_en.eq(pbs)

            for term in t.terms:
                terms.append(term)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        for l, mod in [(nat_l, nat_or),
                       (nbt_l, nbt_or),
                       (nla_l, nla_or),
                       (nlb_l, nlb_or)]:
            for i in range(len(l)):
                m.d.comb += mod.orin[i].eq(l[i])
            terms.append(mod.orout)

        # the adder tree works on the 128-bit product, so every partition
        # point sits at twice its operand bit position
        expanded_part_pts = PartitionPoints()
        for i, v in self.part_pts.items():
            signal = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
            expanded_part_pts[i * 2] = signal
            m.d.comb += signal.eq(v)

        add_reduce = AddReduce(terms,
                               128,
                               self.register_levels,
                               expanded_part_pts)
        m.submodules.add_reduce = add_reduce
        m.d.comb += self._intermediate_output.eq(add_reduce.output)

        # 64-bit result selection
        m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
        m.d.comb += io64.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io64.delayed_part_ops[i].eq(delayed_part_ops[-1][i])

        # 32-bit result selection
        m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
        m.d.comb += io32.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io32.delayed_part_ops[i].eq(delayed_part_ops[-1][i])

        # 16-bit result selection
        m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
        m.d.comb += io16.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io16.delayed_part_ops[i].eq(delayed_part_ops[-1][i])

        # 8-bit result selection
        m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
        m.d.comb += io8.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io8.delayed_part_ops[i].eq(delayed_part_ops[-1][i])

        # final output
        m.submodules.fo = fo = FinalOut(64)
        for i in range(len(part_8.delayed_parts[-1])):
            m.d.comb += fo.d8[i].eq(part_8.dplast[i])
        for i in range(len(part_16.delayed_parts[-1])):
            m.d.comb += fo.d16[i].eq(part_16.dplast[i])
        for i in range(len(part_32.delayed_parts[-1])):
            m.d.comb += fo.d32[i].eq(part_32.dplast[i])
        m.d.comb += fo.i8.eq(io8.output)
        m.d.comb += fo.i16.eq(io16.output)
        m.d.comb += fo.i32.eq(io32.output)
        m.d.comb += fo.i64.eq(io64.output)
        m.d.comb += self.output.eq(fo.out)

        return m
if __name__ == "__main__":
    # build a multiplier with no pipeline registers and hand it to the
    # nmigen command-line driver, exposing every input and output as a port
    m = Mul8_16_32_64()
    main(m, ports=[m.a,
                   m.b,
                   m._intermediate_output,
                   m.output,
                   *m.part_ops,
                   *m.part_pts.values()])