src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width, mul=1):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         :param mul: a "multiplier" which in-place expands the partition points
  82                     typically set to "2" when used for multipliers
  83         """
  84         bits = []
  85         for i in range(width):
  86             i /= mul
  87             if i.is_integer() and int(i) in self:
  88                 bits.append(~self[i])
  89             else:
  90                 bits.append(True)
  91         return Cat(*bits)
  92
  93     def get_max_partition_count(self, width):
  94         """Get the maximum number of partitions.
  95
  96         Gets the number of partitions when all partition points are enabled.
  97         """
  98         retval = 1
  99         for point in self.keys():
 100             if point < width:
 101                 retval += 1
 102         return retval
 103
 104     def fits_in_width(self, width):
 105         """Check if all partition points are smaller than `width`."""
 106         for point in self.keys():
 107             if point >= width:
 108                 return False
 109         return True
 110
 111     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 112         if index == -1 or index == 7:
 113             return C(True, 1)
 114         assert index >= 0 and index < 8
 115         return self[(index * 8 + 8)*mfactor]
 116
 117
 118 class FullAdder(Elaboratable):
 119     """Full Adder.
 120
 121     :attribute in0: the first input
 122     :attribute in1: the second input
 123     :attribute in2: the third input
 124     :attribute sum: the sum output
 125     :attribute carry: the carry output
 126
 127     Rather than do individual full adders (and have an array of them,
 128     which would be very slow to simulate), this module can specify the
 129     bit width of the inputs and outputs: in effect it performs multiple
 130     Full 3-2 Add operations "in parallel".
 131     """
 132
 133     def __init__(self, width):
 134         """Create a ``FullAdder``.
 135
 136         :param width: the bit width of the input and output
 137         """
 138         self.in0 = Signal(width, reset_less=True)
 139         self.in1 = Signal(width, reset_less=True)
 140         self.in2 = Signal(width, reset_less=True)
 141         self.sum = Signal(width, reset_less=True)
 142         self.carry = Signal(width, reset_less=True)
 143
 144     def elaborate(self, platform):
 145         """Elaborate this module."""
 146         m = Module()
 147         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 148         m.d.comb += self.carry.eq((self.in0 & self.in1)
 149                                   | (self.in1 & self.in2)
 150                                   | (self.in2 & self.in0))
 151         return m
 152
 153
 154 class MaskedFullAdder(Elaboratable):
 155     """Masked Full Adder.
 156
 157     :attribute mask: the carry partition mask
 158     :attribute in0: the first input
 159     :attribute in1: the second input
 160     :attribute in2: the third input
 161     :attribute sum: the sum output
 162     :attribute mcarry: the masked carry output
 163
 164     FullAdders are always used with a "mask" on the output.  To keep
 165     the graphviz "clean", this class performs the masking here rather
 166     than inside a large for-loop.
 167
 168     See the following discussion as to why this is no longer derived
 169     from FullAdder.  Each carry is shifted here *before* being ANDed
 170     with the mask, so that an AOI cell may be used (which is more
 171     gate-efficient)
 172     https://en.wikipedia.org/wiki/AND-OR-Invert
 173     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 174     """
 175
 176     def __init__(self, width):
 177         """Create a ``MaskedFullAdder``.
 178
 179         :param width: the bit width of the input and output
 180         """
 181         self.width = width
 182         self.mask = Signal(width, reset_less=True)
 183         self.mcarry = Signal(width, reset_less=True)
 184         self.in0 = Signal(width, reset_less=True)
 185         self.in1 = Signal(width, reset_less=True)
 186         self.in2 = Signal(width, reset_less=True)
 187         self.sum = Signal(width, reset_less=True)
 188
 189     def elaborate(self, platform):
 190         """Elaborate this module."""
 191         m = Module()
 192         s1 = Signal(self.width, reset_less=True)
 193         s2 = Signal(self.width, reset_less=True)
 194         s3 = Signal(self.width, reset_less=True)
 195         c1 = Signal(self.width, reset_less=True)
 196         c2 = Signal(self.width, reset_less=True)
 197         c3 = Signal(self.width, reset_less=True)
 198         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 199         m.d.comb += s1.eq(Cat(0, self.in0))
 200         m.d.comb += s2.eq(Cat(0, self.in1))
 201         m.d.comb += s3.eq(Cat(0, self.in2))
 202         m.d.comb += c1.eq(s1 & s2 & self.mask)
 203         m.d.comb += c2.eq(s2 & s3 & self.mask)
 204         m.d.comb += c3.eq(s3 & s1 & self.mask)
 205         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 206         return m
 207
 208
 209 class PartitionedAdder(Elaboratable):
 210     """Partitioned Adder.
 211
 212     Performs the final add.  The partition points are included in the
 213     actual add (in one of the operands only), which causes a carry over
 214     to the next bit.  Then the final output *removes* the extra bits from
 215     the result.
 216
 217     partition: .... P... P... P... P... (32 bits)
 218     a        : .... .... .... .... .... (32 bits)
 219     b        : .... .... .... .... .... (32 bits)
 220     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 221     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 222     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 223     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 224
 225     :attribute width: the bit width of the input and output. Read-only.
 226     :attribute a: the first input to the adder
 227     :attribute b: the second input to the adder
 228     :attribute output: the sum output
 229     :attribute partition_points: the input partition points. Modification not
 230         supported, except for by ``Signal.eq``.
 231     """
 232
 233     def __init__(self, width, partition_points, partition_step=1):
 234         """Create a ``PartitionedAdder``.
 235
 236         :param width: the bit width of the input and output
 237         :param partition_points: the input partition points
 238         :param partition_step: a multiplier (typically double) step
 239                                which in-place "expands" the partition points
 240         """
 241         self.width = width
 242         self.pmul = partition_step
 243         self.a = Signal(width, reset_less=True)
 244         self.b = Signal(width, reset_less=True)
 245         self.output = Signal(width, reset_less=True)
 246         self.partition_points = PartitionPoints(partition_points)
 247         if not self.partition_points.fits_in_width(width):
 248             raise ValueError("partition_points doesn't fit in width")
 249         expanded_width = 0
 250         for i in range(self.width):
 251             if i in self.partition_points:
 252                 expanded_width += 1
 253             expanded_width += 1
 254         self._expanded_width = expanded_width
 255
 256     def elaborate(self, platform):
 257         """Elaborate this module."""
 258         m = Module()
 259         expanded_a = Signal(self._expanded_width, reset_less=True)
 260         expanded_b = Signal(self._expanded_width, reset_less=True)
 261         expanded_o = Signal(self._expanded_width, reset_less=True)
 262
 263         expanded_index = 0
 264         # store bits in a list, use Cat later.  graphviz is much cleaner
 265         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 266
 267         # partition points are "breaks" (extra zeros or 1s) in what would
 268         # otherwise be a massive long add.  when the "break" points are 0,
 269         # whatever is in it (in the output) is discarded.  however when
 270         # there is a "1", it causes a roll-over carry to the *next* bit.
 271         # we still ignore the "break" bit in the [intermediate] output,
 272         # however by that time we've got the effect that we wanted: the
 273         # carry has been carried *over* the break point.
 274
 275         for i in range(self.width):
 276             pi = i/self.pmul # double the range of the partition point test
 277             if pi.is_integer() and pi in self.partition_points:
 278                 # add extra bit set to 0 + 0 for enabled partition points
 279                 # and 1 + 0 for disabled partition points
 280                 ea.append(expanded_a[expanded_index])
 281                 al.append(~self.partition_points[pi]) # add extra bit in a
 282                 eb.append(expanded_b[expanded_index])
 283                 bl.append(C(0)) # yes, add a zero
 284                 expanded_index += 1 # skip the extra point.  NOT in the output
 285             ea.append(expanded_a[expanded_index])
 286             eb.append(expanded_b[expanded_index])
 287             eo.append(expanded_o[expanded_index])
 288             al.append(self.a[i])
 289             bl.append(self.b[i])
 290             ol.append(self.output[i])
 291             expanded_index += 1
 292
 293         # combine above using Cat
 294         m.d.comb += Cat(*ea).eq(Cat(*al))
 295         m.d.comb += Cat(*eb).eq(Cat(*bl))
 296         m.d.comb += Cat(*ol).eq(Cat(*eo))
 297
 298         # use only one addition to take advantage of look-ahead carry and
 299         # special hardware on FPGAs
 300         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 301         return m
 302
 303
# Number of input terms consumed by one full adder.  The add-reduce code
# below groups its terms in runs of this size and replaces each run with
# two terms (sum and masked carry).
FULL_ADDER_INPUT_COUNT = 3
 305
 306 class AddReduceData:
 307
 308     def __init__(self, part_pts, n_inputs, output_width, n_parts):
 309         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 310                           for i in range(n_parts)]
 311         self.terms = [Signal(output_width, name=f"inputs_{i}",
 312                               reset_less=True)
 313                         for i in range(n_inputs)]
 314         self.part_pts = part_pts.like()
 315
 316     def eq_from(self, part_pts, inputs, part_ops):
 317         return [self.part_pts.eq(part_pts)] + \
 318                [self.terms[i].eq(inputs[i])
 319                                      for i in range(len(self.terms))] + \
 320                [self.part_ops[i].eq(part_ops[i])
 321                                      for i in range(len(self.part_ops))]
 322
 323     def eq(self, rhs):
 324         return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
 325
 326
 327 class FinalReduceData:
 328
 329     def __init__(self, part_pts, output_width, n_parts):
 330         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 331                           for i in range(n_parts)]
 332         self.output = Signal(output_width, reset_less=True)
 333         self.part_pts = part_pts.like()
 334
 335     def eq_from(self, part_pts, output, part_ops):
 336         return [self.part_pts.eq(part_pts)] + \
 337                [self.output.eq(output)] + \
 338                [self.part_ops[i].eq(part_ops[i])
 339                                      for i in range(len(self.part_ops))]
 340
 341     def eq(self, rhs):
 342         return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
 343
 344
 345 class FinalAdd(Elaboratable):
 346     """ Final stage of add reduce
 347     """
 348
 349     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 350         self.output_width = output_width
 351         self.n_inputs = n_inputs
 352         self.n_parts = n_parts
 353         self.partition_points = PartitionPoints(partition_points)
 354         if not self.partition_points.fits_in_width(output_width):
 355             raise ValueError("partition_points doesn't fit in output_width")
 356
 357         self.i = self.ispec()
 358         self.o = self.ospec()
 359
 360     def ispec(self):
 361         return AddReduceData(self.partition_points, self.n_inputs,
 362                              self.output_width, self.n_parts)
 363
 364     def ospec(self):
 365         return FinalReduceData(self.partition_points,
 366                                  self.output_width, self.n_parts)
 367
 368     def elaborate(self, platform):
 369         """Elaborate this module."""
 370         m = Module()
 371
 372         output_width = self.output_width
 373         output = Signal(output_width, reset_less=True)
 374         if self.n_inputs == 0:
 375             # use 0 as the default output value
 376             m.d.comb += output.eq(0)
 377         elif self.n_inputs == 1:
 378             # handle single input
 379             m.d.comb += output.eq(self.i.terms[0])
 380         else:
 381             # base case for adding 2 inputs
 382             assert self.n_inputs == 2
 383             adder = PartitionedAdder(output_width,
 384                                      self.i.part_pts, 2)
 385             m.submodules.final_adder = adder
 386             m.d.comb += adder.a.eq(self.i.terms[0])
 387             m.d.comb += adder.b.eq(self.i.terms[1])
 388             m.d.comb += output.eq(adder.output)
 389
 390         # create output
 391         m.d.comb += self.o.eq_from(self.i.part_pts, output,
 392                                    self.i.part_ops)
 393
 394         return m
 395
 396
 397 class AddReduceSingle(Elaboratable):
 398     """Add list of numbers together.
 399
 400     :attribute inputs: input ``Signal``s to be summed. Modification not
 401         supported, except for by ``Signal.eq``.
 402     :attribute register_levels: List of nesting levels that should have
 403         pipeline registers.
 404     :attribute output: output sum.
 405     :attribute partition_points: the input partition points. Modification not
 406         supported, except for by ``Signal.eq``.
 407     """
 408
 409     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 410         """Create an ``AddReduce``.
 411
 412         :param inputs: input ``Signal``s to be summed.
 413         :param output_width: bit-width of ``output``.
 414         :param partition_points: the input partition points.
 415         """
 416         self.n_inputs = n_inputs
 417         self.n_parts = n_parts
 418         self.output_width = output_width
 419         self.partition_points = PartitionPoints(partition_points)
 420         if not self.partition_points.fits_in_width(output_width):
 421             raise ValueError("partition_points doesn't fit in output_width")
 422
 423         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 424         self.n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
 425
 426         self.i = self.ispec()
 427         self.o = self.ospec()
 428
 429     def ispec(self):
 430         return AddReduceData(self.partition_points, self.n_inputs,
 431                              self.output_width, self.n_parts)
 432
 433     def ospec(self):
 434         return AddReduceData(self.partition_points, self.n_terms,
 435                              self.output_width, self.n_parts)
 436
 437     @staticmethod
 438     def calc_n_inputs(n_inputs, groups):
 439         retval = len(groups)*2
 440         if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 441             retval += 1
 442         elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 443             retval += 2
 444         else:
 445             assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
 446         return retval
 447
 448     @staticmethod
 449     def get_max_level(input_count):
 450         """Get the maximum level.
 451
 452         All ``register_levels`` must be less than or equal to the maximum
 453         level.
 454         """
 455         retval = 0
 456         while True:
 457             groups = AddReduceSingle.full_adder_groups(input_count)
 458             if len(groups) == 0:
 459                 return retval
 460             input_count %= FULL_ADDER_INPUT_COUNT
 461             input_count += 2 * len(groups)
 462             retval += 1
 463
 464     @staticmethod
 465     def full_adder_groups(input_count):
 466         """Get ``inputs`` indices for which a full adder should be built."""
 467         return range(0,
 468                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 469                      FULL_ADDER_INPUT_COUNT)
 470
 471     def create_next_terms(self):
 472         """ create next intermediate terms, for linking up in elaborate, below
 473         """
 474         terms = []
 475         adders = []
 476
 477         # create full adders for this recursive level.
 478         # this shrinks N terms to 2 * (N // 3) plus the remainder
 479         for i in self.groups:
 480             adder_i = MaskedFullAdder(self.output_width)
 481             adders.append((i, adder_i))
 482             # add both the sum and the masked-carry to the next level.
 483             # 3 inputs have now been reduced to 2...
 484             terms.append(adder_i.sum)
 485             terms.append(adder_i.mcarry)
 486         # handle the remaining inputs.
 487         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 488             terms.append(self.i.terms[-1])
 489         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 490             # Just pass the terms to the next layer, since we wouldn't gain
 491             # anything by using a half adder since there would still be 2 terms
 492             # and just passing the terms to the next layer saves gates.
 493             terms.append(self.i.terms[-2])
 494             terms.append(self.i.terms[-1])
 495         else:
 496             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 497
 498         return terms, adders
 499
 500     def elaborate(self, platform):
 501         """Elaborate this module."""
 502         m = Module()
 503
 504         terms, adders = self.create_next_terms()
 505
 506         # copy the intermediate terms to the output
 507         for i, value in enumerate(terms):
 508             m.d.comb += self.o.terms[i].eq(value)
 509
 510         # copy reg part points and part ops to output
 511         m.d.comb += self.o.part_pts.eq(self.i.part_pts)
 512         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 513                                      for i in range(len(self.i.part_ops))]
 514
 515         # set up the partition mask (for the adders)
 516         part_mask = Signal(self.output_width, reset_less=True)
 517
 518         # get partition points as a mask
 519         mask = self.i.part_pts.as_mask(self.output_width, mul=2)
 520         m.d.comb += part_mask.eq(mask)
 521
 522         # add and link the intermediate term modules
 523         for i, (iidx, adder_i) in enumerate(adders):
 524             setattr(m.submodules, f"adder_{i}", adder_i)
 525
 526             m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
 527             m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
 528             m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
 529             m.d.comb += adder_i.mask.eq(part_mask)
 530
 531         return m
 532
 533
 534 class AddReduceInternal:
 535     """Recursively Add list of numbers together.
 536
 537     :attribute inputs: input ``Signal``s to be summed. Modification not
 538         supported, except for by ``Signal.eq``.
 539     :attribute register_levels: List of nesting levels that should have
 540         pipeline registers.
 541     :attribute output: output sum.
 542     :attribute partition_points: the input partition points. Modification not
 543         supported, except for by ``Signal.eq``.
 544     """
 545
 546     def __init__(self, inputs, output_width, partition_points,
 547                        part_ops):
 548         """Create an ``AddReduce``.
 549
 550         :param inputs: input ``Signal``s to be summed.
 551         :param output_width: bit-width of ``output``.
 552         :param partition_points: the input partition points.
 553         """
 554         self.inputs = inputs
 555         self.part_ops = part_ops
 556         self.output_width = output_width
 557         self.partition_points = partition_points
 558
 559         self.create_levels()
 560
 561     def create_levels(self):
 562         """creates reduction levels"""
 563
 564         mods = []
 565         partition_points = self.partition_points
 566         part_ops = self.part_ops
 567         n_parts = len(part_ops)
 568         inputs = self.inputs
 569         ilen = len(inputs)
 570         while True:
 571             groups = AddReduceSingle.full_adder_groups(len(inputs))
 572             if len(groups) == 0:
 573                 break
 574             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 575                                          partition_points)
 576             mods.append(next_level)
 577             partition_points = next_level.i.part_pts
 578             inputs = next_level.o.terms
 579             ilen = len(inputs)
 580             part_ops = next_level.i.part_ops
 581
 582         next_level = FinalAdd(ilen, self.output_width, n_parts,
 583                               partition_points)
 584         mods.append(next_level)
 585
 586         self.levels = mods
 587
 588
 589 class AddReduce(AddReduceInternal, Elaboratable):
 590     """Recursively Add list of numbers together.
 591
 592     :attribute inputs: input ``Signal``s to be summed. Modification not
 593         supported, except for by ``Signal.eq``.
 594     :attribute register_levels: List of nesting levels that should have
 595         pipeline registers.
 596     :attribute output: output sum.
 597     :attribute partition_points: the input partition points. Modification not
 598         supported, except for by ``Signal.eq``.
 599     """
 600
 601     def __init__(self, inputs, output_width, register_levels, partition_points,
 602                        part_ops):
 603         """Create an ``AddReduce``.
 604
 605         :param inputs: input ``Signal``s to be summed.
 606         :param output_width: bit-width of ``output``.
 607         :param register_levels: List of nesting levels that should have
 608             pipeline registers.
 609         :param partition_points: the input partition points.
 610         """
 611         AddReduceInternal.__init__(self, inputs, output_width,
 612                                    partition_points, part_ops)
 613         n_parts = len(part_ops)
 614         self.o = FinalReduceData(partition_points, output_width, n_parts)
 615         self.register_levels = register_levels
 616
 617     @staticmethod
 618     def get_max_level(input_count):
 619         return AddReduceSingle.get_max_level(input_count)
 620
 621     @staticmethod
 622     def next_register_levels(register_levels):
 623         """``Iterable`` of ``register_levels`` for next recursive level."""
 624         for level in register_levels:
 625             if level > 0:
 626                 yield level - 1
 627
 628     def create_levels(self):
 629         """creates reduction levels"""
 630
 631         mods = []
 632         partition_points = self.partition_points
 633         part_ops = self.part_ops
 634         n_parts = len(part_ops)
 635         inputs = self.inputs
 636         ilen = len(inputs)
 637         while True:
 638             groups = AddReduceSingle.full_adder_groups(len(inputs))
 639             if len(groups) == 0:
 640                 break
 641             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 642                                          partition_points)
 643             mods.append(next_level)
 644             partition_points = next_level.i.part_pts
 645             inputs = next_level.o.terms
 646             ilen = len(inputs)
 647             part_ops = next_level.i.part_ops
 648
 649         next_level = FinalAdd(ilen, self.output_width, n_parts,
 650                               partition_points)
 651         mods.append(next_level)
 652
 653         self.levels = mods
 654
 655     def elaborate(self, platform):
 656         """Elaborate this module."""
 657         m = Module()
 658
 659         for i, next_level in enumerate(self.levels):
 660             setattr(m.submodules, "next_level%d" % i, next_level)
 661
 662         partition_points = self.partition_points
 663         inputs = self.inputs
 664         part_ops = self.part_ops
 665         n_parts = len(part_ops)
 666         n_inputs = len(inputs)
 667         output_width = self.output_width
 668         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 669         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 670         for idx in range(len(self.levels)):
 671             mcur = self.levels[idx]
 672             if idx in self.register_levels:
 673                 m.d.sync += mcur.i.eq(i)
 674             else:
 675                 m.d.comb += mcur.i.eq(i)
 676             i = mcur.o # for next loop
 677
 678         # output comes from last module
 679         m.d.comb += self.o.eq(i)
 680
 681         return m
 682
 683
# Multiply operation selectors.
# NOTE(review): presumably these are the values carried by the 2-bit
# ``part_ops`` signals -- confirm against the code that consumes them.
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 688
 689
 690 def get_term(value, shift=0, enabled=None):
 691     if enabled is not None:
 692         value = Mux(enabled, value, 0)
 693     if shift > 0:
 694         value = Cat(Repl(C(0, 1), shift), value)
 695     else:
 696         assert shift == 0
 697     return value
 698
 699
 700 class ProductTerm(Elaboratable):
 701     """ this class creates a single product term (a[..]*b[..]).
 702         it has a design flaw in that is the *output* that is selected,
 703         where the multiplication(s) are combinatorially generated
 704         all the time.
 705     """
 706
 707     def __init__(self, width, twidth, pbwid, a_index, b_index):
 708         self.a_index = a_index
 709         self.b_index = b_index
 710         shift = 8 * (self.a_index + self.b_index)
 711         self.pwidth = width
 712         self.twidth = twidth
 713         self.width = width*2
 714         self.shift = shift
 715
 716         self.ti = Signal(self.width, reset_less=True)
 717         self.term = Signal(twidth, reset_less=True)
 718         self.a = Signal(twidth//2, reset_less=True)
 719         self.b = Signal(twidth//2, reset_less=True)
 720         self.pb_en = Signal(pbwid, reset_less=True)
 721
 722         self.tl = tl = []
 723         min_index = min(self.a_index, self.b_index)
 724         max_index = max(self.a_index, self.b_index)
 725         for i in range(min_index, max_index):
 726             tl.append(self.pb_en[i])
 727         name = "te_%d_%d" % (self.a_index, self.b_index)
 728         if len(tl) > 0:
 729             term_enabled = Signal(name=name, reset_less=True)
 730         else:
 731             term_enabled = None
 732         self.enabled = term_enabled
 733         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 734
 735     def elaborate(self, platform):
 736
 737         m = Module()
 738         if self.enabled is not None:
 739             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 740
 741         bsa = Signal(self.width, reset_less=True)
 742         bsb = Signal(self.width, reset_less=True)
 743         a_index, b_index = self.a_index, self.b_index
 744         pwidth = self.pwidth
 745         m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
 746         m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
 747         m.d.comb += self.ti.eq(bsa * bsb)
 748         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 749         """
 750         #TODO: sort out width issues, get inputs a/b switched on/off.
 751         #data going into Muxes is 1/2 the required width
 752
 753         pwidth = self.pwidth
 754         width = self.width
 755         bsa = Signal(self.twidth//2, reset_less=True)
 756         bsb = Signal(self.twidth//2, reset_less=True)
 757         asel = Signal(width, reset_less=True)
 758         bsel = Signal(width, reset_less=True)
 759         a_index, b_index = self.a_index, self.b_index
 760         m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
 761         m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
 762         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 763         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 764         m.d.comb += self.ti.eq(bsa * bsb)
 765         m.d.comb += self.term.eq(self.ti)
 766         """
 767
 768         return m
 769
 770
 771 class ProductTerms(Elaboratable):
 772     """ creates a bank of product terms.  also performs the actual bit-selection
 773         this class is to be wrapped with a for-loop on the "a" operand.
 774         it creates a second-level for-loop on the "b" operand.
 775     """
 776     def __init__(self, width, twidth, pbwid, a_index, blen):
 777         self.a_index = a_index
 778         self.blen = blen
 779         self.pwidth = width
 780         self.twidth = twidth
 781         self.pbwid = pbwid
 782         self.a = Signal(twidth//2, reset_less=True)
 783         self.b = Signal(twidth//2, reset_less=True)
 784         self.pb_en = Signal(pbwid, reset_less=True)
 785         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 786                             for i in range(blen)]
 787
 788     def elaborate(self, platform):
 789
 790         m = Module()
 791
 792         for b_index in range(self.blen):
 793             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 794                             self.a_index, b_index)
 795             setattr(m.submodules, "term_%d" % b_index, t)
 796
 797             m.d.comb += t.a.eq(self.a)
 798             m.d.comb += t.b.eq(self.b)
 799             m.d.comb += t.pb_en.eq(self.pb_en)
 800
 801             m.d.comb += self.terms[b_index].eq(t.term)
 802
 803         return m
 804
 805
 806 class LSBNegTerm(Elaboratable):
 807
 808     def __init__(self, bit_width):
 809         self.bit_width = bit_width
 810         self.part = Signal(reset_less=True)
 811         self.signed = Signal(reset_less=True)
 812         self.op = Signal(bit_width, reset_less=True)
 813         self.msb = Signal(reset_less=True)
 814         self.nt = Signal(bit_width*2, reset_less=True)
 815         self.nl = Signal(bit_width*2, reset_less=True)
 816
 817     def elaborate(self, platform):
 818         m = Module()
 819         comb = m.d.comb
 820         bit_wid = self.bit_width
 821         ext = Repl(0, bit_wid) # extend output to HI part
 822
 823         # determine sign of each incoming number *in this partition*
 824         enabled = Signal(reset_less=True)
 825         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 826
 827         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 828         # negation operation is split into a bitwise not and a +1.
 829         # likewise for 16, 32, and 64-bit values.
 830
 831         # width-extended 1s complement if a is signed, otherwise zero
 832         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 833
 834         # add 1 if signed, otherwise add zero
 835         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 836
 837         return m
 838
 839
 840 class Parts(Elaboratable):
 841
 842     def __init__(self, pbwid, part_pts, n_parts):
 843         self.pbwid = pbwid
 844         # inputs
 845         self.part_pts = PartitionPoints.like(part_pts)
 846         # outputs
 847         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 848                       for i in range(n_parts)]
 849
 850     def elaborate(self, platform):
 851         m = Module()
 852
 853         part_pts, parts = self.part_pts, self.parts
 854         # collect part-bytes (double factor because the input is extended)
 855         pbs = Signal(self.pbwid, reset_less=True)
 856         tl = []
 857         for i in range(self.pbwid):
 858             pb = Signal(name="pb%d" % i, reset_less=True)
 859             m.d.comb += pb.eq(part_pts.part_byte(i))
 860             tl.append(pb)
 861         m.d.comb += pbs.eq(Cat(*tl))
 862
 863         # negated-temporary copy of partition bits
 864         npbs = Signal.like(pbs, reset_less=True)
 865         m.d.comb += npbs.eq(~pbs)
 866         byte_count = 8 // len(parts)
 867         for i in range(len(parts)):
 868             pbl = []
 869             pbl.append(npbs[i * byte_count - 1])
 870             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 871                 pbl.append(pbs[j])
 872             pbl.append(npbs[(i + 1) * byte_count - 1])
 873             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 874             m.d.comb += value.eq(Cat(*pbl))
 875             m.d.comb += parts[i].eq(~(value).bool())
 876
 877         return m
 878
 879
 880 class Part(Elaboratable):
 881     """ a key class which, depending on the partitioning, will determine
 882         what action to take when parts of the output are signed or unsigned.
 883
 884         this requires 2 pieces of data *per operand, per partition*:
 885         whether the MSB is HI/LO (per partition!), and whether a signed
 886         or unsigned operation has been *requested*.
 887
 888         once that is determined, signed is basically carried out
 889         by splitting 2's complement into 1's complement plus one.
 890         1's complement is just a bit-inversion.
 891
 892         the extra terms - as separate terms - are then thrown at the
 893         AddReduce alongside the multiplication part-results.
 894     """
 895     def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
 896
 897         self.pbwid = pbwid
 898         self.part_pts = part_pts
 899
 900         # inputs
 901         self.a = Signal(64, reset_less=True)
 902         self.b = Signal(64, reset_less=True)
 903         self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
 904                             for i in range(8)]
 905         self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
 906                             for i in range(8)]
 907         self.pbs = Signal(pbwid, reset_less=True)
 908
 909         # outputs
 910         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 911                             for i in range(n_parts)]
 912
 913         self.not_a_term = Signal(width, reset_less=True)
 914         self.neg_lsb_a_term = Signal(width, reset_less=True)
 915         self.not_b_term = Signal(width, reset_less=True)
 916         self.neg_lsb_b_term = Signal(width, reset_less=True)
 917
 918     def elaborate(self, platform):
 919         m = Module()
 920
 921         pbs, parts = self.pbs, self.parts
 922         part_pts = self.part_pts
 923         m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
 924         m.d.comb += p.part_pts.eq(part_pts)
 925         parts = p.parts
 926
 927         byte_count = 8 // len(parts)
 928
 929         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 930                 self.not_a_term, self.neg_lsb_a_term,
 931                 self.not_b_term, self.neg_lsb_b_term)
 932
 933         byte_width = 8 // len(parts) # byte width
 934         bit_wid = 8 * byte_width     # bit width
 935         nat, nbt, nla, nlb = [], [], [], []
 936         for i in range(len(parts)):
 937             # work out bit-inverted and +1 term for a.
 938             pa = LSBNegTerm(bit_wid)
 939             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 940             m.d.comb += pa.part.eq(parts[i])
 941             m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
 942             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 943             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 944             nat.append(pa.nt)
 945             nla.append(pa.nl)
 946
 947             # work out bit-inverted and +1 term for b
 948             pb = LSBNegTerm(bit_wid)
 949             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 950             m.d.comb += pb.part.eq(parts[i])
 951             m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
 952             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 953             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 954             nbt.append(pb.nt)
 955             nlb.append(pb.nl)
 956
 957         # concatenate together and return all 4 results.
 958         m.d.comb += [not_a_term.eq(Cat(*nat)),
 959                      not_b_term.eq(Cat(*nbt)),
 960                      neg_lsb_a_term.eq(Cat(*nla)),
 961                      neg_lsb_b_term.eq(Cat(*nlb)),
 962                     ]
 963
 964         return m
 965
 966
 967 class IntermediateOut(Elaboratable):
 968     """ selects the HI/LO part of the multiplication, for a given bit-width
 969         the output is also reconstructed in its SIMD (partition) lanes.
 970     """
 971     def __init__(self, width, out_wid, n_parts):
 972         self.width = width
 973         self.n_parts = n_parts
 974         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 975                                      for i in range(8)]
 976         self.intermed = Signal(out_wid, reset_less=True)
 977         self.output = Signal(out_wid//2, reset_less=True)
 978
 979     def elaborate(self, platform):
 980         m = Module()
 981
 982         ol = []
 983         w = self.width
 984         sel = w // 8
 985         for i in range(self.n_parts):
 986             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 987             m.d.comb += op.eq(
 988                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 989                     self.intermed.bit_select(i * w*2, w),
 990                     self.intermed.bit_select(i * w*2 + w, w)))
 991             ol.append(op)
 992         m.d.comb += self.output.eq(Cat(*ol))
 993
 994         return m
 995
 996
 997 class FinalOut(Elaboratable):
 998     """ selects the final output based on the partitioning.
 999
1000         each byte is selectable independently, i.e. it is possible
1001         that some partitions requested 8-bit computation whilst others
1002         requested 16 or 32 bit.
1003     """
1004     def __init__(self, output_width, n_parts, part_pts):
1005         self.part_pts = part_pts
1006         self.output_width = output_width
1007         self.n_parts = n_parts
1008         self.out_wid = output_width//2
1009
1010         self.i = self.ispec()
1011         self.o = self.ospec()
1012
1013     def ispec(self):
1014         return IntermediateData(self.part_pts, self.output_width, self.n_parts)
1015
1016     def ospec(self):
1017         return OutputData()
1018
1019     def elaborate(self, platform):
1020         m = Module()
1021
1022         part_pts = self.part_pts
1023         m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
1024         m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
1025         m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
1026         m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
1027
1028         out_part_pts = self.i.part_pts
1029
1030         # temporaries
1031         d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1032         d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1033         d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1034
1035         i8 = Signal(self.out_wid, reset_less=True)
1036         i16 = Signal(self.out_wid, reset_less=True)
1037         i32 = Signal(self.out_wid, reset_less=True)
1038         i64 = Signal(self.out_wid, reset_less=True)
1039
1040         m.d.comb += p_8.part_pts.eq(out_part_pts)
1041         m.d.comb += p_16.part_pts.eq(out_part_pts)
1042         m.d.comb += p_32.part_pts.eq(out_part_pts)
1043         m.d.comb += p_64.part_pts.eq(out_part_pts)
1044
1045         for i in range(len(p_8.parts)):
1046             m.d.comb += d8[i].eq(p_8.parts[i])
1047         for i in range(len(p_16.parts)):
1048             m.d.comb += d16[i].eq(p_16.parts[i])
1049         for i in range(len(p_32.parts)):
1050             m.d.comb += d32[i].eq(p_32.parts[i])
1051         m.d.comb += i8.eq(self.i.outputs[0])
1052         m.d.comb += i16.eq(self.i.outputs[1])
1053         m.d.comb += i32.eq(self.i.outputs[2])
1054         m.d.comb += i64.eq(self.i.outputs[3])
1055
1056         ol = []
1057         for i in range(8):
1058             # select one of the outputs: d8 selects i8, d16 selects i16
1059             # d32 selects i32, and the default is i64.
1060             # d8 and d16 are ORed together in the first Mux
1061             # then the 2nd selects either i8 or i16.
1062             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1063             op = Signal(8, reset_less=True, name="op_%d" % i)
1064             m.d.comb += op.eq(
1065                 Mux(d8[i] | d16[i // 2],
1066                     Mux(d8[i], i8.bit_select(i * 8, 8),
1067                                i16.bit_select(i * 8, 8)),
1068                     Mux(d32[i // 4], i32.bit_select(i * 8, 8),
1069                                       i64.bit_select(i * 8, 8))))
1070             ol.append(op)
1071
1072         # create outputs
1073         m.d.comb += self.o.output.eq(Cat(*ol))
1074         m.d.comb += self.o.intermediate_output.eq(self.i.intermediate_output)
1075
1076         return m
1077
1078
class OrMod(Elaboratable):
    """ ORs four values together in a hierarchical tree

        :param wid: bit-width of the four inputs and of the output.
    """
    def __init__(self, wid):
        self.wid = wid
        # inputs
        self.orin = []
        for i in range(4):
            self.orin.append(Signal(wid, name="orin%d" % i, reset_less=True))
        # output
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        """Two-level OR: (orin0 | orin1) | (orin2 | orin3)."""
        m = Module()
        orin = self.orin

        # first level of the tree
        # NOTE: the local names or1/or2 are also the generated signal names
        or1 = Signal(self.wid, reset_less=True)
        or2 = Signal(self.wid, reset_less=True)
        m.d.comb += [
            or1.eq(orin[0] | orin[1]),
            or2.eq(orin[2] | orin[3]),
            # second level
            self.orout.eq(or1 | or2),
        ]

        return m
1097
1098
class Signs(Elaboratable):
    """ determines whether a or b are signed numbers
        based on the required operation type (OP_MUL_*)

        ``a`` is treated as signed for every operation except
        ``OP_MUL_UNSIGNED_HIGH``; ``b`` is treated as signed only for
        ``OP_MUL_LOW`` and ``OP_MUL_SIGNED_HIGH``.
    """

    def __init__(self):
        # input: 2-bit operation code
        self.part_ops = Signal(2, reset_less=True)
        # outputs
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):
        """Decode ``part_ops`` into the two signedness flags."""
        m = Module()

        op = self.part_ops
        a_is_signed = op != OP_MUL_UNSIGNED_HIGH
        b_is_signed = (op == OP_MUL_LOW) | (op == OP_MUL_SIGNED_HIGH)
        m.d.comb += [
            self.a_signed.eq(a_is_signed),
            self.b_signed.eq(b_is_signed),
        ]

        return m
1120
1121
class IntermediateData:
    """Record of signals passed between the intermediate and final stages.

    :param part_pts: partition points; a ``like()`` copy is stored.
    :param output_width: width of each ``outputs`` entry and of
        ``intermediate_output``.
    :param n_parts: number of 2-bit ``part_ops`` entries.
    """

    def __init__(self, part_pts, output_width, n_parts):
        self.part_ops = []
        for i in range(n_parts):
            self.part_ops.append(
                Signal(2, name=f"part_ops_{i}", reset_less=True))
        self.part_pts = part_pts.like()
        self.outputs = []
        for i in range(4):
            self.outputs.append(
                Signal(output_width, name="io%d" % i, reset_less=True))
        # intermediates (needed for unit tests)
        self.intermediate_output = Signal(output_width)

    def eq_from(self, part_pts, outputs, intermediate_output,
                      part_ops):
        """Return the list of assignments that load this record from the
        given individual values."""
        assigns = [self.part_pts.eq(part_pts),
                   self.intermediate_output.eq(intermediate_output)]
        for i in range(4):
            assigns.append(self.outputs[i].eq(outputs[i]))
        for i, own_op in enumerate(self.part_ops):
            assigns.append(own_op.eq(part_ops[i]))
        return assigns

    def eq(self, rhs):
        """Return the assignments copying every field of ``rhs`` into self."""
        return self.eq_from(rhs.part_pts, rhs.outputs,
                            rhs.intermediate_output, rhs.part_ops)
1145
1146
class InputData:
    """Record of the multiplier's inputs: two 64-bit operands, a partition
    point at every multiple of 8 in 0 < i < 64, and one 2-bit operation
    code per byte."""

    def __init__(self):
        self.a = Signal(64)
        self.b = Signal(64)
        points = PartitionPoints()
        for bit in range(8, 64, 8):
            points[bit] = Signal(name=f"part_pts_{bit}")
        self.part_pts = points
        self.part_ops = []
        for i in range(8):
            self.part_ops.append(Signal(2, name=f"part_ops_{i}"))

    def eq_from(self, part_pts, a, b, part_ops):
        """Return the list of assignments that load this record from the
        given individual values."""
        assigns = [self.part_pts.eq(part_pts), self.a.eq(a), self.b.eq(b)]
        for i, own_op in enumerate(self.part_ops):
            assigns.append(own_op.eq(part_ops[i]))
        return assigns

    def eq(self, rhs):
        """Return the assignments copying every field of ``rhs`` into self."""
        return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1165
1166
class OutputData:
    """Record of the multiplier's outputs: the 64-bit result plus the
    128-bit intermediate value."""

    def __init__(self):
        self.intermediate_output = Signal(128) # needed for unit tests
        self.output = Signal(64)

    def eq(self, rhs):
        """Return the assignments copying both fields of ``rhs`` into self."""
        copy_intermediate = self.intermediate_output.eq(
            rhs.intermediate_output)
        copy_output = self.output.eq(rhs.output)
        return [copy_intermediate, copy_output]
1176
1177
class AllTerms(Elaboratable):
    """Set of terms to be added together

    Produces the 8x8 byte-wise product terms plus four sign-correction
    terms, and passes the partition points and operation codes through.
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels):
        """Create an ``AllTerms``.

        :param n_inputs: number of terms, forwarded to the output spec
            (``AddReduceData``).
        :param output_width: bit-width forwarded to the output spec.
        :param n_parts: number of parts, forwarded to the output spec.
        :param register_levels: List of nesting levels that should have
            pipeline registers; only its length is used here.
        """
        self.register_levels = register_levels
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.output_width = output_width

        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        """Return a fresh input record (``InputData``)."""
        return InputData()

    def ospec(self):
        """Return a fresh output record (``AddReduceData``) built around
        this instance's input partition points."""
        return AddReduceData(self.i.part_pts, self.n_inputs,
                             self.output_width, self.n_parts)

    def elaborate(self, platform):
        """Instantiate the sign decoders, ``Part`` blocks, product-term
        banks and OR trees, and copy all resulting terms to the output."""
        m = Module()
        comb = m.d.comb

        inp = self.i
        eps = inp.part_pts

        # collect part-bytes
        # NOTE: the local name pbs is also the generated signal name
        pbs = Signal(8, reset_less=True)
        pb_bits = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            comb += pb.eq(eps.part_byte(i))
            pb_bits.append(pb)
        comb += pbs.eq(Cat(*pb_bits))

        # one signedness decoder per byte
        signs = []
        for i in range(8):
            decoder = Signs()
            signs.append(decoder)
            setattr(m.submodules, "signs%d" % i, decoder)
            comb += decoder.part_ops.eq(inp.part_ops[i])

        # one Part per partition size: 8 x 8-bit, 4 x 16-bit, 2 x 32-bit
        # and 1 x 64-bit; submodules are named part_8 ... part_64
        n_levels = len(self.register_levels)+1
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for n_lanes in (8, 4, 2, 1):
            part = Part(eps, 128, n_lanes, n_levels, 8)
            setattr(m.submodules, "part_%d" % (64 // n_lanes), part)
            comb += part.a.eq(inp.a)
            comb += part.b.eq(inp.b)
            for i, decoder in enumerate(signs):
                comb += part.a_signed[i].eq(decoder.a_signed)
                comb += part.b_signed[i].eq(decoder.b_signed)
            comb += part.pbs.eq(pbs)
            nat_l.append(part.not_a_term)
            nbt_l.append(part.not_b_term)
            nla_l.append(part.neg_lsb_a_term)
            nlb_l.append(part.neg_lsb_b_term)

        terms = []

        # 8 banks of 8 byte-wise product terms
        for a_index in range(8):
            bank = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, bank)

            comb += bank.a.eq(inp.a)
            comb += bank.b.eq(inp.b)
            comb += bank.pb_en.eq(pbs)

            terms.extend(bank.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        or_wiring = ((nat_l, nat_or),
                     (nbt_l, nbt_or),
                     (nla_l, nla_or),
                     (nlb_l, nlb_or))
        for sources, or_tree in or_wiring:
            for i, source in enumerate(sources):
                comb += or_tree.orin[i].eq(source)
            terms.append(or_tree.orout)

        # copy the intermediate terms to the output
        for i, value in enumerate(terms):
            comb += self.o.terms[i].eq(value)

        # copy reg part points and part ops to output
        comb += self.o.part_pts.eq(eps)
        comb += [self.o.part_ops[i].eq(inp.part_ops[i])
                                     for i in range(len(inp.part_ops))]

        return m
1283
1284
class Intermediates(Elaboratable):
    """ Intermediate output modules

        Runs the summed product through one ``IntermediateOut`` per
        partition size (64, 32, 16 and 8 bits) and stores the four results
        in ``o.outputs[3]`` down to ``o.outputs[0]`` respectively.

        :param output_width: width forwarded to the input/output specs.
        :param n_parts: number of parts forwarded to the specs.
        :param part_pts: partition points forwarded to the specs.
    """

    def __init__(self, output_width, n_parts, part_pts):
        self.part_pts = part_pts
        self.output_width = output_width
        self.n_parts = n_parts

        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        """Return a fresh input record (``FinalReduceData``)."""
        return FinalReduceData(self.part_pts, self.output_width, self.n_parts)

    def ospec(self):
        """Return a fresh output record (``IntermediateData``)."""
        return IntermediateData(self.part_pts, self.output_width, self.n_parts)

    def elaborate(self, platform):
        """Create the four HI/LO selectors and pass the remaining fields
        through unchanged."""
        m = Module()
        comb = m.d.comb

        out_part_ops = self.i.part_ops
        out_part_pts = self.i.part_pts

        # (lane width, number of lanes, index into self.o.outputs);
        # submodules are named io64, io32, io16 and io8
        selectors = ((64, 1, 3),
                     (32, 2, 2),
                     (16, 4, 1),
                     (8, 8, 0))
        for lane_bits, n_lanes, slot in selectors:
            io = IntermediateOut(lane_bits, 128, n_lanes)
            setattr(m.submodules, "io%d" % lane_bits, io)
            comb += io.intermed.eq(self.i.output)
            for i in range(8):
                comb += io.part_ops[i].eq(out_part_ops[i])
            comb += self.o.outputs[slot].eq(io.output)

        # pass-through of operation codes, partition points and raw sum
        for i in range(8):
            comb += self.o.part_ops[i].eq(out_part_ops[i])
        comb += self.o.part_pts.eq(out_part_pts)
        comb += self.o.intermediate_output.eq(self.i.output)

        return m
1343
1344
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Supports partitioning into any combination of 8, 16, 32, and 64-bit
    partitions on naturally-aligned boundaries. Supports the operation being
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    :attribute a: first 64-bit operand (alias of ``i.a``).
    :attribute b: second 64-bit operand (alias of ``i.b``).
    :attribute output: the 64-bit result (alias of ``o.output``).
    :attribute intermediate_output: the 128-bit intermediate value (alias
        of ``o.intermediate_output``).
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted.
        """

        # parameter(s)
        self.register_levels = list(register_levels)

        self.i = self.ispec()
        self.o = self.ospec()

        # inputs
        self.part_pts = self.i.part_pts
        self.part_ops = self.i.part_ops
        self.a = self.i.a
        self.b = self.i.b

        # output
        self.intermediate_output = self.o.intermediate_output
        self.output = self.o.output

    def ispec(self):
        """Return a fresh input record (``InputData``)."""
        return InputData()

    def ospec(self):
        """Return a fresh output record (``OutputData``)."""
        return OutputData()

    def elaborate(self, platform):
        """Chain the four stages: term generation (``AllTerms``), summation
        (``AddReduce``), HI/LO selection (``Intermediates``) and per-byte
        output selection (``FinalOut``)."""
        m = Module()
        comb = m.d.comb

        part_pts = self.part_pts

        # 8 x 8 byte-wise product terms plus 4 sign-correction terms
        n_inputs = 64 + 4
        n_parts = 8

        # stage 1: generate every term to be summed
        t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
        m.submodules.allterms = t
        comb += t.i.eq(self.i)

        # stage 2: sum all of the terms
        add_reduce = AddReduce(t.o.terms,
                               128,
                               self.register_levels,
                               t.o.part_pts,
                               t.o.part_ops)
        m.submodules.add_reduce = add_reduce

        # stage 3: HI/LO selection for every partition size
        interm = Intermediates(128, n_parts, part_pts)
        m.submodules.intermediates = interm
        comb += interm.i.eq(add_reduce.o)

        # stage 4: final output
        m.submodules.finalout = finalout = FinalOut(128, n_parts, part_pts)
        comb += finalout.i.eq(interm.o)
        comb += self.o.eq(finalout.o)

        return m
1434
1435
if __name__ == "__main__":
    # script entry point: hand the multiplier and its ports to nmigen's CLI
    m = Mul8_16_32_64()
    ports = [m.a, m.b, m.intermediate_output, m.output]
    ports.extend(m.part_ops)
    ports.extend(m.part_pts.values())
    main(m, ports=ports)