src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width, mul=1):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         :param mul: a "multiplier" which in-place expands the partition points
  82                     typically set to "2" when used for multipliers
  83         """
  84         bits = []
  85         for i in range(width):
  86             i /= mul
  87             if i.is_integer() and int(i) in self:
  88                 bits.append(~self[i])
  89             else:
  90                 bits.append(True)
  91         return Cat(*bits)
  92
  93     def get_max_partition_count(self, width):
  94         """Get the maximum number of partitions.
  95
  96         Gets the number of partitions when all partition points are enabled.
  97         """
  98         retval = 1
  99         for point in self.keys():
 100             if point < width:
 101                 retval += 1
 102         return retval
 103
 104     def fits_in_width(self, width):
 105         """Check if all partition points are smaller than `width`."""
 106         for point in self.keys():
 107             if point >= width:
 108                 return False
 109         return True
 110
 111     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 112         if index == -1 or index == 7:
 113             return C(True, 1)
 114         assert index >= 0 and index < 8
 115         return self[(index * 8 + 8)*mfactor]
 116
 117
 118 class FullAdder(Elaboratable):
 119     """Full Adder.
 120
 121     :attribute in0: the first input
 122     :attribute in1: the second input
 123     :attribute in2: the third input
 124     :attribute sum: the sum output
 125     :attribute carry: the carry output
 126
 127     Rather than do individual full adders (and have an array of them,
 128     which would be very slow to simulate), this module can specify the
 129     bit width of the inputs and outputs: in effect it performs multiple
 130     Full 3-2 Add operations "in parallel".
 131     """
 132
 133     def __init__(self, width):
 134         """Create a ``FullAdder``.
 135
 136         :param width: the bit width of the input and output
 137         """
 138         self.in0 = Signal(width, reset_less=True)
 139         self.in1 = Signal(width, reset_less=True)
 140         self.in2 = Signal(width, reset_less=True)
 141         self.sum = Signal(width, reset_less=True)
 142         self.carry = Signal(width, reset_less=True)
 143
 144     def elaborate(self, platform):
 145         """Elaborate this module."""
 146         m = Module()
 147         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 148         m.d.comb += self.carry.eq((self.in0 & self.in1)
 149                                   | (self.in1 & self.in2)
 150                                   | (self.in2 & self.in0))
 151         return m
 152
 153
 154 class MaskedFullAdder(Elaboratable):
 155     """Masked Full Adder.
 156
 157     :attribute mask: the carry partition mask
 158     :attribute in0: the first input
 159     :attribute in1: the second input
 160     :attribute in2: the third input
 161     :attribute sum: the sum output
 162     :attribute mcarry: the masked carry output
 163
 164     FullAdders are always used with a "mask" on the output.  To keep
 165     the graphviz "clean", this class performs the masking here rather
 166     than inside a large for-loop.
 167
 168     See the following discussion as to why this is no longer derived
 169     from FullAdder.  Each carry is shifted here *before* being ANDed
 170     with the mask, so that an AOI cell may be used (which is more
 171     gate-efficient)
 172     https://en.wikipedia.org/wiki/AND-OR-Invert
 173     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 174     """
 175
 176     def __init__(self, width):
 177         """Create a ``MaskedFullAdder``.
 178
 179         :param width: the bit width of the input and output
 180         """
 181         self.width = width
 182         self.mask = Signal(width, reset_less=True)
 183         self.mcarry = Signal(width, reset_less=True)
 184         self.in0 = Signal(width, reset_less=True)
 185         self.in1 = Signal(width, reset_less=True)
 186         self.in2 = Signal(width, reset_less=True)
 187         self.sum = Signal(width, reset_less=True)
 188
 189     def elaborate(self, platform):
 190         """Elaborate this module."""
 191         m = Module()
 192         s1 = Signal(self.width, reset_less=True)
 193         s2 = Signal(self.width, reset_less=True)
 194         s3 = Signal(self.width, reset_less=True)
 195         c1 = Signal(self.width, reset_less=True)
 196         c2 = Signal(self.width, reset_less=True)
 197         c3 = Signal(self.width, reset_less=True)
 198         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 199         m.d.comb += s1.eq(Cat(0, self.in0))
 200         m.d.comb += s2.eq(Cat(0, self.in1))
 201         m.d.comb += s3.eq(Cat(0, self.in2))
 202         m.d.comb += c1.eq(s1 & s2 & self.mask)
 203         m.d.comb += c2.eq(s2 & s3 & self.mask)
 204         m.d.comb += c3.eq(s3 & s1 & self.mask)
 205         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 206         return m
 207
 208
 209 class PartitionedAdder(Elaboratable):
 210     """Partitioned Adder.
 211
 212     Performs the final add.  The partition points are included in the
 213     actual add (in one of the operands only), which causes a carry over
 214     to the next bit.  Then the final output *removes* the extra bits from
 215     the result.
 216
 217     partition: .... P... P... P... P... (32 bits)
 218     a        : .... .... .... .... .... (32 bits)
 219     b        : .... .... .... .... .... (32 bits)
 220     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 221     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 222     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 223     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 224
 225     :attribute width: the bit width of the input and output. Read-only.
 226     :attribute a: the first input to the adder
 227     :attribute b: the second input to the adder
 228     :attribute output: the sum output
 229     :attribute partition_points: the input partition points. Modification not
 230         supported, except for by ``Signal.eq``.
 231     """
 232
 233     def __init__(self, width, partition_points, partition_step=1):
 234         """Create a ``PartitionedAdder``.
 235
 236         :param width: the bit width of the input and output
 237         :param partition_points: the input partition points
 238         :param partition_step: a multiplier (typically double) step
 239                                which in-place "expands" the partition points
 240         """
 241         self.width = width
 242         self.pmul = partition_step
 243         self.a = Signal(width, reset_less=True)
 244         self.b = Signal(width, reset_less=True)
 245         self.output = Signal(width, reset_less=True)
 246         self.partition_points = PartitionPoints(partition_points)
 247         if not self.partition_points.fits_in_width(width):
 248             raise ValueError("partition_points doesn't fit in width")
 249         expanded_width = 0
 250         for i in range(self.width):
 251             if i in self.partition_points:
 252                 expanded_width += 1
 253             expanded_width += 1
 254         self._expanded_width = expanded_width
 255
 256     def elaborate(self, platform):
 257         """Elaborate this module."""
 258         m = Module()
 259         expanded_a = Signal(self._expanded_width, reset_less=True)
 260         expanded_b = Signal(self._expanded_width, reset_less=True)
 261         expanded_o = Signal(self._expanded_width, reset_less=True)
 262
 263         expanded_index = 0
 264         # store bits in a list, use Cat later.  graphviz is much cleaner
 265         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 266
 267         # partition points are "breaks" (extra zeros or 1s) in what would
 268         # otherwise be a massive long add.  when the "break" points are 0,
 269         # whatever is in it (in the output) is discarded.  however when
 270         # there is a "1", it causes a roll-over carry to the *next* bit.
 271         # we still ignore the "break" bit in the [intermediate] output,
 272         # however by that time we've got the effect that we wanted: the
 273         # carry has been carried *over* the break point.
 274
 275         for i in range(self.width):
 276             pi = i/self.pmul # double the range of the partition point test
 277             if pi.is_integer() and pi in self.partition_points:
 278                 # add extra bit set to 0 + 0 for enabled partition points
 279                 # and 1 + 0 for disabled partition points
 280                 ea.append(expanded_a[expanded_index])
 281                 al.append(~self.partition_points[pi]) # add extra bit in a
 282                 eb.append(expanded_b[expanded_index])
 283                 bl.append(C(0)) # yes, add a zero
 284                 expanded_index += 1 # skip the extra point.  NOT in the output
 285             ea.append(expanded_a[expanded_index])
 286             eb.append(expanded_b[expanded_index])
 287             eo.append(expanded_o[expanded_index])
 288             al.append(self.a[i])
 289             bl.append(self.b[i])
 290             ol.append(self.output[i])
 291             expanded_index += 1
 292
 293         # combine above using Cat
 294         m.d.comb += Cat(*ea).eq(Cat(*al))
 295         m.d.comb += Cat(*eb).eq(Cat(*bl))
 296         m.d.comb += Cat(*ol).eq(Cat(*eo))
 297
 298         # use only one addition to take advantage of look-ahead carry and
 299         # special hardware on FPGAs
 300         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 301         return m
 302
 303
 304 FULL_ADDER_INPUT_COUNT = 3
 305
 306 class AddReduceData:
 307
 308     def __init__(self, part_pts, n_inputs, output_width, n_parts):
 309         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 310                           for i in range(n_parts)]
 311         self.terms = [Signal(output_width, name=f"inputs_{i}",
 312                               reset_less=True)
 313                         for i in range(n_inputs)]
 314         self.part_pts = part_pts.like()
 315
 316     def eq_from(self, part_pts, inputs, part_ops):
 317         return [self.part_pts.eq(part_pts)] + \
 318                [self.terms[i].eq(inputs[i])
 319                                      for i in range(len(self.terms))] + \
 320                [self.part_ops[i].eq(part_ops[i])
 321                                      for i in range(len(self.part_ops))]
 322
 323     def eq(self, rhs):
 324         return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
 325
 326
 327 class FinalReduceData:
 328
 329     def __init__(self, part_pts, output_width, n_parts):
 330         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 331                           for i in range(n_parts)]
 332         self.output = Signal(output_width, reset_less=True)
 333         self.part_pts = part_pts.like()
 334
 335     def eq_from(self, part_pts, output, part_ops):
 336         return [self.part_pts.eq(part_pts)] + \
 337                [self.output.eq(output)] + \
 338                [self.part_ops[i].eq(part_ops[i])
 339                                      for i in range(len(self.part_ops))]
 340
 341     def eq(self, rhs):
 342         return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
 343
 344
 345 class FinalAdd(Elaboratable):
 346     """ Final stage of add reduce
 347     """
 348
 349     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 350         self.i = AddReduceData(partition_points, n_inputs,
 351                                output_width, n_parts)
 352         self.o = FinalReduceData(partition_points, output_width, n_parts)
 353         self.output_width = output_width
 354         self.n_inputs = n_inputs
 355         self.n_parts = n_parts
 356         self.partition_points = PartitionPoints(partition_points)
 357         if not self.partition_points.fits_in_width(output_width):
 358             raise ValueError("partition_points doesn't fit in output_width")
 359
 360     def elaborate(self, platform):
 361         """Elaborate this module."""
 362         m = Module()
 363
 364         output_width = self.output_width
 365         output = Signal(output_width, reset_less=True)
 366         if self.n_inputs == 0:
 367             # use 0 as the default output value
 368             m.d.comb += output.eq(0)
 369         elif self.n_inputs == 1:
 370             # handle single input
 371             m.d.comb += output.eq(self.i.terms[0])
 372         else:
 373             # base case for adding 2 inputs
 374             assert self.n_inputs == 2
 375             adder = PartitionedAdder(output_width,
 376                                      self.i.part_pts, 2)
 377             m.submodules.final_adder = adder
 378             m.d.comb += adder.a.eq(self.i.terms[0])
 379             m.d.comb += adder.b.eq(self.i.terms[1])
 380             m.d.comb += output.eq(adder.output)
 381
 382         # create output
 383         m.d.comb += self.o.eq_from(self.i.part_pts, output,
 384                                    self.i.part_ops)
 385
 386         return m
 387
 388
 389 class AddReduceSingle(Elaboratable):
 390     """Add list of numbers together.
 391
 392     :attribute inputs: input ``Signal``s to be summed. Modification not
 393         supported, except for by ``Signal.eq``.
 394     :attribute register_levels: List of nesting levels that should have
 395         pipeline registers.
 396     :attribute output: output sum.
 397     :attribute partition_points: the input partition points. Modification not
 398         supported, except for by ``Signal.eq``.
 399     """
 400
 401     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 402         """Create an ``AddReduce``.
 403
 404         :param inputs: input ``Signal``s to be summed.
 405         :param output_width: bit-width of ``output``.
 406         :param partition_points: the input partition points.
 407         """
 408         self.n_inputs = n_inputs
 409         self.n_parts = n_parts
 410         self.output_width = output_width
 411         self.i = AddReduceData(partition_points, n_inputs,
 412                                output_width, n_parts)
 413         self.partition_points = PartitionPoints(partition_points)
 414         if not self.partition_points.fits_in_width(output_width):
 415             raise ValueError("partition_points doesn't fit in output_width")
 416
 417         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 418         n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
 419         self.o = AddReduceData(partition_points, n_terms, output_width, n_parts)
 420
 421     @staticmethod
 422     def calc_n_inputs(n_inputs, groups):
 423         retval = len(groups)*2
 424         if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 425             retval += 1
 426         elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 427             retval += 2
 428         else:
 429             assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
 430         return retval
 431
 432     @staticmethod
 433     def get_max_level(input_count):
 434         """Get the maximum level.
 435
 436         All ``register_levels`` must be less than or equal to the maximum
 437         level.
 438         """
 439         retval = 0
 440         while True:
 441             groups = AddReduceSingle.full_adder_groups(input_count)
 442             if len(groups) == 0:
 443                 return retval
 444             input_count %= FULL_ADDER_INPUT_COUNT
 445             input_count += 2 * len(groups)
 446             retval += 1
 447
 448     @staticmethod
 449     def full_adder_groups(input_count):
 450         """Get ``inputs`` indices for which a full adder should be built."""
 451         return range(0,
 452                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 453                      FULL_ADDER_INPUT_COUNT)
 454
 455     def create_next_terms(self):
 456         """ create next intermediate terms, for linking up in elaborate, below
 457         """
 458         terms = []
 459         adders = []
 460
 461         # create full adders for this recursive level.
 462         # this shrinks N terms to 2 * (N // 3) plus the remainder
 463         for i in self.groups:
 464             adder_i = MaskedFullAdder(self.output_width)
 465             adders.append((i, adder_i))
 466             # add both the sum and the masked-carry to the next level.
 467             # 3 inputs have now been reduced to 2...
 468             terms.append(adder_i.sum)
 469             terms.append(adder_i.mcarry)
 470         # handle the remaining inputs.
 471         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 472             terms.append(self.i.terms[-1])
 473         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 474             # Just pass the terms to the next layer, since we wouldn't gain
 475             # anything by using a half adder since there would still be 2 terms
 476             # and just passing the terms to the next layer saves gates.
 477             terms.append(self.i.terms[-2])
 478             terms.append(self.i.terms[-1])
 479         else:
 480             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 481
 482         return terms, adders
 483
 484     def elaborate(self, platform):
 485         """Elaborate this module."""
 486         m = Module()
 487
 488         terms, adders = self.create_next_terms()
 489
 490         # copy the intermediate terms to the output
 491         for i, value in enumerate(terms):
 492             m.d.comb += self.o.terms[i].eq(value)
 493
 494         # copy reg part points and part ops to output
 495         m.d.comb += self.o.part_pts.eq(self.i.part_pts)
 496         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 497                                      for i in range(len(self.i.part_ops))]
 498
 499         # set up the partition mask (for the adders)
 500         part_mask = Signal(self.output_width, reset_less=True)
 501
 502         # get partition points as a mask
 503         mask = self.i.part_pts.as_mask(self.output_width, mul=2)
 504         m.d.comb += part_mask.eq(mask)
 505
 506         # add and link the intermediate term modules
 507         for i, (iidx, adder_i) in enumerate(adders):
 508             setattr(m.submodules, f"adder_{i}", adder_i)
 509
 510             m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
 511             m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
 512             m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
 513             m.d.comb += adder_i.mask.eq(part_mask)
 514
 515         return m
 516
 517
 518 class AddReduceInternal:
 519     """Recursively Add list of numbers together.
 520
 521     :attribute inputs: input ``Signal``s to be summed. Modification not
 522         supported, except for by ``Signal.eq``.
 523     :attribute register_levels: List of nesting levels that should have
 524         pipeline registers.
 525     :attribute output: output sum.
 526     :attribute partition_points: the input partition points. Modification not
 527         supported, except for by ``Signal.eq``.
 528     """
 529
 530     def __init__(self, inputs, output_width, partition_points,
 531                        part_ops):
 532         """Create an ``AddReduce``.
 533
 534         :param inputs: input ``Signal``s to be summed.
 535         :param output_width: bit-width of ``output``.
 536         :param partition_points: the input partition points.
 537         """
 538         self.inputs = inputs
 539         self.part_ops = part_ops
 540         self.output_width = output_width
 541         self.partition_points = partition_points
 542
 543         self.create_levels()
 544
 545     def create_levels(self):
 546         """creates reduction levels"""
 547
 548         mods = []
 549         partition_points = self.partition_points
 550         part_ops = self.part_ops
 551         n_parts = len(part_ops)
 552         inputs = self.inputs
 553         ilen = len(inputs)
 554         while True:
 555             groups = AddReduceSingle.full_adder_groups(len(inputs))
 556             if len(groups) == 0:
 557                 break
 558             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 559                                          partition_points)
 560             mods.append(next_level)
 561             partition_points = next_level.i.part_pts
 562             inputs = next_level.o.terms
 563             ilen = len(inputs)
 564             part_ops = next_level.i.part_ops
 565
 566         next_level = FinalAdd(ilen, self.output_width, n_parts,
 567                               partition_points)
 568         mods.append(next_level)
 569
 570         self.levels = mods
 571
 572
 573 class AddReduce(AddReduceInternal, Elaboratable):
 574     """Recursively Add list of numbers together.
 575
 576     :attribute inputs: input ``Signal``s to be summed. Modification not
 577         supported, except for by ``Signal.eq``.
 578     :attribute register_levels: List of nesting levels that should have
 579         pipeline registers.
 580     :attribute output: output sum.
 581     :attribute partition_points: the input partition points. Modification not
 582         supported, except for by ``Signal.eq``.
 583     """
 584
 585     def __init__(self, inputs, output_width, register_levels, partition_points,
 586                        part_ops):
 587         """Create an ``AddReduce``.
 588
 589         :param inputs: input ``Signal``s to be summed.
 590         :param output_width: bit-width of ``output``.
 591         :param register_levels: List of nesting levels that should have
 592             pipeline registers.
 593         :param partition_points: the input partition points.
 594         """
 595         AddReduceInternal.__init__(self, inputs, output_width,
 596                                    partition_points, part_ops)
 597         n_parts = len(part_ops)
 598         self.o = FinalReduceData(partition_points, output_width, n_parts)
 599         self.register_levels = register_levels
 600
 601     @staticmethod
 602     def get_max_level(input_count):
 603         return AddReduceSingle.get_max_level(input_count)
 604
 605     @staticmethod
 606     def next_register_levels(register_levels):
 607         """``Iterable`` of ``register_levels`` for next recursive level."""
 608         for level in register_levels:
 609             if level > 0:
 610                 yield level - 1
 611
 612     def create_levels(self):
 613         """creates reduction levels"""
 614
 615         mods = []
 616         partition_points = self.partition_points
 617         part_ops = self.part_ops
 618         n_parts = len(part_ops)
 619         inputs = self.inputs
 620         ilen = len(inputs)
 621         while True:
 622             groups = AddReduceSingle.full_adder_groups(len(inputs))
 623             if len(groups) == 0:
 624                 break
 625             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 626                                          partition_points)
 627             mods.append(next_level)
 628             partition_points = next_level.i.part_pts
 629             inputs = next_level.o.terms
 630             ilen = len(inputs)
 631             part_ops = next_level.i.part_ops
 632
 633         next_level = FinalAdd(ilen, self.output_width, n_parts,
 634                               partition_points)
 635         mods.append(next_level)
 636
 637         self.levels = mods
 638
 639     def elaborate(self, platform):
 640         """Elaborate this module."""
 641         m = Module()
 642
 643         for i, next_level in enumerate(self.levels):
 644             setattr(m.submodules, "next_level%d" % i, next_level)
 645
 646         partition_points = self.partition_points
 647         inputs = self.inputs
 648         part_ops = self.part_ops
 649         n_parts = len(part_ops)
 650         n_inputs = len(inputs)
 651         output_width = self.output_width
 652         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 653         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 654         for idx in range(len(self.levels)):
 655             mcur = self.levels[idx]
 656             if idx in self.register_levels:
 657                 m.d.sync += mcur.i.eq(i)
 658             else:
 659                 m.d.comb += mcur.i.eq(i)
 660             i = mcur.o # for next loop
 661
 662         # output comes from last module
 663         m.d.comb += self.o.eq(i)
 664
 665         return m
 666
 667
 668 OP_MUL_LOW = 0
 669 OP_MUL_SIGNED_HIGH = 1
 670 OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
 671 OP_MUL_UNSIGNED_HIGH = 3
 672
 673
 674 def get_term(value, shift=0, enabled=None):
 675     if enabled is not None:
 676         value = Mux(enabled, value, 0)
 677     if shift > 0:
 678         value = Cat(Repl(C(0, 1), shift), value)
 679     else:
 680         assert shift == 0
 681     return value
 682
 683
 684 class ProductTerm(Elaboratable):
 685     """ this class creates a single product term (a[..]*b[..]).
 686         it has a design flaw in that is the *output* that is selected,
 687         where the multiplication(s) are combinatorially generated
 688         all the time.
 689     """
 690
 691     def __init__(self, width, twidth, pbwid, a_index, b_index):
 692         self.a_index = a_index
 693         self.b_index = b_index
 694         shift = 8 * (self.a_index + self.b_index)
 695         self.pwidth = width
 696         self.twidth = twidth
 697         self.width = width*2
 698         self.shift = shift
 699
 700         self.ti = Signal(self.width, reset_less=True)
 701         self.term = Signal(twidth, reset_less=True)
 702         self.a = Signal(twidth//2, reset_less=True)
 703         self.b = Signal(twidth//2, reset_less=True)
 704         self.pb_en = Signal(pbwid, reset_less=True)
 705
 706         self.tl = tl = []
 707         min_index = min(self.a_index, self.b_index)
 708         max_index = max(self.a_index, self.b_index)
 709         for i in range(min_index, max_index):
 710             tl.append(self.pb_en[i])
 711         name = "te_%d_%d" % (self.a_index, self.b_index)
 712         if len(tl) > 0:
 713             term_enabled = Signal(name=name, reset_less=True)
 714         else:
 715             term_enabled = None
 716         self.enabled = term_enabled
 717         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 718
 719     def elaborate(self, platform):
 720
 721         m = Module()
 722         if self.enabled is not None:
 723             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 724
 725         bsa = Signal(self.width, reset_less=True)
 726         bsb = Signal(self.width, reset_less=True)
 727         a_index, b_index = self.a_index, self.b_index
 728         pwidth = self.pwidth
 729         m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
 730         m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
 731         m.d.comb += self.ti.eq(bsa * bsb)
 732         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 733         """
 734         #TODO: sort out width issues, get inputs a/b switched on/off.
 735         #data going into Muxes is 1/2 the required width
 736
 737         pwidth = self.pwidth
 738         width = self.width
 739         bsa = Signal(self.twidth//2, reset_less=True)
 740         bsb = Signal(self.twidth//2, reset_less=True)
 741         asel = Signal(width, reset_less=True)
 742         bsel = Signal(width, reset_less=True)
 743         a_index, b_index = self.a_index, self.b_index
 744         m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
 745         m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
 746         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 747         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 748         m.d.comb += self.ti.eq(bsa * bsb)
 749         m.d.comb += self.term.eq(self.ti)
 750         """
 751
 752         return m
 753
 754
 755 class ProductTerms(Elaboratable):
 756     """ creates a bank of product terms.  also performs the actual bit-selection
 757         this class is to be wrapped with a for-loop on the "a" operand.
 758         it creates a second-level for-loop on the "b" operand.
 759     """
 760     def __init__(self, width, twidth, pbwid, a_index, blen):
 761         self.a_index = a_index
 762         self.blen = blen
 763         self.pwidth = width
 764         self.twidth = twidth
 765         self.pbwid = pbwid
 766         self.a = Signal(twidth//2, reset_less=True)
 767         self.b = Signal(twidth//2, reset_less=True)
 768         self.pb_en = Signal(pbwid, reset_less=True)
 769         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 770                             for i in range(blen)]
 771
 772     def elaborate(self, platform):
 773
 774         m = Module()
 775
 776         for b_index in range(self.blen):
 777             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 778                             self.a_index, b_index)
 779             setattr(m.submodules, "term_%d" % b_index, t)
 780
 781             m.d.comb += t.a.eq(self.a)
 782             m.d.comb += t.b.eq(self.b)
 783             m.d.comb += t.pb_en.eq(self.pb_en)
 784
 785             m.d.comb += self.terms[b_index].eq(t.term)
 786
 787         return m
 788
 789
 790 class LSBNegTerm(Elaboratable):
 791
 792     def __init__(self, bit_width):
 793         self.bit_width = bit_width
 794         self.part = Signal(reset_less=True)
 795         self.signed = Signal(reset_less=True)
 796         self.op = Signal(bit_width, reset_less=True)
 797         self.msb = Signal(reset_less=True)
 798         self.nt = Signal(bit_width*2, reset_less=True)
 799         self.nl = Signal(bit_width*2, reset_less=True)
 800
 801     def elaborate(self, platform):
 802         m = Module()
 803         comb = m.d.comb
 804         bit_wid = self.bit_width
 805         ext = Repl(0, bit_wid) # extend output to HI part
 806
 807         # determine sign of each incoming number *in this partition*
 808         enabled = Signal(reset_less=True)
 809         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 810
 811         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 812         # negation operation is split into a bitwise not and a +1.
 813         # likewise for 16, 32, and 64-bit values.
 814
 815         # width-extended 1s complement if a is signed, otherwise zero
 816         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 817
 818         # add 1 if signed, otherwise add zero
 819         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 820
 821         return m
 822
 823
 824 class Parts(Elaboratable):
 825
 826     def __init__(self, pbwid, part_pts, n_parts):
 827         self.pbwid = pbwid
 828         # inputs
 829         self.part_pts = PartitionPoints.like(part_pts)
 830         # outputs
 831         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 832                       for i in range(n_parts)]
 833
 834     def elaborate(self, platform):
 835         m = Module()
 836
 837         part_pts, parts = self.part_pts, self.parts
 838         # collect part-bytes (double factor because the input is extended)
 839         pbs = Signal(self.pbwid, reset_less=True)
 840         tl = []
 841         for i in range(self.pbwid):
 842             pb = Signal(name="pb%d" % i, reset_less=True)
 843             m.d.comb += pb.eq(part_pts.part_byte(i))
 844             tl.append(pb)
 845         m.d.comb += pbs.eq(Cat(*tl))
 846
 847         # negated-temporary copy of partition bits
 848         npbs = Signal.like(pbs, reset_less=True)
 849         m.d.comb += npbs.eq(~pbs)
 850         byte_count = 8 // len(parts)
 851         for i in range(len(parts)):
 852             pbl = []
 853             pbl.append(npbs[i * byte_count - 1])
 854             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 855                 pbl.append(pbs[j])
 856             pbl.append(npbs[(i + 1) * byte_count - 1])
 857             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 858             m.d.comb += value.eq(Cat(*pbl))
 859             m.d.comb += parts[i].eq(~(value).bool())
 860
 861         return m
 862
 863
 864 class Part(Elaboratable):
 865     """ a key class which, depending on the partitioning, will determine
 866         what action to take when parts of the output are signed or unsigned.
 867
 868         this requires 2 pieces of data *per operand, per partition*:
 869         whether the MSB is HI/LO (per partition!), and whether a signed
 870         or unsigned operation has been *requested*.
 871
 872         once that is determined, signed is basically carried out
 873         by splitting 2's complement into 1's complement plus one.
 874         1's complement is just a bit-inversion.
 875
 876         the extra terms - as separate terms - are then thrown at the
 877         AddReduce alongside the multiplication part-results.
 878     """
 879     def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
 880
 881         self.pbwid = pbwid
 882         self.part_pts = part_pts
 883
 884         # inputs
 885         self.a = Signal(64, reset_less=True)
 886         self.b = Signal(64, reset_less=True)
 887         self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
 888                             for i in range(8)]
 889         self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
 890                             for i in range(8)]
 891         self.pbs = Signal(pbwid, reset_less=True)
 892
 893         # outputs
 894         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 895                             for i in range(n_parts)]
 896
 897         self.not_a_term = Signal(width, reset_less=True)
 898         self.neg_lsb_a_term = Signal(width, reset_less=True)
 899         self.not_b_term = Signal(width, reset_less=True)
 900         self.neg_lsb_b_term = Signal(width, reset_less=True)
 901
 902     def elaborate(self, platform):
 903         m = Module()
 904
 905         pbs, parts = self.pbs, self.parts
 906         part_pts = self.part_pts
 907         m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
 908         m.d.comb += p.part_pts.eq(part_pts)
 909         parts = p.parts
 910
 911         byte_count = 8 // len(parts)
 912
 913         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 914                 self.not_a_term, self.neg_lsb_a_term,
 915                 self.not_b_term, self.neg_lsb_b_term)
 916
 917         byte_width = 8 // len(parts) # byte width
 918         bit_wid = 8 * byte_width     # bit width
 919         nat, nbt, nla, nlb = [], [], [], []
 920         for i in range(len(parts)):
 921             # work out bit-inverted and +1 term for a.
 922             pa = LSBNegTerm(bit_wid)
 923             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 924             m.d.comb += pa.part.eq(parts[i])
 925             m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
 926             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 927             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 928             nat.append(pa.nt)
 929             nla.append(pa.nl)
 930
 931             # work out bit-inverted and +1 term for b
 932             pb = LSBNegTerm(bit_wid)
 933             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 934             m.d.comb += pb.part.eq(parts[i])
 935             m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
 936             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 937             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 938             nbt.append(pb.nt)
 939             nlb.append(pb.nl)
 940
 941         # concatenate together and return all 4 results.
 942         m.d.comb += [not_a_term.eq(Cat(*nat)),
 943                      not_b_term.eq(Cat(*nbt)),
 944                      neg_lsb_a_term.eq(Cat(*nla)),
 945                      neg_lsb_b_term.eq(Cat(*nlb)),
 946                     ]
 947
 948         return m
 949
 950
 951 class IntermediateOut(Elaboratable):
 952     """ selects the HI/LO part of the multiplication, for a given bit-width
 953         the output is also reconstructed in its SIMD (partition) lanes.
 954     """
 955     def __init__(self, width, out_wid, n_parts):
 956         self.width = width
 957         self.n_parts = n_parts
 958         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 959                                      for i in range(8)]
 960         self.intermed = Signal(out_wid, reset_less=True)
 961         self.output = Signal(out_wid//2, reset_less=True)
 962
 963     def elaborate(self, platform):
 964         m = Module()
 965
 966         ol = []
 967         w = self.width
 968         sel = w // 8
 969         for i in range(self.n_parts):
 970             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 971             m.d.comb += op.eq(
 972                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 973                     self.intermed.part(i * w*2, w),
 974                     self.intermed.part(i * w*2 + w, w)))
 975             ol.append(op)
 976         m.d.comb += self.output.eq(Cat(*ol))
 977
 978         return m
 979
 980
 981 class FinalOut(Elaboratable):
 982     """ selects the final output based on the partitioning.
 983
 984         each byte is selectable independently, i.e. it is possible
 985         that some partitions requested 8-bit computation whilst others
 986         requested 16 or 32 bit.
 987     """
 988     def __init__(self, output_width, n_parts, part_pts):
 989         self.part_pts = part_pts
 990         self.i = IntermediateData(part_pts, output_width, n_parts)
 991         self.out_wid = output_width//2
 992         # output
 993         self.out = Signal(self.out_wid, reset_less=True)
 994         self.intermediate_output = Signal(output_width, reset_less=True)
 995
 996     def elaborate(self, platform):
 997         m = Module()
 998
 999         part_pts = self.part_pts
1000         m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
1001         m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
1002         m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
1003         m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
1004
1005         out_part_pts = self.i.part_pts
1006
1007         # temporaries
1008         d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1009         d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1010         d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1011
1012         i8 = Signal(self.out_wid, reset_less=True)
1013         i16 = Signal(self.out_wid, reset_less=True)
1014         i32 = Signal(self.out_wid, reset_less=True)
1015         i64 = Signal(self.out_wid, reset_less=True)
1016
1017         m.d.comb += p_8.part_pts.eq(out_part_pts)
1018         m.d.comb += p_16.part_pts.eq(out_part_pts)
1019         m.d.comb += p_32.part_pts.eq(out_part_pts)
1020         m.d.comb += p_64.part_pts.eq(out_part_pts)
1021
1022         for i in range(len(p_8.parts)):
1023             m.d.comb += d8[i].eq(p_8.parts[i])
1024         for i in range(len(p_16.parts)):
1025             m.d.comb += d16[i].eq(p_16.parts[i])
1026         for i in range(len(p_32.parts)):
1027             m.d.comb += d32[i].eq(p_32.parts[i])
1028         m.d.comb += i8.eq(self.i.outputs[0])
1029         m.d.comb += i16.eq(self.i.outputs[1])
1030         m.d.comb += i32.eq(self.i.outputs[2])
1031         m.d.comb += i64.eq(self.i.outputs[3])
1032
1033         ol = []
1034         for i in range(8):
1035             # select one of the outputs: d8 selects i8, d16 selects i16
1036             # d32 selects i32, and the default is i64.
1037             # d8 and d16 are ORed together in the first Mux
1038             # then the 2nd selects either i8 or i16.
1039             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1040             op = Signal(8, reset_less=True, name="op_%d" % i)
1041             m.d.comb += op.eq(
1042                 Mux(d8[i] | d16[i // 2],
1043                     Mux(d8[i], i8.part(i * 8, 8), i16.part(i * 8, 8)),
1044                     Mux(d32[i // 4], i32.part(i * 8, 8), i64.part(i * 8, 8))))
1045             ol.append(op)
1046         m.d.comb += self.out.eq(Cat(*ol))
1047         m.d.comb += self.intermediate_output.eq(self.i.intermediate_output)
1048         return m
1049
1050
1051 class OrMod(Elaboratable):
1052     """ ORs four values together in a hierarchical tree
1053     """
1054     def __init__(self, wid):
1055         self.wid = wid
1056         self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
1057                      for i in range(4)]
1058         self.orout = Signal(wid, reset_less=True)
1059
1060     def elaborate(self, platform):
1061         m = Module()
1062         or1 = Signal(self.wid, reset_less=True)
1063         or2 = Signal(self.wid, reset_less=True)
1064         m.d.comb += or1.eq(self.orin[0] | self.orin[1])
1065         m.d.comb += or2.eq(self.orin[2] | self.orin[3])
1066         m.d.comb += self.orout.eq(or1 | or2)
1067
1068         return m
1069
1070
1071 class Signs(Elaboratable):
1072     """ determines whether a or b are signed numbers
1073         based on the required operation type (OP_MUL_*)
1074     """
1075
1076     def __init__(self):
1077         self.part_ops = Signal(2, reset_less=True)
1078         self.a_signed = Signal(reset_less=True)
1079         self.b_signed = Signal(reset_less=True)
1080
1081     def elaborate(self, platform):
1082
1083         m = Module()
1084
1085         asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1086         bsig = (self.part_ops == OP_MUL_LOW) \
1087                     | (self.part_ops == OP_MUL_SIGNED_HIGH)
1088         m.d.comb += self.a_signed.eq(asig)
1089         m.d.comb += self.b_signed.eq(bsig)
1090
1091         return m
1092
1093
1094 class IntermediateData:
1095
1096     def __init__(self, part_pts, output_width, n_parts):
1097         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
1098                           for i in range(n_parts)]
1099         self.part_pts = part_pts.like()
1100         self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
1101                           for i in range(4)]
1102         # intermediates (needed for unit tests)
1103         self.intermediate_output = Signal(output_width)
1104
1105     def eq_from(self, part_pts, outputs, intermediate_output,
1106                       part_ops):
1107         return [self.part_pts.eq(part_pts)] + \
1108                [self.intermediate_output.eq(intermediate_output)] + \
1109                [self.outputs[i].eq(outputs[i])
1110                                      for i in range(4)] + \
1111                [self.part_ops[i].eq(part_ops[i])
1112                                      for i in range(len(self.part_ops))]
1113
1114     def eq(self, rhs):
1115         return self.eq_from(rhs.part_pts, rhs.outputs,
1116                             rhs.intermediate_output, rhs.part_ops)
1117
1118
1119 class InputData:
1120
1121     def __init__(self):
1122         self.a = Signal(64)
1123         self.b = Signal(64)
1124         self.part_pts = PartitionPoints()
1125         for i in range(8, 64, 8):
1126             self.part_pts[i] = Signal(name=f"part_pts_{i}")
1127         self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1128
1129     def eq_from(self, part_pts, inputs, part_ops):
1130         return [self.part_pts.eq(part_pts)] + \
1131                [self.a.eq(a), self.b.eq(b)] + \
1132                [self.part_ops[i].eq(part_ops[i])
1133                                      for i in range(len(self.part_ops))]
1134
1135     def eq(self, rhs):
1136         return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1137
1138
1139 class AllTerms(Elaboratable):
1140     """Set of terms to be added together
1141     """
1142
1143     def __init__(self, n_inputs, output_width, n_parts, register_levels):
1144         """Create an ``AddReduce``.
1145
1146         :param inputs: input ``Signal``s to be summed.
1147         :param output_width: bit-width of ``output``.
1148         :param register_levels: List of nesting levels that should have
1149             pipeline registers.
1150         :param partition_points: the input partition points.
1151         """
1152         self.i = InputData()
1153         self.register_levels = register_levels
1154         self.n_inputs = n_inputs
1155         self.n_parts = n_parts
1156         self.output_width = output_width
1157         self.o = AddReduceData(self.i.part_pts, n_inputs,
1158                                output_width, n_parts)
1159
1160     def elaborate(self, platform):
1161         m = Module()
1162
1163         eps = self.i.part_pts
1164
1165         # collect part-bytes
1166         pbs = Signal(8, reset_less=True)
1167         tl = []
1168         for i in range(8):
1169             pb = Signal(name="pb%d" % i, reset_less=True)
1170             m.d.comb += pb.eq(eps.part_byte(i))
1171             tl.append(pb)
1172         m.d.comb += pbs.eq(Cat(*tl))
1173
1174         # local variables
1175         signs = []
1176         for i in range(8):
1177             s = Signs()
1178             signs.append(s)
1179             setattr(m.submodules, "signs%d" % i, s)
1180             m.d.comb += s.part_ops.eq(self.i.part_ops[i])
1181
1182         n_levels = len(self.register_levels)+1
1183         m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1184         m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1185         m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1186         m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1187         nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1188         for mod in [part_8, part_16, part_32, part_64]:
1189             m.d.comb += mod.a.eq(self.i.a)
1190             m.d.comb += mod.b.eq(self.i.b)
1191             for i in range(len(signs)):
1192                 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1193                 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1194             m.d.comb += mod.pbs.eq(pbs)
1195             nat_l.append(mod.not_a_term)
1196             nbt_l.append(mod.not_b_term)
1197             nla_l.append(mod.neg_lsb_a_term)
1198             nlb_l.append(mod.neg_lsb_b_term)
1199
1200         terms = []
1201
1202         for a_index in range(8):
1203             t = ProductTerms(8, 128, 8, a_index, 8)
1204             setattr(m.submodules, "terms_%d" % a_index, t)
1205
1206             m.d.comb += t.a.eq(self.i.a)
1207             m.d.comb += t.b.eq(self.i.b)
1208             m.d.comb += t.pb_en.eq(pbs)
1209
1210             for term in t.terms:
1211                 terms.append(term)
1212
1213         # it's fine to bitwise-or data together since they are never enabled
1214         # at the same time
1215         m.submodules.nat_or = nat_or = OrMod(128)
1216         m.submodules.nbt_or = nbt_or = OrMod(128)
1217         m.submodules.nla_or = nla_or = OrMod(128)
1218         m.submodules.nlb_or = nlb_or = OrMod(128)
1219         for l, mod in [(nat_l, nat_or),
1220                              (nbt_l, nbt_or),
1221                              (nla_l, nla_or),
1222                              (nlb_l, nlb_or)]:
1223             for i in range(len(l)):
1224                 m.d.comb += mod.orin[i].eq(l[i])
1225             terms.append(mod.orout)
1226
1227         # copy the intermediate terms to the output
1228         for i, value in enumerate(terms):
1229             m.d.comb += self.o.terms[i].eq(value)
1230
1231         # copy reg part points and part ops to output
1232         m.d.comb += self.o.part_pts.eq(eps)
1233         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
1234                                      for i in range(len(self.i.part_ops))]
1235
1236         return m
1237
1238
1239 class Intermediates(Elaboratable):
1240     """ Intermediate output modules
1241     """
1242
1243     def __init__(self, output_width, n_parts, partition_points):
1244         self.i = FinalReduceData(partition_points, output_width, n_parts)
1245         self.o = IntermediateData(partition_points, output_width, n_parts)
1246
1247     def elaborate(self, platform):
1248         m = Module()
1249
1250         out_part_ops = self.i.part_ops
1251         out_part_pts = self.i.part_pts
1252
1253         # create _output_64
1254         m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1255         m.d.comb += io64.intermed.eq(self.i.output)
1256         for i in range(8):
1257             m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1258         m.d.comb += self.o.outputs[3].eq(io64.output)
1259
1260         # create _output_32
1261         m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1262         m.d.comb += io32.intermed.eq(self.i.output)
1263         for i in range(8):
1264             m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1265         m.d.comb += self.o.outputs[2].eq(io32.output)
1266
1267         # create _output_16
1268         m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1269         m.d.comb += io16.intermed.eq(self.i.output)
1270         for i in range(8):
1271             m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1272         m.d.comb += self.o.outputs[1].eq(io16.output)
1273
1274         # create _output_8
1275         m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1276         m.d.comb += io8.intermed.eq(self.i.output)
1277         for i in range(8):
1278             m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1279         m.d.comb += self.o.outputs[0].eq(io8.output)
1280
1281         for i in range(8):
1282             m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
1283         m.d.comb += self.o.part_pts.eq(out_part_pts)
1284         m.d.comb += self.o.intermediate_output.eq(self.i.output)
1285
1286         return m
1287
1288
1289 class Mul8_16_32_64(Elaboratable):
1290     """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1291
1292     Supports partitioning into any combination of 8, 16, 32, and 64-bit
1293     partitions on naturally-aligned boundaries. Supports the operation being
1294     set for each partition independently.
1295
1296     :attribute part_pts: the input partition points. Has a partition point at
1297         multiples of 8 in 0 < i < 64. Each partition point's associated
1298         ``Value`` is a ``Signal``. Modification not supported, except for by
1299         ``Signal.eq``.
1300     :attribute part_ops: the operation for each byte. The operation for a
1301         particular partition is selected by assigning the selected operation
1302         code to each byte in the partition. The allowed operation codes are:
1303
1304         :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1305             RISC-V's `mul` instruction.
1306         :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1307             ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1308             instruction.
1309         :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1310             where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1311             `mulhsu` instruction.
1312         :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1313             ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1314             instruction.
1315     """
1316
1317     def __init__(self, register_levels=()):
1318         """ register_levels: specifies the points in the cascade at which
1319             flip-flops are to be inserted.
1320         """
1321
1322         # parameter(s)
1323         self.register_levels = list(register_levels)
1324
1325         # inputs
1326         self.i = InputData()
1327         self.part_pts = self.i.part_pts
1328         self.part_ops = self.i.part_ops
1329         self.a = self.i.a
1330         self.b = self.i.b
1331
1332         # intermediates (needed for unit tests)
1333         self.intermediate_output = Signal(128)
1334
1335         # output
1336         self.output = Signal(64)
1337
1338     def elaborate(self, platform):
1339         m = Module()
1340
1341         part_pts = self.part_pts
1342
1343         n_inputs = 64 + 4
1344         n_parts = 8
1345         t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
1346         m.submodules.allterms = t
1347         m.d.comb += t.i.eq(self.i)
1348
1349         terms = t.o.terms
1350
1351         add_reduce = AddReduce(terms,
1352                                128,
1353                                self.register_levels,
1354                                t.o.part_pts,
1355                                t.o.part_ops)
1356
1357         out_part_ops = add_reduce.o.part_ops
1358         out_part_pts = add_reduce.o.part_pts
1359
1360         m.submodules.add_reduce = add_reduce
1361
1362         interm = Intermediates(128, 8, part_pts)
1363         m.submodules.intermediates = interm
1364         m.d.comb += interm.i.eq(add_reduce.o)
1365
1366         # final output
1367         m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
1368         m.d.comb += finalout.i.eq(interm.o)
1369         m.d.comb += self.output.eq(finalout.out)
1370         m.d.comb += self.intermediate_output.eq(finalout.intermediate_output)
1371
1372         return m
1373
1374
1375 if __name__ == "__main__":
1376     m = Mul8_16_32_64()
1377     main(m, ports=[m.a,
1378                    m.b,
1379                    m.intermediate_output,
1380                    m.output,
1381                    *m.part_ops,
1382                    *m.part_pts.values()])