switch to exact version of cython

[ieee754fpu.git] / src / ieee754 / part_mul_add / multiply.py
diff --git a/src/ieee754/part_mul_add/multiply.py b/src/ieee754/part_mul_add/multiply.py

index 4c6b570ce4474008c4fc590110afb1719cb818cb..b132de56f0ecc90316b13fd1ee1f8da5b7ea4075 100644 (file)
--- a/src/ieee754/part_mul_add/multiply.py
+++ b/src/ieee754/part_mul_add/multiply.py
@@ -8,317 +8,32 @@ from abc import ABCMeta, abstractmethod
  from nmigen.cli import main
  from functools import reduce
  from operator import or_
  from nmigen.cli import main
  from functools import reduce
  from operator import or_
+from ieee754.pipeline import PipelineSpec
+from nmutil.pipemodbase import PipeModBase
  
  
-
-class PartitionPoints(dict):
-    """Partition points and corresponding ``Value``s.
-
-    The points at where an ALU is partitioned along with ``Value``s that
-    specify if the corresponding partition points are enabled.
-
-    For example: ``{1: True, 5: True, 10: True}`` with
-    ``width == 16`` specifies that the ALU is split into 4 sections:
-    * bits 0 <= ``i`` < 1
-    * bits 1 <= ``i`` < 5
-    * bits 5 <= ``i`` < 10
-    * bits 10 <= ``i`` < 16
-
-    If the partition_points were instead ``{1: True, 5: a, 10: True}``
-    where ``a`` is a 1-bit ``Signal``:
-    * If ``a`` is asserted:
-        * bits 0 <= ``i`` < 1
-        * bits 1 <= ``i`` < 5
-        * bits 5 <= ``i`` < 10
-        * bits 10 <= ``i`` < 16
-    * Otherwise
-        * bits 0 <= ``i`` < 1
-        * bits 1 <= ``i`` < 10
-        * bits 10 <= ``i`` < 16
-    """
-
-    def __init__(self, partition_points=None):
-        """Create a new ``PartitionPoints``.
-
-        :param partition_points: the input partition points to values mapping.
-        """
-        super().__init__()
-        if partition_points is not None:
-            for point, enabled in partition_points.items():
-                if not isinstance(point, int):
-                    raise TypeError("point must be a non-negative integer")
-                if point < 0:
-                    raise ValueError("point must be a non-negative integer")
-                self[point] = Value.wrap(enabled)
-
-    def like(self, name=None, src_loc_at=0, mul=1):
-        """Create a new ``PartitionPoints`` with ``Signal``s for all values.
-
-        :param name: the base name for the new ``Signal``s.
-        :param mul: a multiplication factor on the indices
-        """
-        if name is None:
-            name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
-        retval = PartitionPoints()
-        for point, enabled in self.items():
-            point *= mul
-            retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
-        return retval
-
-    def eq(self, rhs):
-        """Assign ``PartitionPoints`` using ``Signal.eq``."""
-        if set(self.keys()) != set(rhs.keys()):
-            raise ValueError("incompatible point set")
-        for point, enabled in self.items():
-            yield enabled.eq(rhs[point])
-
-    def as_mask(self, width, mul=1):
-        """Create a bit-mask from `self`.
-
-        Each bit in the returned mask is clear only if the partition point at
-        the same bit-index is enabled.
-
-        :param width: the bit width of the resulting mask
-        :param mul: a "multiplier" which in-place expands the partition points
-                    typically set to "2" when used for multipliers
-        """
-        bits = []
-        for i in range(width):
-            i /= mul
-            if i.is_integer() and int(i) in self:
-                bits.append(~self[i])
-            else:
-                bits.append(True)
-        return Cat(*bits)
-
-    def get_max_partition_count(self, width):
-        """Get the maximum number of partitions.
-
-        Gets the number of partitions when all partition points are enabled.
-        """
-        retval = 1
-        for point in self.keys():
-            if point < width:
-                retval += 1
-        return retval
-
-    def fits_in_width(self, width):
-        """Check if all partition points are smaller than `width`."""
-        for point in self.keys():
-            if point >= width:
-                return False
-        return True
-
-    def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
-        if index == -1 or index == 7:
-            return C(True, 1)
-        assert index >= 0 and index < 8
-        return self[(index * 8 + 8)*mfactor]
-
-
-class FullAdder(Elaboratable):
-    """Full Adder.
-
-    :attribute in0: the first input
-    :attribute in1: the second input
-    :attribute in2: the third input
-    :attribute sum: the sum output
-    :attribute carry: the carry output
-
-    Rather than do individual full adders (and have an array of them,
-    which would be very slow to simulate), this module can specify the
-    bit width of the inputs and outputs: in effect it performs multiple
-    Full 3-2 Add operations "in parallel".
-    """
-
-    def __init__(self, width):
-        """Create a ``FullAdder``.
-
-        :param width: the bit width of the input and output
-        """
-        self.in0 = Signal(width, reset_less=True)
-        self.in1 = Signal(width, reset_less=True)
-        self.in2 = Signal(width, reset_less=True)
-        self.sum = Signal(width, reset_less=True)
-        self.carry = Signal(width, reset_less=True)
-
-    def elaborate(self, platform):
-        """Elaborate this module."""
-        m = Module()
-        m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
-        m.d.comb += self.carry.eq((self.in0 & self.in1)
-                                  | (self.in1 & self.in2)
-                                  | (self.in2 & self.in0))
-        return m
-
-
-class MaskedFullAdder(Elaboratable):
-    """Masked Full Adder.
-
-    :attribute mask: the carry partition mask
-    :attribute in0: the first input
-    :attribute in1: the second input
-    :attribute in2: the third input
-    :attribute sum: the sum output
-    :attribute mcarry: the masked carry output
-
-    FullAdders are always used with a "mask" on the output.  To keep
-    the graphviz "clean", this class performs the masking here rather
-    than inside a large for-loop.
-
-    See the following discussion as to why this is no longer derived
-    from FullAdder.  Each carry is shifted here *before* being ANDed
-    with the mask, so that an AOI cell may be used (which is more
-    gate-efficient)
-    https://en.wikipedia.org/wiki/AND-OR-Invert
-    https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
-    """
-
-    def __init__(self, width):
-        """Create a ``MaskedFullAdder``.
-
-        :param width: the bit width of the input and output
-        """
-        self.width = width
-        self.mask = Signal(width, reset_less=True)
-        self.mcarry = Signal(width, reset_less=True)
-        self.in0 = Signal(width, reset_less=True)
-        self.in1 = Signal(width, reset_less=True)
-        self.in2 = Signal(width, reset_less=True)
-        self.sum = Signal(width, reset_less=True)
-
-    def elaborate(self, platform):
-        """Elaborate this module."""
-        m = Module()
-        s1 = Signal(self.width, reset_less=True)
-        s2 = Signal(self.width, reset_less=True)
-        s3 = Signal(self.width, reset_less=True)
-        c1 = Signal(self.width, reset_less=True)
-        c2 = Signal(self.width, reset_less=True)
-        c3 = Signal(self.width, reset_less=True)
-        m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
-        m.d.comb += s1.eq(Cat(0, self.in0))
-        m.d.comb += s2.eq(Cat(0, self.in1))
-        m.d.comb += s3.eq(Cat(0, self.in2))
-        m.d.comb += c1.eq(s1 & s2 & self.mask)
-        m.d.comb += c2.eq(s2 & s3 & self.mask)
-        m.d.comb += c3.eq(s3 & s1 & self.mask)
-        m.d.comb += self.mcarry.eq(c1 | c2 | c3)
-        return m
-
-
-class PartitionedAdder(Elaboratable):
-    """Partitioned Adder.
-
-    Performs the final add.  The partition points are included in the
-    actual add (in one of the operands only), which causes a carry over
-    to the next bit.  Then the final output *removes* the extra bits from
-    the result.
-
-    partition: .... P... P... P... P... (32 bits)
-    a        : .... .... .... .... .... (32 bits)
-    b        : .... .... .... .... .... (32 bits)
-    exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
-    exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
-    exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
-    o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
-
-    :attribute width: the bit width of the input and output. Read-only.
-    :attribute a: the first input to the adder
-    :attribute b: the second input to the adder
-    :attribute output: the sum output
-    :attribute partition_points: the input partition points. Modification not
-        supported, except for by ``Signal.eq``.
-    """
-
-    def __init__(self, width, partition_points, partition_step=1):
-        """Create a ``PartitionedAdder``.
-
-        :param width: the bit width of the input and output
-        :param partition_points: the input partition points
-        :param partition_step: a multiplier (typically double) step
-                               which in-place "expands" the partition points
-        """
-        self.width = width
-        self.pmul = partition_step
-        self.a = Signal(width, reset_less=True)
-        self.b = Signal(width, reset_less=True)
-        self.output = Signal(width, reset_less=True)
-        self.partition_points = PartitionPoints(partition_points)
-        if not self.partition_points.fits_in_width(width):
-            raise ValueError("partition_points doesn't fit in width")
-        expanded_width = 0
-        for i in range(self.width):
-            if i in self.partition_points:
-                expanded_width += 1
-            expanded_width += 1
-        self._expanded_width = expanded_width
-
-    def elaborate(self, platform):
-        """Elaborate this module."""
-        m = Module()
-        expanded_a = Signal(self._expanded_width, reset_less=True)
-        expanded_b = Signal(self._expanded_width, reset_less=True)
-        expanded_o = Signal(self._expanded_width, reset_less=True)
-
-        expanded_index = 0
-        # store bits in a list, use Cat later.  graphviz is much cleaner
-        al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
-
-        # partition points are "breaks" (extra zeros or 1s) in what would
-        # otherwise be a massive long add.  when the "break" points are 0,
-        # whatever is in it (in the output) is discarded.  however when
-        # there is a "1", it causes a roll-over carry to the *next* bit.
-        # we still ignore the "break" bit in the [intermediate] output,
-        # however by that time we've got the effect that we wanted: the
-        # carry has been carried *over* the break point.
-
-        for i in range(self.width):
-            pi = i/self.pmul # double the range of the partition point test
-            if pi.is_integer() and pi in self.partition_points:
-                # add extra bit set to 0 + 0 for enabled partition points
-                # and 1 + 0 for disabled partition points
-                ea.append(expanded_a[expanded_index])
-                al.append(~self.partition_points[pi]) # add extra bit in a
-                eb.append(expanded_b[expanded_index])
-                bl.append(C(0)) # yes, add a zero
-                expanded_index += 1 # skip the extra point.  NOT in the output
-            ea.append(expanded_a[expanded_index])
-            eb.append(expanded_b[expanded_index])
-            eo.append(expanded_o[expanded_index])
-            al.append(self.a[i])
-            bl.append(self.b[i])
-            ol.append(self.output[i])
-            expanded_index += 1
-
-        # combine above using Cat
-        m.d.comb += Cat(*ea).eq(Cat(*al))
-        m.d.comb += Cat(*eb).eq(Cat(*bl))
-        m.d.comb += Cat(*ol).eq(Cat(*eo))
-
-        # use only one addition to take advantage of look-ahead carry and
-        # special hardware on FPGAs
-        m.d.comb += expanded_o.eq(expanded_a + expanded_b)
-        return m
+from ieee754.part_mul_add.partpoints import PartitionPoints
+from ieee754.part_mul_add.adder import PartitionedAdder, MaskedFullAdder
  
  
  FULL_ADDER_INPUT_COUNT = 3
  
  
  
  FULL_ADDER_INPUT_COUNT = 3
  
+
  class AddReduceData:
  
      def __init__(self, part_pts, n_inputs, output_width, n_parts):
          self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
  class AddReduceData:
  
      def __init__(self, part_pts, n_inputs, output_width, n_parts):
          self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
-                          for i in range(n_parts)]
-        self.terms = [Signal(output_width, name=f"inputs_{i}",
-                              reset_less=True)
-                        for i in range(n_inputs)]
+                         for i in range(n_parts)]
+        self.terms = [Signal(output_width, name=f"terms_{i}",
+                             reset_less=True)
+                      for i in range(n_inputs)]
          self.part_pts = part_pts.like()
  
      def eq_from(self, part_pts, inputs, part_ops):
          return [self.part_pts.eq(part_pts)] + \
                 [self.terms[i].eq(inputs[i])
          self.part_pts = part_pts.like()
  
      def eq_from(self, part_pts, inputs, part_ops):
          return [self.part_pts.eq(part_pts)] + \
                 [self.terms[i].eq(inputs[i])
-                                     for i in range(len(self.terms))] + \
+                for i in range(len(self.terms))] + \
                 [self.part_ops[i].eq(part_ops[i])
                 [self.part_ops[i].eq(part_ops[i])
-                                     for i in range(len(self.part_ops))]
+                for i in range(len(self.part_ops))]
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
@@ -328,7 +43,7 @@ class FinalReduceData:
  
      def __init__(self, part_pts, output_width, n_parts):
          self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
  
      def __init__(self, part_pts, output_width, n_parts):
          self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
-                          for i in range(n_parts)]
+                         for i in range(n_parts)]
          self.output = Signal(output_width, reset_less=True)
          self.part_pts = part_pts.like()
  
          self.output = Signal(output_width, reset_less=True)
          self.part_pts = part_pts.like()
  
@@ -336,26 +51,28 @@ class FinalReduceData:
          return [self.part_pts.eq(part_pts)] + \
                 [self.output.eq(output)] + \
                 [self.part_ops[i].eq(part_ops[i])
          return [self.part_pts.eq(part_pts)] + \
                 [self.output.eq(output)] + \
                 [self.part_ops[i].eq(part_ops[i])
-                                     for i in range(len(self.part_ops))]
+                for i in range(len(self.part_ops))]
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
  
  
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
  
  
-class FinalAdd(Elaboratable):
+class FinalAdd(PipeModBase):
      """ Final stage of add reduce
      """
  
      """ Final stage of add reduce
      """
  
-    def __init__(self, n_inputs, output_width, n_parts, partition_points):
-        self.output_width = output_width
+    def __init__(self, pspec, lidx, n_inputs, partition_points,
+                 partition_step=1):
+        self.lidx = lidx
+        self.partition_step = partition_step
+        self.output_width = pspec.width * 2
          self.n_inputs = n_inputs
          self.n_inputs = n_inputs
-        self.n_parts = n_parts
+        self.n_parts = pspec.n_parts
          self.partition_points = PartitionPoints(partition_points)
          self.partition_points = PartitionPoints(partition_points)
-        if not self.partition_points.fits_in_width(output_width):
+        if not self.partition_points.fits_in_width(self.output_width):
              raise ValueError("partition_points doesn't fit in output_width")
  
              raise ValueError("partition_points doesn't fit in output_width")
  
-        self.i = self.ispec()
-        self.o = self.ospec()
+        super().__init__(pspec, "finaladd")
  
      def ispec(self):
          return AddReduceData(self.partition_points, self.n_inputs,
  
      def ispec(self):
          return AddReduceData(self.partition_points, self.n_inputs,
@@ -363,7 +80,7 @@ class FinalAdd(Elaboratable):
  
      def ospec(self):
          return FinalReduceData(self.partition_points,
  
      def ospec(self):
          return FinalReduceData(self.partition_points,
-                                 self.output_width, self.n_parts)
+                               self.output_width, self.n_parts)
  
      def elaborate(self, platform):
          """Elaborate this module."""
  
      def elaborate(self, platform):
          """Elaborate this module."""
@@ -381,7 +98,7 @@ class FinalAdd(Elaboratable):
              # base case for adding 2 inputs
              assert self.n_inputs == 2
              adder = PartitionedAdder(output_width,
              # base case for adding 2 inputs
              assert self.n_inputs == 2
              adder = PartitionedAdder(output_width,
-                                     self.i.part_pts, 2)
+                                     self.i.part_pts, self.partition_step)
              m.submodules.final_adder = adder
              m.d.comb += adder.a.eq(self.i.terms[0])
              m.d.comb += adder.b.eq(self.i.terms[1])
              m.submodules.final_adder = adder
              m.d.comb += adder.a.eq(self.i.terms[0])
              m.d.comb += adder.b.eq(self.i.terms[1])
@@ -394,7 +111,7 @@ class FinalAdd(Elaboratable):
          return m
  
  
          return m
  
  
-class AddReduceSingle(Elaboratable):
+class AddReduceSingle(PipeModBase):
      """Add list of numbers together.
  
      :attribute inputs: input ``Signal``s to be summed. Modification not
      """Add list of numbers together.
  
      :attribute inputs: input ``Signal``s to be summed. Modification not
@@ -406,25 +123,27 @@ class AddReduceSingle(Elaboratable):
          supported, except for by ``Signal.eq``.
      """
  
          supported, except for by ``Signal.eq``.
      """
  
-    def __init__(self, n_inputs, output_width, n_parts, partition_points):
+    def __init__(self, pspec, lidx, n_inputs, partition_points,
+                 partition_step=1):
          """Create an ``AddReduce``.
  
          :param inputs: input ``Signal``s to be summed.
          :param output_width: bit-width of ``output``.
          :param partition_points: the input partition points.
          """
          """Create an ``AddReduce``.
  
          :param inputs: input ``Signal``s to be summed.
          :param output_width: bit-width of ``output``.
          :param partition_points: the input partition points.
          """
+        self.lidx = lidx
+        self.partition_step = partition_step
          self.n_inputs = n_inputs
          self.n_inputs = n_inputs
-        self.n_parts = n_parts
-        self.output_width = output_width
+        self.n_parts = pspec.n_parts
+        self.output_width = pspec.width * 2
          self.partition_points = PartitionPoints(partition_points)
          self.partition_points = PartitionPoints(partition_points)
-        if not self.partition_points.fits_in_width(output_width):
+        if not self.partition_points.fits_in_width(self.output_width):
              raise ValueError("partition_points doesn't fit in output_width")
  
          self.groups = AddReduceSingle.full_adder_groups(n_inputs)
          self.n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
  
              raise ValueError("partition_points doesn't fit in output_width")
  
          self.groups = AddReduceSingle.full_adder_groups(n_inputs)
          self.n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
  
-        self.i = self.ispec()
-        self.o = self.ospec()
+        super().__init__(pspec, "addreduce_%d" % lidx)
  
      def ispec(self):
          return AddReduceData(self.partition_points, self.n_inputs,
  
      def ispec(self):
          return AddReduceData(self.partition_points, self.n_inputs,
@@ -510,13 +229,14 @@ class AddReduceSingle(Elaboratable):
          # copy reg part points and part ops to output
          m.d.comb += self.o.part_pts.eq(self.i.part_pts)
          m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
          # copy reg part points and part ops to output
          m.d.comb += self.o.part_pts.eq(self.i.part_pts)
          m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
-                                     for i in range(len(self.i.part_ops))]
+                     for i in range(len(self.i.part_ops))]
  
          # set up the partition mask (for the adders)
          part_mask = Signal(self.output_width, reset_less=True)
  
          # get partition points as a mask
  
          # set up the partition mask (for the adders)
          part_mask = Signal(self.output_width, reset_less=True)
  
          # get partition points as a mask
-        mask = self.i.part_pts.as_mask(self.output_width, mul=2)
+        mask = self.i.part_pts.as_mask(self.output_width,
+                                       mul=self.partition_step)
          m.d.comb += part_mask.eq(mask)
  
          # add and link the intermediate term modules
          m.d.comb += part_mask.eq(mask)
  
          # add and link the intermediate term modules
@@ -532,7 +252,7 @@ class AddReduceSingle(Elaboratable):
  
  
  class AddReduceInternal:
  
  
  class AddReduceInternal:
-    """Recursively Add list of numbers together.
+    """Iteratively Add list of numbers together.
  
      :attribute inputs: input ``Signal``s to be summed. Modification not
          supported, except for by ``Signal.eq``.
  
      :attribute inputs: input ``Signal``s to be summed. Modification not
          supported, except for by ``Signal.eq``.
@@ -543,18 +263,18 @@ class AddReduceInternal:
          supported, except for by ``Signal.eq``.
      """
  
          supported, except for by ``Signal.eq``.
      """
  
-    def __init__(self, i, output_width):
+    def __init__(self, pspec, n_inputs, part_pts, partition_step=1):
          """Create an ``AddReduce``.
  
          :param inputs: input ``Signal``s to be summed.
          :param output_width: bit-width of ``output``.
          :param partition_points: the input partition points.
          """
          """Create an ``AddReduce``.
  
          :param inputs: input ``Signal``s to be summed.
          :param output_width: bit-width of ``output``.
          :param partition_points: the input partition points.
          """
-        self.i = i
-        self.inputs = i.terms
-        self.part_ops = i.part_ops
-        self.output_width = output_width
-        self.partition_points = i.part_pts
+        self.pspec = pspec
+        self.n_inputs = n_inputs
+        self.output_width = pspec.width * 2
+        self.partition_points = part_pts
+        self.partition_step = partition_step
  
          self.create_levels()
  
  
          self.create_levels()
  
@@ -563,24 +283,22 @@ class AddReduceInternal:
  
          mods = []
          partition_points = self.partition_points
  
          mods = []
          partition_points = self.partition_points
-        part_ops = self.part_ops
-        n_parts = len(part_ops)
-        inputs = self.inputs
-        ilen = len(inputs)
+        ilen = self.n_inputs
          while True:
          while True:
-            groups = AddReduceSingle.full_adder_groups(len(inputs))
+            groups = AddReduceSingle.full_adder_groups(ilen)
              if len(groups) == 0:
                  break
              if len(groups) == 0:
                  break
-            next_level = AddReduceSingle(ilen, self.output_width, n_parts,
-                                         partition_points)
+            lidx = len(mods)
+            next_level = AddReduceSingle(self.pspec, lidx, ilen,
+                                         partition_points,
+                                         self.partition_step)
              mods.append(next_level)
              partition_points = next_level.i.part_pts
              mods.append(next_level)
              partition_points = next_level.i.part_pts
-            inputs = next_level.o.terms
-            ilen = len(inputs)
-            part_ops = next_level.i.part_ops
+            ilen = len(next_level.o.terms)
  
  
-        next_level = FinalAdd(ilen, self.output_width, n_parts,
-                              partition_points)
+        lidx = len(mods)
+        next_level = FinalAdd(self.pspec, lidx, ilen,
+                              partition_points, self.partition_step)
          mods.append(next_level)
  
          self.levels = mods
          mods.append(next_level)
  
          self.levels = mods
@@ -599,7 +317,7 @@ class AddReduce(AddReduceInternal, Elaboratable):
      """
  
      def __init__(self, inputs, output_width, register_levels, part_pts,
      """
  
      def __init__(self, inputs, output_width, register_levels, part_pts,
-                       part_ops):
+                 part_ops, partition_step=1):
          """Create an ``AddReduce``.
  
          :param inputs: input ``Signal``s to be summed.
          """Create an ``AddReduce``.
  
          :param inputs: input ``Signal``s to be summed.
@@ -613,8 +331,9 @@ class AddReduce(AddReduceInternal, Elaboratable):
          self._part_ops = part_ops
          n_parts = len(part_ops)
          self.i = AddReduceData(part_pts, len(inputs),
          self._part_ops = part_ops
          n_parts = len(part_ops)
          self.i = AddReduceData(part_pts, len(inputs),
-                             output_width, n_parts)
-        AddReduceInternal.__init__(self, self.i, output_width)
+                               output_width, n_parts)
+        AddReduceInternal.__init__(self, pspec, n_inputs, part_pts,
+                                   partition_step)
          self.o = FinalReduceData(part_pts, output_width, n_parts)
          self.register_levels = register_levels
  
          self.o = FinalReduceData(part_pts, output_width, n_parts)
          self.register_levels = register_levels
  
@@ -633,7 +352,8 @@ class AddReduce(AddReduceInternal, Elaboratable):
          """Elaborate this module."""
          m = Module()
  
          """Elaborate this module."""
          m = Module()
  
-        m.d.comb += self.i.eq_from(self._part_pts, self._inputs, self._part_ops)
+        m.d.comb += self.i.eq_from(self._part_pts,
+                                   self._inputs, self._part_ops)
  
          for i, next_level in enumerate(self.levels):
              setattr(m.submodules, "next_level%d" % i, next_level)
  
          for i, next_level in enumerate(self.levels):
              setattr(m.submodules, "next_level%d" % i, next_level)
@@ -645,7 +365,7 @@ class AddReduce(AddReduceInternal, Elaboratable):
                  m.d.sync += mcur.i.eq(i)
              else:
                  m.d.comb += mcur.i.eq(i)
                  m.d.sync += mcur.i.eq(i)
              else:
                  m.d.comb += mcur.i.eq(i)
-            i = mcur.o # for next loop
+            i = mcur.o  # for next loop
  
          # output comes from last module
          m.d.comb += self.o.eq(i)
  
          # output comes from last module
          m.d.comb += self.o.eq(i)
@@ -702,7 +422,7 @@ class ProductTerm(Elaboratable):
          else:
              term_enabled = None
          self.enabled = term_enabled
          else:
              term_enabled = None
          self.enabled = term_enabled
-        self.term.name = "term_%d_%d" % (a_index, b_index) # rename
+        self.term.name = "term_%d_%d" % (a_index, b_index)  # rename
  
      def elaborate(self, platform):
  
  
      def elaborate(self, platform):
  
@@ -745,6 +465,7 @@ class ProductTerms(Elaboratable):
          this class is to be wrapped with a for-loop on the "a" operand.
          it creates a second-level for-loop on the "b" operand.
      """
          this class is to be wrapped with a for-loop on the "a" operand.
          it creates a second-level for-loop on the "b" operand.
      """
+
      def __init__(self, width, twidth, pbwid, a_index, blen):
          self.a_index = a_index
          self.blen = blen
      def __init__(self, width, twidth, pbwid, a_index, blen):
          self.a_index = a_index
          self.blen = blen
@@ -754,8 +475,8 @@ class ProductTerms(Elaboratable):
          self.a = Signal(twidth//2, reset_less=True)
          self.b = Signal(twidth//2, reset_less=True)
          self.pb_en = Signal(pbwid, reset_less=True)
          self.a = Signal(twidth//2, reset_less=True)
          self.b = Signal(twidth//2, reset_less=True)
          self.pb_en = Signal(pbwid, reset_less=True)
-        self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
-                            for i in range(blen)]
+        self.terms = [Signal(twidth, name="term%d" % i, reset_less=True)
+                      for i in range(blen)]
  
      def elaborate(self, platform):
  
  
      def elaborate(self, platform):
  
@@ -790,7 +511,7 @@ class LSBNegTerm(Elaboratable):
          m = Module()
          comb = m.d.comb
          bit_wid = self.bit_width
          m = Module()
          comb = m.d.comb
          bit_wid = self.bit_width
-        ext = Repl(0, bit_wid) # extend output to HI part
+        ext = Repl(0, bit_wid)  # extend output to HI part
  
          # determine sign of each incoming number *in this partition*
          enabled = Signal(reset_less=True)
  
          # determine sign of each incoming number *in this partition*
          enabled = Signal(reset_less=True)
@@ -864,7 +585,8 @@ class Part(Elaboratable):
          the extra terms - as separate terms - are then thrown at the
          AddReduce alongside the multiplication part-results.
      """
          the extra terms - as separate terms - are then thrown at the
          AddReduce alongside the multiplication part-results.
      """
-    def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
+
+    def __init__(self, part_pts, width, n_parts, pbwid):
  
          self.pbwid = pbwid
          self.part_pts = part_pts
  
          self.pbwid = pbwid
          self.part_pts = part_pts
@@ -873,14 +595,14 @@ class Part(Elaboratable):
          self.a = Signal(64, reset_less=True)
          self.b = Signal(64, reset_less=True)
          self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
          self.a = Signal(64, reset_less=True)
          self.b = Signal(64, reset_less=True)
          self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
-                            for i in range(8)]
+                         for i in range(8)]
          self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
          self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
-                            for i in range(8)]
+                         for i in range(8)]
          self.pbs = Signal(pbwid, reset_less=True)
  
          # outputs
          self.parts = [Signal(name=f"part_{i}", reset_less=True)
          self.pbs = Signal(pbwid, reset_less=True)
  
          # outputs
          self.parts = [Signal(name=f"part_{i}", reset_less=True)
-                            for i in range(n_parts)]
+                      for i in range(n_parts)]
  
          self.not_a_term = Signal(width, reset_less=True)
          self.neg_lsb_a_term = Signal(width, reset_less=True)
  
          self.not_a_term = Signal(width, reset_less=True)
          self.neg_lsb_a_term = Signal(width, reset_less=True)
@@ -899,10 +621,10 @@ class Part(Elaboratable):
          byte_count = 8 // len(parts)
  
          not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
          byte_count = 8 // len(parts)
  
          not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
-                self.not_a_term, self.neg_lsb_a_term,
-                self.not_b_term, self.neg_lsb_b_term)
+            self.not_a_term, self.neg_lsb_a_term,
+            self.not_b_term, self.neg_lsb_b_term)
  
  
-        byte_width = 8 // len(parts) # byte width
+        byte_width = 8 // len(parts)  # byte width
          bit_wid = 8 * byte_width     # bit width
          nat, nbt, nla, nlb = [], [], [], []
          for i in range(len(parts)):
          bit_wid = 8 * byte_width     # bit width
          nat, nbt, nla, nlb = [], [], [], []
          for i in range(len(parts)):
@@ -911,8 +633,8 @@ class Part(Elaboratable):
              setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
              m.d.comb += pa.part.eq(parts[i])
              m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
              setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
              m.d.comb += pa.part.eq(parts[i])
              m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
-            m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
-            m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
+            m.d.comb += pa.signed.eq(self.b_signed[i * byte_width])  # yes b
+            m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1])  # really, b
              nat.append(pa.nt)
              nla.append(pa.nl)
  
              nat.append(pa.nt)
              nla.append(pa.nl)
  
@@ -921,8 +643,8 @@ class Part(Elaboratable):
              setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
              m.d.comb += pb.part.eq(parts[i])
              m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
              setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
              m.d.comb += pb.part.eq(parts[i])
              m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
-            m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
-            m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
+            m.d.comb += pb.signed.eq(self.a_signed[i * byte_width])  # yes a
+            m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1])  # really, a
              nbt.append(pb.nt)
              nlb.append(pb.nl)
  
              nbt.append(pb.nt)
              nlb.append(pb.nl)
  
@@ -931,7 +653,7 @@ class Part(Elaboratable):
                       not_b_term.eq(Cat(*nbt)),
                       neg_lsb_a_term.eq(Cat(*nla)),
                       neg_lsb_b_term.eq(Cat(*nlb)),
                       not_b_term.eq(Cat(*nbt)),
                       neg_lsb_a_term.eq(Cat(*nla)),
                       neg_lsb_b_term.eq(Cat(*nlb)),
-                    ]
+                     ]
  
          return m
  
  
          return m
  
@@ -940,11 +662,12 @@ class IntermediateOut(Elaboratable):
      """ selects the HI/LO part of the multiplication, for a given bit-width
          the output is also reconstructed in its SIMD (partition) lanes.
      """
      """ selects the HI/LO part of the multiplication, for a given bit-width
          the output is also reconstructed in its SIMD (partition) lanes.
      """
+
      def __init__(self, width, out_wid, n_parts):
          self.width = width
          self.n_parts = n_parts
          self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
      def __init__(self, width, out_wid, n_parts):
          self.width = width
          self.n_parts = n_parts
          self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
-                                     for i in range(8)]
+                         for i in range(8)]
          self.intermed = Signal(out_wid, reset_less=True)
          self.output = Signal(out_wid//2, reset_less=True)
  
          self.intermed = Signal(out_wid, reset_less=True)
          self.output = Signal(out_wid//2, reset_less=True)
  
@@ -966,21 +689,22 @@ class IntermediateOut(Elaboratable):
          return m
  
  
          return m
  
  
-class FinalOut(Elaboratable):
+class FinalOut(PipeModBase):
      """ selects the final output based on the partitioning.
  
          each byte is selectable independently, i.e. it is possible
          that some partitions requested 8-bit computation whilst others
          requested 16 or 32 bit.
      """
      """ selects the final output based on the partitioning.
  
          each byte is selectable independently, i.e. it is possible
          that some partitions requested 8-bit computation whilst others
          requested 16 or 32 bit.
      """
-    def __init__(self, output_width, n_parts, part_pts):
+
+    def __init__(self, pspec, part_pts):
+
          self.part_pts = part_pts
          self.part_pts = part_pts
-        self.output_width = output_width
-        self.n_parts = n_parts
-        self.out_wid = output_width//2
+        self.output_width = pspec.width * 2
+        self.n_parts = pspec.n_parts
+        self.out_wid = pspec.width
  
  
-        self.i = self.ispec()
-        self.o = self.ospec()
+        super().__init__(pspec, "finalout")
  
      def ispec(self):
          return IntermediateData(self.part_pts, self.output_width, self.n_parts)
  
      def ispec(self):
          return IntermediateData(self.part_pts, self.output_width, self.n_parts)
@@ -1036,9 +760,9 @@ class FinalOut(Elaboratable):
              m.d.comb += op.eq(
                  Mux(d8[i] | d16[i // 2],
                      Mux(d8[i], i8.bit_select(i * 8, 8),
              m.d.comb += op.eq(
                  Mux(d8[i] | d16[i // 2],
                      Mux(d8[i], i8.bit_select(i * 8, 8),
-                               i16.bit_select(i * 8, 8)),
+                        i16.bit_select(i * 8, 8)),
                      Mux(d32[i // 4], i32.bit_select(i * 8, 8),
                      Mux(d32[i // 4], i32.bit_select(i * 8, 8),
-                                      i64.bit_select(i * 8, 8))))
+                        i64.bit_select(i * 8, 8))))
              ol.append(op)
  
          # create outputs
              ol.append(op)
  
          # create outputs
@@ -1051,6 +775,7 @@ class FinalOut(Elaboratable):
  class OrMod(Elaboratable):
      """ ORs four values together in a hierarchical tree
      """
  class OrMod(Elaboratable):
      """ ORs four values together in a hierarchical tree
      """
+
      def __init__(self, wid):
          self.wid = wid
          self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
      def __init__(self, wid):
          self.wid = wid
          self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
@@ -1084,7 +809,7 @@ class Signs(Elaboratable):
  
          asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
          bsig = (self.part_ops == OP_MUL_LOW) \
  
          asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
          bsig = (self.part_ops == OP_MUL_LOW) \
-                    | (self.part_ops == OP_MUL_SIGNED_HIGH)
+            | (self.part_ops == OP_MUL_SIGNED_HIGH)
          m.d.comb += self.a_signed.eq(asig)
          m.d.comb += self.b_signed.eq(bsig)
  
          m.d.comb += self.a_signed.eq(asig)
          m.d.comb += self.b_signed.eq(bsig)
  
@@ -1095,21 +820,21 @@ class IntermediateData:
  
      def __init__(self, part_pts, output_width, n_parts):
          self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
  
      def __init__(self, part_pts, output_width, n_parts):
          self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
-                          for i in range(n_parts)]
+                         for i in range(n_parts)]
          self.part_pts = part_pts.like()
          self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
          self.part_pts = part_pts.like()
          self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
-                          for i in range(4)]
+                        for i in range(4)]
          # intermediates (needed for unit tests)
          self.intermediate_output = Signal(output_width)
  
      def eq_from(self, part_pts, outputs, intermediate_output,
          # intermediates (needed for unit tests)
          self.intermediate_output = Signal(output_width)
  
      def eq_from(self, part_pts, outputs, intermediate_output,
-                      part_ops):
+                part_ops):
          return [self.part_pts.eq(part_pts)] + \
                 [self.intermediate_output.eq(intermediate_output)] + \
                 [self.outputs[i].eq(outputs[i])
          return [self.part_pts.eq(part_pts)] + \
                 [self.intermediate_output.eq(intermediate_output)] + \
                 [self.outputs[i].eq(outputs[i])
-                                     for i in range(4)] + \
+                for i in range(4)] + \
                 [self.part_ops[i].eq(part_ops[i])
                 [self.part_ops[i].eq(part_ops[i])
-                                     for i in range(len(self.part_ops))]
+                for i in range(len(self.part_ops))]
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.outputs,
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.outputs,
@@ -1130,7 +855,7 @@ class InputData:
          return [self.part_pts.eq(part_pts)] + \
                 [self.a.eq(a), self.b.eq(b)] + \
                 [self.part_ops[i].eq(part_ops[i])
          return [self.part_pts.eq(part_pts)] + \
                 [self.a.eq(a), self.b.eq(b)] + \
                 [self.part_ops[i].eq(part_ops[i])
-                                     for i in range(len(self.part_ops))]
+                for i in range(len(self.part_ops))]
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
  
      def eq(self, rhs):
          return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
@@ -1139,7 +864,7 @@ class InputData:
  class OutputData:
  
      def __init__(self):
  class OutputData:
  
      def __init__(self):
-        self.intermediate_output = Signal(128) # needed for unit tests
+        self.intermediate_output = Signal(128)  # needed for unit tests
          self.output = Signal(64)
  
      def eq(self, rhs):
          self.output = Signal(64)
  
      def eq(self, rhs):
@@ -1147,26 +872,17 @@ class OutputData:
                  self.output.eq(rhs.output)]
  
  
                  self.output.eq(rhs.output)]
  
  
-class AllTerms(Elaboratable):
+class AllTerms(PipeModBase):
      """Set of terms to be added together
      """
  
      """Set of terms to be added together
      """
  
-    def __init__(self, n_inputs, output_width, n_parts, register_levels):
-        """Create an ``AddReduce``.
-
-        :param inputs: input ``Signal``s to be summed.
-        :param output_width: bit-width of ``output``.
-        :param register_levels: List of nesting levels that should have
-            pipeline registers.
-        :param partition_points: the input partition points.
+    def __init__(self, pspec, n_inputs):
+        """Create an ``AllTerms``.
          """
          """
-        self.register_levels = register_levels
          self.n_inputs = n_inputs
          self.n_inputs = n_inputs
-        self.n_parts = n_parts
-        self.output_width = output_width
-
-        self.i = self.ispec()
-        self.o = self.ospec()
+        self.n_parts = pspec.n_parts
+        self.output_width = pspec.width * 2
+        super().__init__(pspec, "allterms")
  
      def ispec(self):
          return InputData()
  
      def ispec(self):
          return InputData()
@@ -1197,11 +913,10 @@ class AllTerms(Elaboratable):
              setattr(m.submodules, "signs%d" % i, s)
              m.d.comb += s.part_ops.eq(self.i.part_ops[i])
  
              setattr(m.submodules, "signs%d" % i, s)
              m.d.comb += s.part_ops.eq(self.i.part_ops[i])
  
-        n_levels = len(self.register_levels)+1
-        m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
-        m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
-        m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
-        m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
+        m.submodules.part_8 = part_8 = Part(eps, 128, 8, 8)
+        m.submodules.part_16 = part_16 = Part(eps, 128, 4, 8)
+        m.submodules.part_32 = part_32 = Part(eps, 128, 2, 8)
+        m.submodules.part_64 = part_64 = Part(eps, 128, 1, 8)
          nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
          for mod in [part_8, part_16, part_32, part_64]:
              m.d.comb += mod.a.eq(self.i.a)
          nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
          for mod in [part_8, part_16, part_32, part_64]:
              m.d.comb += mod.a.eq(self.i.a)
@@ -1235,9 +950,9 @@ class AllTerms(Elaboratable):
          m.submodules.nla_or = nla_or = OrMod(128)
          m.submodules.nlb_or = nlb_or = OrMod(128)
          for l, mod in [(nat_l, nat_or),
          m.submodules.nla_or = nla_or = OrMod(128)
          m.submodules.nlb_or = nlb_or = OrMod(128)
          for l, mod in [(nat_l, nat_or),
-                             (nbt_l, nbt_or),
-                             (nla_l, nla_or),
-                             (nlb_l, nlb_or)]:
+                       (nbt_l, nbt_or),
+                       (nla_l, nla_or),
+                       (nlb_l, nlb_or)]:
              for i in range(len(l)):
                  m.d.comb += mod.orin[i].eq(l[i])
              terms.append(mod.orout)
              for i in range(len(l)):
                  m.d.comb += mod.orin[i].eq(l[i])
              terms.append(mod.orout)
@@ -1249,22 +964,21 @@ class AllTerms(Elaboratable):
          # copy reg part points and part ops to output
          m.d.comb += self.o.part_pts.eq(eps)
          m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
          # copy reg part points and part ops to output
          m.d.comb += self.o.part_pts.eq(eps)
          m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
-                                     for i in range(len(self.i.part_ops))]
+                     for i in range(len(self.i.part_ops))]
  
          return m
  
  
  
          return m
  
  
-class Intermediates(Elaboratable):
+class Intermediates(PipeModBase):
      """ Intermediate output modules
      """
  
      """ Intermediate output modules
      """
  
-    def __init__(self, output_width, n_parts, part_pts):
+    def __init__(self, pspec, part_pts):
          self.part_pts = part_pts
          self.part_pts = part_pts
-        self.output_width = output_width
-        self.n_parts = n_parts
+        self.output_width = pspec.width * 2
+        self.n_parts = pspec.n_parts
  
  
-        self.i = self.ispec()
-        self.o = self.ospec()
+        super().__init__(pspec, "intermediates")
  
      def ispec(self):
          return FinalReduceData(self.part_pts, self.output_width, self.n_parts)
  
      def ispec(self):
          return FinalReduceData(self.part_pts, self.output_width, self.n_parts)
@@ -1317,6 +1031,8 @@ class Intermediates(Elaboratable):
  class Mul8_16_32_64(Elaboratable):
      """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
  
  class Mul8_16_32_64(Elaboratable):
      """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
  
+    XXX NOTE: this class is intended for unit test purposes ONLY.
+
      Supports partitioning into any combination of 8, 16, 32, and 64-bit
      partitions on naturally-aligned boundaries. Supports the operation being
      set for each partition independently.
      Supports partitioning into any combination of 8, 16, 32, and 64-bit
      partitions on naturally-aligned boundaries. Supports the operation being
      set for each partition independently.
@@ -1347,6 +1063,11 @@ class Mul8_16_32_64(Elaboratable):
              flip-flops are to be inserted.
          """
  
              flip-flops are to be inserted.
          """
  
+        self.id_wid = 0  # num_bits(num_rows)
+        self.op_wid = 0
+        self.pspec = PipelineSpec(64, self.id_wid, self.op_wid, n_ops=3)
+        self.pspec.n_parts = 8
+
          # parameter(s)
          self.register_levels = list(register_levels)
  
          # parameter(s)
          self.register_levels = list(register_levels)
  
@@ -1375,29 +1096,33 @@ class Mul8_16_32_64(Elaboratable):
          part_pts = self.part_pts
  
          n_inputs = 64 + 4
          part_pts = self.part_pts
  
          n_inputs = 64 + 4
-        n_parts = 8
-        t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
-        m.submodules.allterms = t
-        m.d.comb += t.i.eq(self.i)
+        t = AllTerms(self.pspec, n_inputs)
+        t.setup(m, self.i)
  
          terms = t.o.terms
  
  
          terms = t.o.terms
  
-        add_reduce = AddReduce(terms,
-                               128,
-                               self.register_levels,
-                               t.o.part_pts,
-                               t.o.part_ops)
+        at = AddReduceInternal(self.pspec, n_inputs,
+                               part_pts, partition_step=2)
  
  
-        m.submodules.add_reduce = add_reduce
+        i = t.o
+        for idx in range(len(at.levels)):
+            mcur = at.levels[idx]
+            mcur.setup(m, i)
+            o = mcur.ospec()
+            if idx in self.register_levels:
+                m.d.sync += o.eq(mcur.process(i))
+            else:
+                m.d.comb += o.eq(mcur.process(i))
+            i = o  # for next loop
  
  
-        interm = Intermediates(128, 8, part_pts)
-        m.submodules.intermediates = interm
-        m.d.comb += interm.i.eq(add_reduce.o)
+        interm = Intermediates(self.pspec, part_pts)
+        interm.setup(m, i)
+        o = interm.process(interm.i)
  
          # final output
  
          # final output
-        m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
-        m.d.comb += finalout.i.eq(interm.o)
-        m.d.comb += self.o.eq(finalout.o)
+        finalout = FinalOut(self.pspec, part_pts)
+        finalout.setup(m, o)
+        m.d.comb += self.o.eq(finalout.process(o))
  
          return m
  
  
          return m