From: Luke Kenneth Casson Leighton Date: Sun, 9 Feb 2020 19:54:38 +0000 (+0000) Subject: split out nmutil library based on ieee754fpu code X-Git-Tag: 24jan2021_ls180~90 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4241aad8550d189a4aff51a0ecba9777347ac3bf;p=nmutil.git split out nmutil library based on ieee754fpu code --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..77d4f5d --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +*.vcd +*.py? +!*.pyi +.*.sw? +__pycache__ +*.v +*.il +*.il.* +.eggs +*.egg-info +*.gtkw diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e33b622 --- /dev/null +++ b/Makefile @@ -0,0 +1,7 @@ +PYTHON3 ?= "python3" + +install: + $(PYTHON3) setup.py develop # yes, develop, not install + +test: + $(PYTHON3) setup.py test # could just run nosetest3... diff --git a/README.md b/README.md new file mode 100644 index 0000000..d96fb45 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +# NMigen Util + +This project implements utilities for nmigen + +# Requirements + +* nmigen +* yosys (latest git repository, required by nmigen) + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2dfe140 --- /dev/null +++ b/setup.py @@ -0,0 +1,39 @@ +from setuptools import setup, find_packages +import sys, os + +here = os.path.abspath(os.path.dirname(__file__)) +README = open(os.path.join(here, 'README.md')).read() +NEWS = open(os.path.join(here, 'NEWS.txt')).read() + +version = '0.0.1' + +install_requires = [ +] + +test_requires = [ + 'nose', +] + +setup( + name='nmigen', + version=version, + description="A nmigen utility library", + long_description=README + '\n\n' + NEWS, + classifiers=[ + "Topic :: Software Development :: Libraries", + "License :: OSI Approved :: LGPLv3+", + "Programming Language :: Python :: 3", + ], + keywords='nmigen utilities', + author='Luke Kenneth Casson Leighton', + author_email='lkcl@libre-riscv.org', + url='http://git.libre-riscv.org/?p=nmutil', + license='GPLv3+', + 
packages=find_packages('src'), + package_dir = {'': 'src'}, + include_package_data=True, + zip_safe=False, + install_requires=install_requires, + tests_require=test_requires, + test_suite='nose.collector', +) diff --git a/src/nmutil/__init__.py b/src/nmutil/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nmutil/concurrentunit.py b/src/nmutil/concurrentunit.py new file mode 100644 index 0000000..da63d32 --- /dev/null +++ b/src/nmutil/concurrentunit.py @@ -0,0 +1,81 @@ +""" concurrent unit from mitch alsup augmentations to 6600 scoreboard + + * data fans in + * data goes through a pipeline + * results fan back out. + + the output data format has to have a member "muxid", which is used + as the array index on fan-out +""" + +from math import log +from nmigen import Module, Elaboratable +from nmigen.cli import main, verilog + +from nmutil.singlepipe import PassThroughStage +from nmutil.multipipe import CombMuxOutPipe +from nmutil.multipipe import PriorityCombMuxInPipe + + +def num_bits(n): + return int(log(n) / log(2)) + + +class FPADDInMuxPipe(PriorityCombMuxInPipe): + def __init__(self, num_rows, iospecfn, maskwid=0): + self.num_rows = num_rows + stage = PassThroughStage(iospecfn) + PriorityCombMuxInPipe.__init__(self, stage, p_len=self.num_rows, + maskwid=maskwid) + + +class FPADDMuxOutPipe(CombMuxOutPipe): + def __init__(self, num_rows, iospecfn, maskwid=0): + self.num_rows = num_rows + stage = PassThroughStage(iospecfn) + CombMuxOutPipe.__init__(self, stage, n_len=self.num_rows, + maskwid=maskwid) + + +class ReservationStations(Elaboratable): + """ Reservation-Station pipeline + + Input: num_rows - number of input and output Reservation Stations + + Requires: the addition of an "alu" object, from which ispec and ospec + are taken, and inpipe and outpipe are connected to it + + * fan-in on inputs (an array of FPADDBaseData: a,b,mid) + * ALU pipeline + * fan-out on outputs (an array of FPPackData: z,mid) + + Fan-in and Fan-out are 
combinatorial. + """ + def __init__(self, num_rows, maskwid=0): + self.num_rows = nr = num_rows + self.inpipe = FPADDInMuxPipe(nr, self.i_specfn, maskwid) # fan-in + self.outpipe = FPADDMuxOutPipe(nr, self.o_specfn, maskwid) # fan-out + + self.p = self.inpipe.p # kinda annoying, + self.n = self.outpipe.n # use pipe in/out as this class in/out + self._ports = self.inpipe.ports() + self.outpipe.ports() + + def elaborate(self, platform): + m = Module() + m.submodules.inpipe = self.inpipe + m.submodules.alu = self.alu + m.submodules.outpipe = self.outpipe + + m.d.comb += self.inpipe.n.connect_to_next(self.alu.p) + m.d.comb += self.alu.connect_to_next(self.outpipe) + + return m + + def ports(self): + return self._ports + + def i_specfn(self): + return self.alu.ispec() + + def o_specfn(self): + return self.alu.ospec() diff --git a/src/nmutil/dynamicpipe.py b/src/nmutil/dynamicpipe.py new file mode 100644 index 0000000..f9c649c --- /dev/null +++ b/src/nmutil/dynamicpipe.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# See Notices.txt for copyright information + +""" Meta-class that allows a dynamic runtime parameter-selectable "mixin" + +The reasons why this technique is being deployed is because SimpleHandshake +needs to be dynamically replaced at the end-users' choice, without having +to duplicate dozens of classes using multiple-inheritanc "Mix-in" techniques. + +It is however extremely unusual, and has been explicitly limited to this *one* +module. DO NOT try to use this technique elsewhere, it is extremely hard to +understand (meta-class programming). 
+ +""" + +from abc import ABCMeta + +from nmutil.singlepipe import SimpleHandshake +from nmutil.singlepipe import MaskCancellable + +import threading + +# with many thanks to jsbueno on stackexchange for this one +# https://stackoverflow.com/questions/57273070/ +# list post: +# http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-July/002259.html + +class Meta(ABCMeta): + registry = {} + recursing = threading.local() + recursing.check = False + mlock = threading.Lock() + + def __call__(cls, *args, **kw): + mcls = cls.__class__ + if mcls.recursing.check: + return super().__call__(*args, **kw) + spec = args[0] + base = spec.pipekls # pick up the dynamic class from PipelineSpec, HERE + + if (cls, base) not in mcls.registry: + print ("__call__", args, kw, cls, base, + base.__bases__, cls.__bases__) + mcls.registry[cls, base] = type( + cls.__name__, + (cls, base) + cls.__bases__[1:], + {} + ) + real_cls = mcls.registry[cls, base] + + with mcls.mlock: + mcls.recursing.check = True + instance = real_cls.__class__.__call__(real_cls, *args, **kw) + mcls.recursing.check = False + return instance + + +# Inherit from this class instead of SimpleHandshake (or other ControlBase +# derivative), and the metaclass will instead *replace* DynamicPipe - +# *at runtime* - with the class that is specified *as a parameter* +# in PipelineSpec. +# +# as explained in the list posting and in the stackexchange post, this is +# needed to avoid a MASSIVE suite of duplicated multiple-inheritance classes +# that "Mix in" SimpleHandshake (or other). +# +# unfortunately, composition does not work in this instance +# (make an *instance* of SimpleHandshake or other class and pass it in) +# due to the multiple level inheritance, and in several places +# the inheriting class needs to do some setup that the deriving class +# needs in order to function correctly. 
+ +class DynamicPipe(metaclass=Meta): + def __init__(self, *args): + print ("DynamicPipe init", super(), args) + super().__init__(self, *args) + + +# bad hack: the DynamicPipe metaclass ends up creating an __init__ signature +# for the dynamically-derived class. luckily, SimpleHandshake only needs +# "self" as the 1st argument (it is its own "Stage"). anything else +# could hypothetically be passed through the pspec. +class SimpleHandshakeRedir(SimpleHandshake): + def __init__(self, mod, *args): + print ("redir", mod, args) + stage = self + if args and args[0].stage: + stage = args[0].stage + SimpleHandshake.__init__(self, stage) + + +class MaskCancellableRedir(MaskCancellable): + def __init__(self, mod, *args): + stage = self + maskwid = args[0].maskwid + if args[0].stage: + stage = args[0].stage + print ("redir mask", mod, args, maskwid) + MaskCancellable.__init__(self, stage, maskwid) + diff --git a/src/nmutil/iocontrol.py b/src/nmutil/iocontrol.py new file mode 100644 index 0000000..efe0c38 --- /dev/null +++ b/src/nmutil/iocontrol.py @@ -0,0 +1,282 @@ +""" IO Control API + + Associated development bugs: + * http://bugs.libre-riscv.org/show_bug.cgi?id=148 + * http://bugs.libre-riscv.org/show_bug.cgi?id=64 + * http://bugs.libre-riscv.org/show_bug.cgi?id=57 + + Important: see Stage API (stageapi.py) in combination with below + + Main classes: PrevControl and NextControl. + + These classes manage the data and the synchronisation state + to the previous and next stage, respectively. ready/valid + signals are used by the Pipeline classes to tell if data + may be safely passed from stage to stage. + + The connection from one stage to the next is carried out with + NextControl.connect_to_next. It is *not* necessary to have + a PrevControl.connect_to_prev because it is functionally + directly equivalent to prev->next->connect_to_next. 
+""" + +from nmigen import Signal, Cat, Const, Module, Value, Elaboratable +from nmigen.cli import verilog, rtlil +from nmigen.hdl.rec import Record + +from collections.abc import Sequence, Iterable +from collections import OrderedDict + +from nmutil import nmoperator + + +class Object: + def __init__(self): + self.fields = OrderedDict() + + def __setattr__(self, k, v): + print ("kv", k, v) + if (k.startswith('_') or k in ["fields", "name", "src_loc"] or + k in dir(Object) or "fields" not in self.__dict__): + return object.__setattr__(self, k, v) + self.fields[k] = v + + def __getattr__(self, k): + if k in self.__dict__: + return object.__getattr__(self, k) + try: + return self.fields[k] + except KeyError as e: + raise AttributeError(e) + + def __iter__(self): + for x in self.fields.values(): # OrderedDict so order is preserved + if isinstance(x, Iterable): + yield from x + else: + yield x + + def eq(self, inp): + res = [] + for (k, o) in self.fields.items(): + i = getattr(inp, k) + print ("eq", o, i) + rres = o.eq(i) + if isinstance(rres, Sequence): + res += rres + else: + res.append(rres) + print (res) + return res + + def ports(self): # being called "keys" would be much better + return list(self) + + +class RecordObject(Record): + def __init__(self, layout=None, name=None): + Record.__init__(self, layout=layout or [], name=name) + + def __setattr__(self, k, v): + #print (dir(Record)) + if (k.startswith('_') or k in ["fields", "name", "src_loc"] or + k in dir(Record) or "fields" not in self.__dict__): + return object.__setattr__(self, k, v) + self.fields[k] = v + #print ("RecordObject setattr", k, v) + if isinstance(v, Record): + newlayout = {k: (k, v.layout)} + elif isinstance(v, Value): + newlayout = {k: (k, v.shape())} + else: + newlayout = {k: (k, nmoperator.shape(v))} + self.layout.fields.update(newlayout) + + def __iter__(self): + for x in self.fields.values(): # remember: fields is an OrderedDict + if isinstance(x, Iterable): + yield from x # a bit like 
flatten (nmigen.tools) + else: + yield x + + def ports(self): # would be better being called "keys" + return list(self) + + +class PrevControl(Elaboratable): + """ contains signals that come *from* the previous stage (both in and out) + * valid_i: previous stage indicating all incoming data is valid. + may be a multi-bit signal, where all bits are required + to be asserted to indicate "valid". + * ready_o: output to next stage indicating readiness to accept data + * data_i : an input - MUST be added by the USER of this class + """ + + def __init__(self, i_width=1, stage_ctl=False, maskwid=0, offs=0): + self.stage_ctl = stage_ctl + self.maskwid = maskwid + if maskwid: + self.mask_i = Signal(maskwid) # prev >>in self + self.stop_i = Signal(maskwid) # prev >>in self + self.valid_i = Signal(i_width, name="p_valid_i") # prev >>in self + self._ready_o = Signal(name="p_ready_o") # prev < 1: + # multi-bit case: valid only when valid_i is all 1s + all1s = Const(-1, (len(self.valid_i), False)) + valid_i = (self.valid_i == all1s) + else: + # single-bit valid_i case + valid_i = self.valid_i + + # when stage indicates not ready, incoming data + # must "appear" to be not ready too + if self.stage_ctl: + valid_i = valid_i & self.s_ready_o + + return valid_i + + def elaborate(self, platform): + m = Module() + m.d.comb += self.trigger.eq(self.valid_i_test & self.ready_o) + return m + + def eq(self, i): + res = [nmoperator.eq(self.data_i, i.data_i), + self.ready_o.eq(i.ready_o), + self.valid_i.eq(i.valid_i)] + if self.maskwid: + res.append(self.mask_i.eq(i.mask_i)) + return res + + def __iter__(self): + yield self.valid_i + yield self.ready_o + if self.maskwid: + yield self.mask_i + yield self.stop_i + if hasattr(self.data_i, "ports"): + yield from self.data_i.ports() + elif isinstance(self.data_i, Sequence): + yield from self.data_i + else: + yield self.data_i + + def ports(self): + return list(self) + + +class NextControl(Elaboratable): + """ contains the signals that go *to* the 
next stage (both in and out) + * valid_o: output indicating to next stage that data is valid + * ready_i: input from next stage indicating that it can accept data + * data_o : an output - MUST be added by the USER of this class + """ + def __init__(self, stage_ctl=False, maskwid=0): + self.stage_ctl = stage_ctl + self.maskwid = maskwid + if maskwid: + self.mask_o = Signal(maskwid) # self out>> next + self.stop_o = Signal(maskwid) # self out>> next + self.valid_o = Signal(name="n_valid_o") # self out>> next + self.ready_i = Signal(name="n_ready_i") # self < 1: + r_data = Array(r_data) + p_valid_i = Array(p_valid_i) + n_ready_in = Array(n_ready_in) + data_valid = Array(data_valid) + + nirn = Signal(reset_less=True) + m.d.comb += nirn.eq(~self.n.ready_i) + mid = self.p_mux.m_id + print ("CombMuxIn mid", self, self.stage, self.routemask, mid, p_len) + for i in range(p_len): + m.d.comb += data_valid[i].eq(0) + m.d.comb += n_ready_in[i].eq(1) + m.d.comb += p_valid_i[i].eq(0) + #m.d.comb += self.p[i].ready_o.eq(~data_valid[i] | self.n.ready_i) + m.d.comb += self.p[i].ready_o.eq(0) + p = self.p[mid] + maskedout = Signal(reset_less=True) + if hasattr(p, "mask_i"): + m.d.comb += maskedout.eq(p.mask_i & ~p.stop_i) + else: + m.d.comb += maskedout.eq(1) + m.d.comb += p_valid_i[mid].eq(maskedout & self.p_mux.active) + m.d.comb += self.p[mid].ready_o.eq(~data_valid[mid] | self.n.ready_i) + m.d.comb += n_ready_in[mid].eq(nirn & data_valid[mid]) + anyvalid = Signal(i, reset_less=True) + av = [] + for i in range(p_len): + av.append(data_valid[i]) + anyvalid = Cat(*av) + m.d.comb += self.n.valid_o.eq(anyvalid.bool()) + m.d.comb += data_valid[mid].eq(p_valid_i[mid] | \ + (n_ready_in[mid] )) + + if self.routemask: + # XXX hack - fixes loop + m.d.comb += eq(self.n.stop_o, self.p[-1].stop_i) + for i in range(p_len): + p = self.p[i] + vr = Signal(name="vr%d" % i, reset_less=True) + maskedout = Signal(name="maskedout%d" % i, reset_less=True) + if hasattr(p, "mask_i"): + m.d.comb += 
maskedout.eq(p.mask_i & ~p.stop_i) + else: + m.d.comb += maskedout.eq(1) + m.d.comb += vr.eq(maskedout.bool() & p.valid_i & p.ready_o) + #m.d.comb += vr.eq(p.valid_i & p.ready_o) + with m.If(vr): + m.d.comb += eq(self.n.mask_o, self.p[i].mask_i) + m.d.comb += eq(r_data[i], self.p[i].data_i) + else: + ml = [] # accumulate output masks + ms = [] # accumulate output stops + for i in range(p_len): + vr = Signal(reset_less=True) + p = self.p[i] + vr = Signal(reset_less=True) + maskedout = Signal(reset_less=True) + if hasattr(p, "mask_i"): + m.d.comb += maskedout.eq(p.mask_i & ~p.stop_i) + else: + m.d.comb += maskedout.eq(1) + m.d.comb += vr.eq(maskedout.bool() & p.valid_i & p.ready_o) + with m.If(vr): + m.d.comb += eq(r_data[i], self.p[i].data_i) + if self.maskwid: + mlen = len(self.p[i].mask_i) + s = mlen*i + e = mlen*(i+1) + ml.append(Mux(vr, self.p[i].mask_i, Const(0, mlen))) + ms.append(self.p[i].stop_i) + if self.maskwid: + m.d.comb += self.n.mask_o.eq(Cat(*ml)) + m.d.comb += self.n.stop_o.eq(Cat(*ms)) + + m.d.comb += eq(self.n.data_o, self.process(r_data[mid])) + + return m + + +class NonCombMultiInPipeline(MultiInControlBase): + """ A multi-input pipeline block conforming to the Pipeline API + + Attributes: + ----------- + p.data_i : StageInput, shaped according to ispec + The pipeline input + p.data_o : StageOutput, shaped according to ospec + The pipeline output + r_data : input_shape according to ispec + A temporary (buffered) copy of a prior (valid) input. + This is HELD if the output is not ready. It is updated + SYNCHRONOUSLY. 
+ """ + + def __init__(self, stage, p_len, p_mux, maskwid=0, routemask=False): + MultiInControlBase.__init__(self, p_len=p_len, maskwid=maskwid, + routemask=routemask) + self.stage = stage + self.maskwid = maskwid + self.p_mux = p_mux + + # set up the input and output data + for i in range(p_len): + name = 'data_i_%d' % i + self.p[i].data_i = _spec(stage.ispec, name) # input type + self.n.data_o = _spec(stage.ospec, 'data_o') + + def process(self, i): + if hasattr(self.stage, "process"): + return self.stage.process(i) + return i + + def elaborate(self, platform): + m = MultiInControlBase.elaborate(self, platform) + + m.submodules.p_mux = self.p_mux + + # need an array of buffer registers conforming to *input* spec + r_data = [] + r_busy = [] + p_valid_i = [] + p_len = len(self.p) + for i in range(p_len): + name = 'r_%d' % i + r = _spec(self.stage.ispec, name) # input type + r_data.append(r) + r_busy.append(Signal(name="r_busy%d" % i, reset_less=True)) + p_valid_i.append(Signal(name="p_valid_i%d" % i, reset_less=True)) + if hasattr(self.stage, "setup"): + print ("setup", self, self.stage, r) + self.stage.setup(m, r) + if len(r_data) > 1: + r_data = Array(r_data) + p_valid_i = Array(p_valid_i) + r_busy = Array(r_busy) + + nirn = Signal(reset_less=True) + m.d.comb += nirn.eq(~self.n.ready_i) + mid = self.p_mux.m_id + print ("CombMuxIn mid", self, self.stage, self.routemask, mid, p_len) + for i in range(p_len): + m.d.comb += r_busy[i].eq(0) + m.d.comb += n_ready_in[i].eq(1) + m.d.comb += p_valid_i[i].eq(0) + m.d.comb += self.p[i].ready_o.eq(n_ready_in[i]) + p = self.p[mid] + maskedout = Signal(reset_less=True) + if hasattr(p, "mask_i"): + m.d.comb += maskedout.eq(p.mask_i & ~p.stop_i) + else: + m.d.comb += maskedout.eq(1) + m.d.comb += p_valid_i[mid].eq(maskedout & self.p_mux.active) + m.d.comb += self.p[mid].ready_o.eq(~data_valid[mid] | self.n.ready_i) + m.d.comb += n_ready_in[mid].eq(nirn & data_valid[mid]) + anyvalid = Signal(i, reset_less=True) + av = [] + for i 
in range(p_len): + av.append(data_valid[i]) + anyvalid = Cat(*av) + m.d.comb += self.n.valid_o.eq(anyvalid.bool()) + m.d.comb += data_valid[mid].eq(p_valid_i[mid] | \ + (n_ready_in[mid] )) + + if self.routemask: + # XXX hack - fixes loop + m.d.comb += eq(self.n.stop_o, self.p[-1].stop_i) + for i in range(p_len): + p = self.p[i] + vr = Signal(name="vr%d" % i, reset_less=True) + maskedout = Signal(name="maskedout%d" % i, reset_less=True) + if hasattr(p, "mask_i"): + m.d.comb += maskedout.eq(p.mask_i & ~p.stop_i) + else: + m.d.comb += maskedout.eq(1) + m.d.comb += vr.eq(maskedout.bool() & p.valid_i & p.ready_o) + #m.d.comb += vr.eq(p.valid_i & p.ready_o) + with m.If(vr): + m.d.comb += eq(self.n.mask_o, self.p[i].mask_i) + m.d.comb += eq(r_data[i], self.p[i].data_i) + else: + ml = [] # accumulate output masks + ms = [] # accumulate output stops + for i in range(p_len): + vr = Signal(reset_less=True) + p = self.p[i] + vr = Signal(reset_less=True) + maskedout = Signal(reset_less=True) + if hasattr(p, "mask_i"): + m.d.comb += maskedout.eq(p.mask_i & ~p.stop_i) + else: + m.d.comb += maskedout.eq(1) + m.d.comb += vr.eq(maskedout.bool() & p.valid_i & p.ready_o) + with m.If(vr): + m.d.comb += eq(r_data[i], self.p[i].data_i) + if self.maskwid: + mlen = len(self.p[i].mask_i) + s = mlen*i + e = mlen*(i+1) + ml.append(Mux(vr, self.p[i].mask_i, Const(0, mlen))) + ms.append(self.p[i].stop_i) + if self.maskwid: + m.d.comb += self.n.mask_o.eq(Cat(*ml)) + m.d.comb += self.n.stop_o.eq(Cat(*ms)) + + m.d.comb += eq(self.n.data_o, self.process(r_data[mid])) + + return m + + +class CombMuxOutPipe(CombMultiOutPipeline): + def __init__(self, stage, n_len, maskwid=0, muxidname=None, + routemask=False): + muxidname = muxidname or "muxid" + # HACK: stage is also the n-way multiplexer + CombMultiOutPipeline.__init__(self, stage, n_len=n_len, + n_mux=stage, maskwid=maskwid, + routemask=routemask) + + # HACK: n-mux is also the stage... 
so set the muxid equal to input muxid + muxid = getattr(self.p.data_i, muxidname) + print ("combmuxout", muxidname, muxid) + stage.m_id = muxid + + + +class InputPriorityArbiter(Elaboratable): + """ arbitration module for Input-Mux pipe, baed on PriorityEncoder + """ + def __init__(self, pipe, num_rows): + self.pipe = pipe + self.num_rows = num_rows + self.mmax = int(log(self.num_rows) / log(2)) + self.m_id = Signal(self.mmax, reset_less=True) # multiplex id + self.active = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + + assert len(self.pipe.p) == self.num_rows, \ + "must declare input to be same size" + pe = PriorityEncoder(self.num_rows) + m.submodules.selector = pe + + # connect priority encoder + in_ready = [] + for i in range(self.num_rows): + p_valid_i = Signal(reset_less=True) + if self.pipe.maskwid and not self.pipe.routemask: + p = self.pipe.p[i] + maskedout = Signal(reset_less=True) + m.d.comb += maskedout.eq(p.mask_i & ~p.stop_i) + m.d.comb += p_valid_i.eq(maskedout.bool() & p.valid_i_test) + else: + m.d.comb += p_valid_i.eq(self.pipe.p[i].valid_i_test) + in_ready.append(p_valid_i) + m.d.comb += pe.i.eq(Cat(*in_ready)) # array of input "valids" + m.d.comb += self.active.eq(~pe.n) # encoder active (one input valid) + m.d.comb += self.m_id.eq(pe.o) # output one active input + + return m + + def ports(self): + return [self.m_id, self.active] + + + +class PriorityCombMuxInPipe(CombMultiInPipeline): + """ an example of how to use the combinatorial pipeline. 
+ """ + + def __init__(self, stage, p_len=2, maskwid=0, routemask=False): + p_mux = InputPriorityArbiter(self, p_len) + CombMultiInPipeline.__init__(self, stage, p_len, p_mux, + maskwid=maskwid, routemask=routemask) + + +if __name__ == '__main__': + + from nmutil.test.example_buf_pipe import ExampleStage + dut = PriorityCombMuxInPipe(ExampleStage) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_combpipe.il", "w") as f: + f.write(vl) diff --git a/src/nmutil/nmoperator.py b/src/nmutil/nmoperator.py new file mode 100644 index 0000000..fd50d2f --- /dev/null +++ b/src/nmutil/nmoperator.py @@ -0,0 +1,172 @@ +""" nmigen operator functions / utils + + eq: + -- + + a strategically very important function that is identical in function + to nmigen's Signal.eq function, except it may take objects, or a list + of objects, or a tuple of objects, and where objects may also be + Records. +""" + +from nmigen import Signal, Cat, Const, Mux, Module, Value, Elaboratable +from nmigen.cli import verilog, rtlil +from nmigen.lib.fifo import SyncFIFO, SyncFIFOBuffered +from nmigen.hdl.ast import ArrayProxy +from nmigen.hdl.rec import Record, Layout + +from abc import ABCMeta, abstractmethod +from collections.abc import Sequence, Iterable +from collections import OrderedDict +from nmutil.queue import Queue +import inspect + + +class Visitor2: + """ a helper class for iterating twin-argument compound data structures. + + Record is a special (unusual, recursive) case, where the input may be + specified as a dictionary (which may contain further dictionaries, + recursively), where the field names of the dictionary must match + the Record's field spec. Alternatively, an object with the same + member names as the Record may be assigned: it does not have to + *be* a Record. + + ArrayProxy is also special-cased, it's a bit messy: whilst ArrayProxy + has an eq function, the object being assigned to it (e.g. a python + object) might not. 
despite the *input* having an eq function, + that doesn't help us, because it's the *ArrayProxy* that's being + assigned to. so.... we cheat. use the ports() function of the + python object, enumerate them, find out the list of Signals that way, + and assign them. + """ + def iterator2(self, o, i): + if isinstance(o, dict): + yield from self.dict_iter2(o, i) + + if not isinstance(o, Sequence): + o, i = [o], [i] + for (ao, ai) in zip(o, i): + #print ("visit", fn, ao, ai) + if isinstance(ao, Record): + yield from self.record_iter2(ao, ai) + elif isinstance(ao, ArrayProxy) and not isinstance(ai, Value): + yield from self.arrayproxy_iter2(ao, ai) + else: + yield (ao, ai) + + def dict_iter2(self, o, i): + for (k, v) in o.items(): + print ("d-iter", v, i[k]) + yield (v, i[k]) + return res + + def _not_quite_working_with_all_unit_tests_record_iter2(self, ao, ai): + print ("record_iter2", ao, ai, type(ao), type(ai)) + if isinstance(ai, Value): + if isinstance(ao, Sequence): + ao, ai = [ao], [ai] + for o, i in zip(ao, ai): + yield (o, i) + return + for idx, (field_name, field_shape, _) in enumerate(ao.layout): + if isinstance(field_shape, Layout): + val = ai.fields + else: + val = ai + if hasattr(val, field_name): # check for attribute + val = getattr(val, field_name) + else: + val = val[field_name] # dictionary-style specification + yield from self.iterator2(ao.fields[field_name], val) + + def record_iter2(self, ao, ai): + for idx, (field_name, field_shape, _) in enumerate(ao.layout): + if isinstance(field_shape, Layout): + val = ai.fields + else: + val = ai + if hasattr(val, field_name): # check for attribute + val = getattr(val, field_name) + else: + val = val[field_name] # dictionary-style specification + yield from self.iterator2(ao.fields[field_name], val) + + def arrayproxy_iter2(self, ao, ai): + #print ("arrayproxy_iter2", ai.ports(), ai, ao) + for p in ai.ports(): + #print ("arrayproxy - p", p, p.name, ao) + op = getattr(ao, p.name) + yield from self.iterator2(op, 
p) + + +class Visitor: + """ a helper class for iterating single-argument compound data structures. + similar to Visitor2. + """ + def iterate(self, i): + """ iterate a compound structure recursively using yield + """ + if not isinstance(i, Sequence): + i = [i] + for ai in i: + #print ("iterate", ai) + if isinstance(ai, Record): + #print ("record", list(ai.layout)) + yield from self.record_iter(ai) + elif isinstance(ai, ArrayProxy) and not isinstance(ai, Value): + yield from self.array_iter(ai) + else: + yield ai + + def record_iter(self, ai): + for idx, (field_name, field_shape, _) in enumerate(ai.layout): + if isinstance(field_shape, Layout): + val = ai.fields + else: + val = ai + if hasattr(val, field_name): # check for attribute + val = getattr(val, field_name) + else: + val = val[field_name] # dictionary-style specification + #print ("recidx", idx, field_name, field_shape, val) + yield from self.iterate(val) + + def array_iter(self, ai): + for p in ai.ports(): + yield from self.iterate(p) + + +def eq(o, i): + """ makes signals equal: a helper routine which identifies if it is being + passed a list (or tuple) of objects, or signals, or Records, and calls + the objects' eq function. + """ + res = [] + for (ao, ai) in Visitor2().iterator2(o, i): + rres = ao.eq(ai) + if not isinstance(rres, Sequence): + rres = [rres] + res += rres + return res + + +def shape(i): + #print ("shape", i) + r = 0 + for part in list(i): + #print ("shape?", part) + s, _ = part.shape() + r += s + return r, False + + +def cat(i): + """ flattens a compound structure recursively using Cat + """ + from nmigen._utils import flatten + #res = list(flatten(i)) # works (as of nmigen commit f22106e5) HOWEVER... 
+ res = list(Visitor().iterate(i)) # needed because input may be a sequence + return Cat(*res) + + diff --git a/src/nmutil/noconflict.py b/src/nmutil/noconflict.py new file mode 100644 index 0000000..ad7eb09 --- /dev/null +++ b/src/nmutil/noconflict.py @@ -0,0 +1,55 @@ +import inspect, types + +############## preliminary: two utility functions ##################### + +def skip_redundant(iterable, skipset=None): + "Redundant items are repeated items or items in the original skipset." + if skipset is None: skipset = set() + for item in iterable: + if item not in skipset: + skipset.add(item) + yield item + + +def remove_redundant(metaclasses): + skipset = set([type]) + for meta in metaclasses: # determines the metaclasses to be skipped + skipset.update(inspect.getmro(meta)[1:]) + return tuple(skip_redundant(metaclasses, skipset)) + +################################################################## +## now the core of the module: two mutually recursive functions ## +################################################################## + +memoized_metaclasses_map = {} + +def get_noconflict_metaclass(bases, left_metas, right_metas): + """Not intended to be used outside of this module, unless you know + what you are doing.""" + # make tuple of needed metaclasses in specified priority order + metas = left_metas + tuple(map(type, bases)) + right_metas + needed_metas = remove_redundant(metas) + + # return existing confict-solving meta, if any + if needed_metas in memoized_metaclasses_map: + return memoized_metaclasses_map[needed_metas] + # nope: compute, memoize and return needed conflict-solving meta + elif not needed_metas: # wee, a trivial case, happy us + meta = type + elif len(needed_metas) == 1: # another trivial case + meta = needed_metas[0] + # check for recursion, can happen i.e. for Zope ExtensionClasses + elif needed_metas == bases: + raise TypeError("Incompatible root metatypes", needed_metas) + else: # gotta work ... 
+ metaname = '_' + ''.join([m.__name__ for m in needed_metas]) + meta = classmaker()(metaname, needed_metas, {}) + memoized_metaclasses_map[needed_metas] = meta + return meta + +def classmaker(left_metas=(), right_metas=()): + def make_class(name, bases, adict): + print ("make_class", name) + metaclass = get_noconflict_metaclass(bases, left_metas, right_metas) + return metaclass(name, bases, adict) + return make_class diff --git a/src/nmutil/picker.py b/src/nmutil/picker.py new file mode 100644 index 0000000..d47f785 --- /dev/null +++ b/src/nmutil/picker.py @@ -0,0 +1,42 @@ +""" Priority Picker: optimised back-to-back PriorityEncoder and Decoder + + The input is N bits, the output is N bits wide and only one is + enabled. +""" + +from nmigen import Module, Signal, Cat, Elaboratable + +class PriorityPicker(Elaboratable): + """ implements a priority-picker. input: N bits, output: N bits + """ + def __init__(self, wid): + self.wid = wid + # inputs + self.i = Signal(wid, reset_less=True) + self.o = Signal(wid, reset_less=True) + + def elaborate(self, platform): + m = Module() + + res = [] + ni = Signal(self.wid, reset_less = True) + m.d.comb += ni.eq(~self.i) + for i in range(0, self.wid): + t = Signal(reset_less = True) + res.append(t) + if i == 0: + m.d.comb += t.eq(self.i[i]) + else: + m.d.comb += t.eq(~Cat(ni[i], *self.i[:i]).bool()) + + # we like Cat(*xxx). turn lists into concatenated bits + m.d.comb += self.o.eq(Cat(*res)) + + return m + + def __iter__(self): + yield self.i + yield self.o + + def ports(self): + return list(self) diff --git a/src/nmutil/pipeline.py b/src/nmutil/pipeline.py new file mode 100644 index 0000000..812b527 --- /dev/null +++ b/src/nmutil/pipeline.py @@ -0,0 +1,394 @@ +""" Example 5: Making use of PyRTL and Introspection. 
""" + +from collections.abc import Sequence + +from nmigen import Signal +from nmigen.hdl.rec import Record +from nmigen import tracer +from nmigen.compat.fhdl.bitcontainer import value_bits_sign +from contextlib import contextmanager + +from nmutil.nmoperator import eq +from nmutil.singlepipe import StageCls, ControlBase, BufferedHandshake +from nmutil.singlepipe import UnbufferedPipeline + + +# The following example shows how pyrtl can be used to make some interesting +# hardware structures using python introspection. In particular, this example +# makes a N-stage pipeline structure. Any specific pipeline is then a derived +# class of SimplePipeline where methods with names starting with "stage" are +# stages, and new members with names not starting with "_" are to be registered +# for the next stage. + +def like(value, rname, pipe, pipemode=False): + if isinstance(value, ObjectProxy): + return ObjectProxy.like(pipe, value, pipemode=pipemode, + name=rname, reset_less=True) + else: + return Signal(value_bits_sign(value), name=rname, + reset_less=True) + return Signal.like(value, name=rname, reset_less=True) + +def get_assigns(_assigns): + assigns = [] + for e in _assigns: + if isinstance(e, ObjectProxy): + assigns += get_assigns(e._assigns) + else: + assigns.append(e) + return assigns + + +def get_eqs(_eqs): + eqs = [] + for e in _eqs: + if isinstance(e, ObjectProxy): + eqs += get_eqs(e._eqs) + else: + eqs.append(e) + return eqs + + +class ObjectProxy: + def __init__(self, m, name=None, pipemode=False, syncmode=True): + self._m = m + if name is None: + name = tracer.get_var_name(default=None) + self.name = name + self._pipemode = pipemode + self._syncmode = syncmode + self._eqs = {} + self._assigns = [] + self._preg_map = {} + + @classmethod + def like(cls, m, value, pipemode=False, name=None, src_loc_at=0, **kwargs): + name = name or tracer.get_var_name(depth=2 + src_loc_at, + default="$like") + + src_loc_at_1 = 1 + src_loc_at + r = ObjectProxy(m, value.name, 
pipemode) + #for a, aname in value._preg_map.items(): + # r._preg_map[aname] = like(a, aname, m, pipemode) + for a in value.ports(): + aname = a.name + r._preg_map[aname] = like(a, aname, m, pipemode) + return r + + def __repr__(self): + subobjs = [] + for a in self.ports(): + aname = a.name + ai = self._preg_map[aname] + subobjs.append(repr(ai)) + return "" % subobjs + + def get_specs(self, liked=False): + res = [] + for k, v in self._preg_map.items(): + #v = like(v, k, stage._m) + res.append(v) + if isinstance(v, ObjectProxy): + res += v.get_specs() + return res + + def eq(self, i): + print ("ObjectProxy eq", self, i) + res = [] + for a in self.ports(): + aname = a.name + ai = i._preg_map[aname] + res.append(a.eq(ai)) + return res + + def ports(self): + res = [] + for aname, a in self._preg_map.items(): + if isinstance(a, Signal) or isinstance(a, ObjectProxy) or \ + isinstance(a, Record): + res.append(a) + #print ("ObjectPorts", res) + return res + + def __getattr__(self, name): + try: + v = self._preg_map[name] + return v + #return like(v, name, self._m) + except KeyError: + raise AttributeError( + 'error, no pipeline register "%s" defined for OP %s' + % (name, self.name)) + + def __setattr__(self, name, value): + if name.startswith('_') or name in ['name', 'ports', 'eq', 'like']: + # do not do anything tricky with variables starting with '_' + object.__setattr__(self, name, value) + return + #rname = "%s_%s" % (self.name, name) + rname = name + new_pipereg = like(value, rname, self._m, self._pipemode) + self._preg_map[name] = new_pipereg + #object.__setattr__(self, name, new_pipereg) + if self._pipemode: + #print ("OP pipemode", self._syncmode, new_pipereg, value) + assign = eq(new_pipereg, value) + if self._syncmode: + self._m.d.sync += assign + else: + self._m.d.comb += assign + elif self._m: + #print ("OP !pipemode assign", new_pipereg, value, type(value)) + self._m.d.comb += eq(new_pipereg, value) + else: + #print ("OP !pipemode !m", new_pipereg, value, 
type(value)) + self._assigns += eq(new_pipereg, value) + if isinstance(value, ObjectProxy): + #print ("OP, defer assigns:", value._assigns) + self._assigns += value._assigns + self._eqs.append(value._eqs) + + +class PipelineStage: + """ Pipeline builder stage with auto generation of pipeline registers. + """ + + def __init__(self, name, m, prev=None, pipemode=False, ispec=None): + self._m = m + self._stagename = name + self._preg_map = {'__nextstage__': {}} + self._prev_stage = prev + self._ispec = ispec + if ispec: + self._preg_map[self._stagename] = ispec + if prev: + print ("prev", prev._stagename, prev._preg_map) + #if prev._stagename in prev._preg_map: + # m = prev._preg_map[prev._stagename] + # self._preg_map[prev._stagename] = m + if '__nextstage__' in prev._preg_map: + m = prev._preg_map['__nextstage__'] + m = likedict(m) + self._preg_map[self._stagename] = m + #for k, v in m.items(): + #m[k] = like(v, k, self._m) + print ("make current", self._stagename, m) + self._pipemode = pipemode + self._eqs = {} + self._assigns = [] + + def __getattribute__(self, name): + if name.startswith('_'): + return object.__getattribute__(self, name) + #if name in self._preg_map['__nextstage__']: + # return self._preg_map['__nextstage__'][name] + try: + print ("getattr", name, object.__getattribute__(self, '_preg_map')) + v = self._preg_map[self._stagename][name] + return v + #return like(v, name, self._m) + except KeyError: + raise AttributeError( + 'error, no pipeline register "%s" defined for stage %s' + % (name, self._stagename)) + + def __setattr__(self, name, value): + if name.startswith('_'): + # do not do anything tricky with variables starting with '_' + object.__setattr__(self, name, value) + return + pipereg_id = self._stagename + rname = 'pipereg_' + pipereg_id + '_' + name + new_pipereg = like(value, rname, self._m, self._pipemode) + next_stage = '__nextstage__' + if next_stage not in self._preg_map: + self._preg_map[next_stage] = {} + 
self._preg_map[next_stage][name] = new_pipereg + print ("setattr", name, value, self._preg_map) + if self._pipemode: + self._eqs[name] = new_pipereg + assign = eq(new_pipereg, value) + print ("pipemode: append", new_pipereg, value, assign) + if isinstance(value, ObjectProxy): + print ("OP, assigns:", value._assigns) + self._assigns += value._assigns + self._eqs[name]._eqs = value._eqs + #self._m.d.comb += assign + self._assigns += assign + elif self._m: + print ("!pipemode: assign", new_pipereg, value) + assign = eq(new_pipereg, value) + self._m.d.sync += assign + else: + print ("!pipemode !m: defer assign", new_pipereg, value) + assign = eq(new_pipereg, value) + self._eqs[name] = new_pipereg + self._assigns += assign + if isinstance(value, ObjectProxy): + print ("OP, defer assigns:", value._assigns) + self._assigns += value._assigns + self._eqs[name]._eqs = value._eqs + +def likelist(specs): + res = [] + for v in specs: + res.append(like(v, v.name, None, pipemode=True)) + return res + +def likedict(specs): + if not isinstance(specs, dict): + return like(specs, specs.name, None, pipemode=True) + res = {} + for k, v in specs.items(): + res[k] = likedict(v) + return res + + +class AutoStage(StageCls): + def __init__(self, inspecs, outspecs, eqs, assigns): + self.inspecs, self.outspecs = inspecs, outspecs + self.eqs, self.assigns = eqs, assigns + #self.o = self.ospec() + def ispec(self): return likedict(self.inspecs) + def ospec(self): return likedict(self.outspecs) + + def process(self, i): + print ("stage process", i) + return self.eqs + + def setup(self, m, i): + print ("stage setup i", i, m) + print ("stage setup inspecs", self.inspecs) + print ("stage setup outspecs", self.outspecs) + print ("stage setup eqs", self.eqs) + #self.o = self.ospec() + m.d.comb += eq(self.inspecs, i) + #m.d.comb += eq(self.outspecs, self.eqs) + #m.d.comb += eq(self.o, i) + + +class AutoPipe(UnbufferedPipeline): + def __init__(self, stage, assigns): + UnbufferedPipeline.__init__(self, 
stage) + self.assigns = assigns + + def elaborate(self, platform): + m = UnbufferedPipeline.elaborate(self, platform) + m.d.comb += self.assigns + print ("assigns", self.assigns, m) + return m + + +class PipeManager: + def __init__(self, m, pipemode=False, pipetype=None): + self.m = m + self.pipemode = pipemode + self.pipetype = pipetype + + @contextmanager + def Stage(self, name, prev=None, ispec=None): + if ispec: + ispec = likedict(ispec) + print ("start stage", name, ispec) + stage = PipelineStage(name, None, prev, self.pipemode, ispec=ispec) + try: + yield stage, self.m #stage._m + finally: + pass + if self.pipemode: + if stage._ispec: + print ("use ispec", stage._ispec) + inspecs = stage._ispec + else: + inspecs = self.get_specs(stage, name) + #inspecs = likedict(inspecs) + outspecs = self.get_specs(stage, '__nextstage__', liked=True) + print ("stage inspecs", name, inspecs) + print ("stage outspecs", name, outspecs) + eqs = stage._eqs # get_eqs(stage._eqs) + assigns = get_assigns(stage._assigns) + print ("stage eqs", name, eqs) + print ("stage assigns", name, assigns) + s = AutoStage(inspecs, outspecs, eqs, assigns) + self.stages.append(s) + print ("end stage", name, self.pipemode, "\n") + + def get_specs(self, stage, name, liked=False): + return stage._preg_map[name] + if name in stage._preg_map: + res = [] + for k, v in stage._preg_map[name].items(): + #v = like(v, k, stage._m) + res.append(v) + #if isinstance(v, ObjectProxy): + # res += v.get_specs() + return res + return {} + + def __enter__(self): + self.stages = [] + return self + + def __exit__(self, *args): + print ("exit stage", args) + pipes = [] + cb = ControlBase() + for s in self.stages: + print ("stage specs", s, s.inspecs, s.outspecs) + if self.pipetype == 'buffered': + p = BufferedHandshake(s) + else: + p = AutoPipe(s, s.assigns) + pipes.append(p) + self.m.submodules += p + + self.m.d.comb += cb.connect(pipes) + + +class SimplePipeline: + """ Pipeline builder with auto generation of pipeline 
registers. + """ + + def __init__(self, m): + self._m = m + self._pipeline_register_map = {} + self._current_stage_num = 0 + + def _setup(self): + stage_list = [] + for method in dir(self): + if method.startswith('stage'): + stage_list.append(method) + for stage in sorted(stage_list): + stage_method = getattr(self, stage) + stage_method() + self._current_stage_num += 1 + + def __getattr__(self, name): + try: + return self._pipeline_register_map[self._current_stage_num][name] + except KeyError: + raise AttributeError( + 'error, no pipeline register "%s" defined for stage %d' + % (name, self._current_stage_num)) + + def __setattr__(self, name, value): + if name.startswith('_'): + # do not do anything tricky with variables starting with '_' + object.__setattr__(self, name, value) + return + next_stage = self._current_stage_num + 1 + pipereg_id = str(self._current_stage_num) + 'to' + str(next_stage) + rname = 'pipereg_' + pipereg_id + '_' + name + #new_pipereg = Signal(value_bits_sign(value), name=rname, + # reset_less=True) + if isinstance(value, ObjectProxy): + new_pipereg = ObjectProxy.like(self._m, value, + name=rname, reset_less = True) + else: + new_pipereg = Signal.like(value, name=rname, reset_less = True) + if next_stage not in self._pipeline_register_map: + self._pipeline_register_map[next_stage] = {} + self._pipeline_register_map[next_stage][name] = new_pipereg + self._m.d.sync += eq(new_pipereg, value) + diff --git a/src/nmutil/pipemodbase.py b/src/nmutil/pipemodbase.py new file mode 100644 index 0000000..0c5a02f --- /dev/null +++ b/src/nmutil/pipemodbase.py @@ -0,0 +1,54 @@ +from nmigen import Elaboratable +from ieee754.pipeline import DynamicPipe +from nmutil.singlepipe import StageChain + + +class PipeModBase(Elaboratable): + """PipeModBase: common code between nearly every pipeline module + """ + def __init__(self, pspec, modname): + self.modname = modname # use this to give a name to this module + self.pspec = pspec + self.i = self.ispec() + self.o = 
self.ospec() + + def process(self, i): + return self.o + + def setup(self, m, i): + """ links module to inputs and outputs + """ + setattr(m.submodules, self.modname, self) + m.d.comb += self.i.eq(i) + + +class PipeModBaseChain(DynamicPipe): + """PipeModBaseChain: common code between stage-chained pipes + + Links a set of combinatorial modules (get_chain) together + and uses pspec.pipekls to dynamically select the pipeline type + Also conforms to the Pipeline Stage API + """ + def __init__(self, pspec): + self.pspec = pspec + self.chain = self.get_chain() + super().__init__(pspec) + + def ispec(self): + """ returns the input spec of the first module in the chain + """ + return self.chain[0].ispec() + + def ospec(self): + """ returns the output spec of the last module in the chain + """ + return self.chain[-1].ospec() + + def process(self, i): + return self.o # ... returned here (see setup comment below) + + def setup(self, m, i): + """ links module to inputs and outputs + """ + StageChain(self.chain).setup(m, i) # input linked here, through chain + self.o = self.chain[-1].o # output is the last thing in the chain... diff --git a/src/nmutil/queue.py b/src/nmutil/queue.py new file mode 100644 index 0000000..3d47c63 --- /dev/null +++ b/src/nmutil/queue.py @@ -0,0 +1,194 @@ +# Copyright (c) 2014 - 2019 The Regents of the University of +# California (Regents). All Rights Reserved. Redistribution and use in +# source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# two paragraphs of disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# two paragraphs of disclaimer in the documentation and/or other materials +# provided with the distribution. 
+# * Neither the name of the Regents nor the names of its contributors +# may be used to endorse or promote products derived from this +# software without specific prior written permission. +# IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, +# SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, +# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF +# REGENTS HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF +# ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION +# TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR +# MODIFICATIONS. + +from nmigen import Module, Signal, Memory, Mux, Elaboratable +from nmigen.utils import bits_for +from nmigen.cli import main +from nmigen.lib.fifo import FIFOInterface + +# translated from https://github.com/freechipsproject/chisel3/blob/a4a29e29c3f1eed18f851dcf10bdc845571dfcb6/src/main/scala/chisel3/util/Decoupled.scala#L185 # noqa + + +class Queue(FIFOInterface, Elaboratable): + def __init__(self, width, depth, fwft=True, pipe=False): + """ Queue (FIFO) with pipe mode and first-write fall-through capability + + * :width: width of Queue data in/out + * :depth: queue depth. NOTE: may be set to 0 (this is ok) + * :fwft : first-write, fall-through mode (Chisel Queue "flow" mode) + * :pipe : pipe mode. NOTE: this mode can cause unanticipated + problems. when read is enabled, so is writeable. + therefore if read is enabled, the data ABSOLUTELY MUST + be read. + + fwft mode = True basically means that the data may be transferred + combinatorially from input to output. 
+ + Attributes: + * level: available free space (number of unread entries) + + din = enq_data, writable = enq_ready, we = enq_valid + dout = deq_data, re = deq_ready, readable = deq_valid + """ + FIFOInterface.__init__(self, width, depth, fwft) + self.pipe = pipe + self.depth = depth + self.level = Signal(bits_for(depth)) + + def elaborate(self, platform): + m = Module() + + # set up an SRAM. XXX bug in Memory: cannot create SRAM of depth 1 + ram = Memory(self.width, self.depth if self.depth > 1 else 2) + m.submodules.ram_read = ram_read = ram.read_port(domain="comb") + m.submodules.ram_write = ram_write = ram.write_port() + + # convenience names, for people familiar with ready/valid terminology + # "p" stands for "previous stage", "n" stands for "next stage" + # for people familiar with the chisel Decoupled library: + # enq is "enqueue" (data in, aka "prev stage"), + # deq is "dequeue" (data out, aka "next stage") + p_ready_o = self.writable + p_valid_i = self.we + enq_data = self.din # aka p_data_i + + n_valid_o = self.readable + n_ready_i = self.re + deq_data = self.dout # aka n_data_o + + # intermediaries + ptr_width = bits_for(self.depth - 1) if self.depth > 1 else 0 + enq_ptr = Signal(ptr_width) # cyclic pointer to "insert" point (wrport) + deq_ptr = Signal(ptr_width) # cyclic pointer to "remove" point (rdport) + maybe_full = Signal() # not reset_less (set by sync) + + # temporaries + do_enq = Signal(reset_less=True) + do_deq = Signal(reset_less=True) + ptr_diff = Signal(ptr_width) + ptr_match = Signal(reset_less=True) + empty = Signal(reset_less=True) + full = Signal(reset_less=True) + enq_max = Signal(reset_less=True) + deq_max = Signal(reset_less=True) + + m.d.comb += [ptr_match.eq(enq_ptr == deq_ptr), # read-ptr = write-ptr + ptr_diff.eq(enq_ptr - deq_ptr), + enq_max.eq(enq_ptr == self.depth - 1), + deq_max.eq(deq_ptr == self.depth - 1), + empty.eq(ptr_match & ~maybe_full), + full.eq(ptr_match & maybe_full), + do_enq.eq(p_ready_o & p_valid_i), # write 
conditions ok + do_deq.eq(n_ready_i & n_valid_o), # read conditions ok + + # set readable and writable (NOTE: see pipe mode below) + n_valid_o.eq(~empty), # cannot read if empty! + p_ready_o.eq(~full), # cannot write if full! + + # set up memory and connect to input and output + ram_write.addr.eq(enq_ptr), + ram_write.data.eq(enq_data), + ram_write.en.eq(do_enq), + ram_read.addr.eq(deq_ptr), + deq_data.eq(ram_read.data) # NOTE: overridden in fwft mode + ] + + # under write conditions, SRAM write-pointer moves on next clock + with m.If(do_enq): + m.d.sync += enq_ptr.eq(Mux(enq_max, 0, enq_ptr+1)) + + # under read conditions, SRAM read-pointer moves on next clock + with m.If(do_deq): + m.d.sync += deq_ptr.eq(Mux(deq_max, 0, deq_ptr+1)) + + # if read-but-not-write or write-but-not-read, maybe_full set + with m.If(do_enq != do_deq): + m.d.sync += maybe_full.eq(do_enq) + + # first-word fall-through: same as "flow" parameter in Chisel3 Queue + # basically instead of relying on the Memory characteristics (which + # in FPGAs do not have write-through), then when the queue is empty + # take the output directly from the input, i.e. *bypass* the SRAM. + # this done combinatorially to give the exact same characteristics + # as Memory "write-through"... without relying on a changing API + if self.fwft: + with m.If(p_valid_i): + m.d.comb += n_valid_o.eq(1) + with m.If(empty): + m.d.comb += deq_data.eq(enq_data) + m.d.comb += do_deq.eq(0) + with m.If(n_ready_i): + m.d.comb += do_enq.eq(0) + + # pipe mode: if next stage says it's ready (readable), we + # *must* declare the input ready (writeable). 
+ if self.pipe: + with m.If(n_ready_i): + m.d.comb += p_ready_o.eq(1) + + # set the count (available free space), optimise on power-of-two + if self.depth == 1 << ptr_width: # is depth a power of 2 + m.d.comb += self.level.eq( + Mux(maybe_full & ptr_match, self.depth, 0) | ptr_diff) + else: + m.d.comb += self.level.eq(Mux(ptr_match, + Mux(maybe_full, self.depth, 0), + Mux(deq_ptr > enq_ptr, + self.depth + ptr_diff, + ptr_diff))) + + return m + + +if __name__ == "__main__": + reg_stage = Queue(1, 1, pipe=True) + break_ready_chain_stage = Queue(1, 1, pipe=True, fwft=True) + m = Module() + ports = [] + + def queue_ports(queue, name_prefix): + retval = [] + for name in ["level", + "dout", + "readable", + "writable"]: + port = getattr(queue, name) + signal = Signal(port.shape(), name=name_prefix+name) + m.d.comb += signal.eq(port) + retval.append(signal) + for name in ["re", + "din", + "we"]: + port = getattr(queue, name) + signal = Signal(port.shape(), name=name_prefix+name) + m.d.comb += port.eq(signal) + retval.append(signal) + return retval + + m.submodules.reg_stage = reg_stage + ports += queue_ports(reg_stage, "reg_stage_") + m.submodules.break_ready_chain_stage = break_ready_chain_stage + ports += queue_ports(break_ready_chain_stage, "break_ready_chain_stage_") + main(m, ports=ports) diff --git a/src/nmutil/singlepipe.py b/src/nmutil/singlepipe.py new file mode 100644 index 0000000..4880a81 --- /dev/null +++ b/src/nmutil/singlepipe.py @@ -0,0 +1,994 @@ +""" Pipeline API. For multi-input and multi-output variants, see multipipe. + + Associated development bugs: + * http://bugs.libre-riscv.org/show_bug.cgi?id=148 + * http://bugs.libre-riscv.org/show_bug.cgi?id=64 + * http://bugs.libre-riscv.org/show_bug.cgi?id=57 + + Important: see Stage API (stageapi.py) and IO Control API + (iocontrol.py) in combination with below. This module + "combines" the Stage API with the IO Control API to create + the Pipeline API. 
+ + The one critically important key difference between StageAPI and + PipelineAPI: + + * StageAPI: combinatorial (NO REGISTERS / LATCHES PERMITTED) + * PipelineAPI: synchronous registers / latches get added here + + RecordBasedStage: + ---------------- + + A convenience class that takes an input shape, output shape, a + "processing" function and an optional "setup" function. Honestly + though, there's not much more effort to just... create a class + that returns a couple of Records (see ExampleAddRecordStage in + examples). + + PassThroughStage: + ---------------- + + A convenience class that takes a single function as a parameter, + that is chain-called to create the exact same input and output spec. + It has a process() function that simply returns its input. + + Instances of this class are completely redundant if handed to + StageChain, however when passed to UnbufferedPipeline they + can be used to introduce a single clock delay. + + ControlBase: + ----------- + + The base class for pipelines. Contains previous and next ready/valid/data. + Also has an extremely useful "connect" function that can be used to + connect a chain of pipelines and present the exact same prev/next + ready/valid/data API. + + Note: pipelines basically do not become pipelines as such until + handed to a derivative of ControlBase. ControlBase itself is *not* + strictly considered a pipeline class. Wishbone and AXI4 (master or + slave) could be derived from ControlBase, for example. + UnbufferedPipeline: + ------------------ + + A simple stalling clock-synchronised pipeline that has no buffering + (unlike BufferedHandshake). Data flows on *every* clock cycle when + the conditions are right (this is nominally when the input is valid + and the output is ready). + + A stall anywhere along the line will result in a stall back-propagating + down the entire chain. 
The BufferedHandshake by contrast will buffer
+ incoming data, allowing previous stages one clock cycle's grace before
+ also having to stall.
+
+ An advantage of the UnbufferedPipeline over the Buffered one is
+ that the amount of logic needed (number of gates) is greatly
+ reduced (no second set of buffers basically)
+
+ The disadvantage of the UnbufferedPipeline is that the valid/ready
+ logic, if chained together, is *combinatorial*, resulting in
+ progressively larger gate delay.
+
+ PassThroughHandshake:
+ ------------------
+
+ A Control class that introduces a single clock delay, passing its
+ data through unaltered. Unlike RegisterPipeline (which relies
+ on UnbufferedPipeline and PassThroughStage) it handles ready/valid
+ itself.
+
+ RegisterPipeline:
+ ----------------
+
+ A convenience class that, because UnbufferedPipeline introduces a single
+ clock delay, when its stage is a PassThroughStage, it results in a Pipeline
+ stage that, duh, delays its (unmodified) input by one clock cycle.
+
+ BufferedHandshake:
+ ----------------
+
+ nmigen implementation of buffered pipeline stage, based on zipcpu:
+ https://zipcpu.com/blog/2017/08/14/strategies-for-pipelining.html
+
+ this module requires quite a bit of thought to understand how it works
+ (and why it is needed in the first place). reading the above is
+ *strongly* recommended.
+
+ unlike john dawson's IEEE754 FPU STB/ACK signalling, which requires
+ the STB / ACK signals to raise and lower (on separate clocks) before
+ data may proceed (thus only allowing one piece of data to proceed
+ on *ALTERNATE* cycles), the signalling here is a true pipeline
+ where data will flow on *every* clock when the conditions are right. 
+ + input acceptance conditions are when: + * incoming previous-stage strobe (p.valid_i) is HIGH + * outgoing previous-stage ready (p.ready_o) is LOW + + output transmission conditions are when: + * outgoing next-stage strobe (n.valid_o) is HIGH + * outgoing next-stage ready (n.ready_i) is LOW + + the tricky bit is when the input has valid data and the output is not + ready to accept it. if it wasn't for the clock synchronisation, it + would be possible to tell the input "hey don't send that data, we're + not ready". unfortunately, it's not possible to "change the past": + the previous stage *has no choice* but to pass on its data. + + therefore, the incoming data *must* be accepted - and stored: that + is the responsibility / contract that this stage *must* accept. + on the same clock, it's possible to tell the input that it must + not send any more data. this is the "stall" condition. + + we now effectively have *two* possible pieces of data to "choose" from: + the buffered data, and the incoming data. the decision as to which + to process and output is based on whether we are in "stall" or not. + i.e. when the next stage is no longer ready, the output comes from + the buffer if a stall had previously occurred, otherwise it comes + direct from processing the input. + + this allows us to respect a synchronous "travelling STB" with what + dan calls a "buffered handshake". + + it's quite a complex state machine! 
+ + SimpleHandshake + --------------- + + Synchronised pipeline, Based on: + https://github.com/ZipCPU/dbgbus/blob/master/hexbus/rtl/hbdeword.v +""" + +from nmigen import Signal, Mux, Module, Elaboratable, Const +from nmigen.cli import verilog, rtlil +from nmigen.hdl.rec import Record + +from nmutil.queue import Queue +import inspect + +from nmutil.iocontrol import (PrevControl, NextControl, Object, RecordObject) +from nmutil.stageapi import (_spec, StageCls, Stage, StageChain, StageHelper) +from nmutil import nmoperator + + +class RecordBasedStage(Stage): + """ convenience class which provides a Records-based layout. + honestly it's a lot easier just to create a direct Records-based + class (see ExampleAddRecordStage) + """ + def __init__(self, in_shape, out_shape, processfn, setupfn=None): + self.in_shape = in_shape + self.out_shape = out_shape + self.__process = processfn + self.__setup = setupfn + def ispec(self): return Record(self.in_shape) + def ospec(self): return Record(self.out_shape) + def process(seif, i): return self.__process(i) + def setup(seif, m, i): return self.__setup(m, i) + + +class PassThroughStage(StageCls): + """ a pass-through stage with its input data spec identical to its output, + and "passes through" its data from input to output (does nothing). + + use this basically to explicitly make any data spec Stage-compliant. + (many APIs would potentially use a static "wrap" method in e.g. + StageCls to achieve a similar effect) + """ + def __init__(self, iospecfn): self.iospecfn = iospecfn + def ispec(self): return self.iospecfn() + def ospec(self): return self.iospecfn() + + +class ControlBase(StageHelper, Elaboratable): + """ Common functions for Pipeline API. Note: a "pipeline stage" only + exists (conceptually) when a ControlBase derivative is handed + a Stage (combinatorial block) + + NOTE: ControlBase derives from StageHelper, making it accidentally + compliant with the Stage API. 
Using those functions directly + *BYPASSES* a ControlBase instance ready/valid signalling, which + clearly should not be done without a really, really good reason. + """ + def __init__(self, stage=None, in_multi=None, stage_ctl=False, maskwid=0): + """ Base class containing ready/valid/data to previous and next stages + + * p: contains ready/valid to the previous stage + * n: contains ready/valid to the next stage + + Except when calling Controlbase.connect(), user must also: + * add data_i member to PrevControl (p) and + * add data_o member to NextControl (n) + Calling ControlBase._new_data is a good way to do that. + """ + print ("ControlBase", self, stage, in_multi, stage_ctl) + StageHelper.__init__(self, stage) + + # set up input and output IO ACK (prev/next ready/valid) + self.p = PrevControl(in_multi, stage_ctl, maskwid=maskwid) + self.n = NextControl(stage_ctl, maskwid=maskwid) + + # set up the input and output data + if stage is not None: + self._new_data("data") + + def _new_data(self, name): + """ allocates new data_i and data_o + """ + self.p.data_i, self.n.data_o = self.new_specs(name) + + @property + def data_r(self): + return self.process(self.p.data_i) + + def connect_to_next(self, nxt): + """ helper function to connect to the next stage data/valid/ready. + """ + return self.n.connect_to_next(nxt.p) + + def _connect_in(self, prev): + """ internal helper function to connect stage to an input source. + do not use to connect stage-to-stage! + """ + return self.p._connect_in(prev.p) + + def _connect_out(self, nxt): + """ internal helper function to connect stage to an output source. + do not use to connect stage-to-stage! 
+ """ + return self.n._connect_out(nxt.n) + + def connect(self, pipechain): + """ connects a chain (list) of Pipeline instances together and + links them to this ControlBase instance: + + in <----> self <---> out + | ^ + v | + [pipe1, pipe2, pipe3, pipe4] + | ^ | ^ | ^ + v | v | v | + out---in out--in out---in + + Also takes care of allocating data_i/data_o, by looking up + the data spec for each end of the pipechain. i.e It is NOT + necessary to allocate self.p.data_i or self.n.data_o manually: + this is handled AUTOMATICALLY, here. + + Basically this function is the direct equivalent of StageChain, + except that unlike StageChain, the Pipeline logic is followed. + + Just as StageChain presents an object that conforms to the + Stage API from a list of objects that also conform to the + Stage API, an object that calls this Pipeline connect function + has the exact same pipeline API as the list of pipline objects + it is called with. + + Thus it becomes possible to build up larger chains recursively. + More complex chains (multi-input, multi-output) will have to be + done manually. 
+ + Argument: + + * :pipechain: - a sequence of ControlBase-derived classes + (must be one or more in length) + + Returns: + + * a list of eq assignments that will need to be added in + an elaborate() to m.d.comb + """ + assert len(pipechain) > 0, "pipechain must be non-zero length" + assert self.stage is None, "do not use connect with a stage" + eqs = [] # collated list of assignment statements + + # connect inter-chain + for i in range(len(pipechain)-1): + pipe1 = pipechain[i] # earlier + pipe2 = pipechain[i+1] # later (by 1) + eqs += pipe1.connect_to_next(pipe2) # earlier n to later p + + # connect front and back of chain to ourselves + front = pipechain[0] # first in chain + end = pipechain[-1] # last in chain + self.set_specs(front, end) # sets up ispec/ospec functions + self._new_data("chain") # NOTE: REPLACES existing data + eqs += front._connect_in(self) # front p to our p + eqs += end._connect_out(self) # end n to our n + + return eqs + + def set_input(self, i): + """ helper function to set the input data (used in unit tests) + """ + return nmoperator.eq(self.p.data_i, i) + + def __iter__(self): + yield from self.p # yields ready/valid/data (data also gets yielded) + yield from self.n # ditto + + def ports(self): + return list(self) + + def elaborate(self, platform): + """ handles case where stage has dynamic ready/valid functions + """ + m = Module() + m.submodules.p = self.p + m.submodules.n = self.n + + self.setup(m, self.p.data_i) + + if not self.p.stage_ctl: + return m + + # intercept the previous (outgoing) "ready", combine with stage ready + m.d.comb += self.p.s_ready_o.eq(self.p._ready_o & self.stage.d_ready) + + # intercept the next (incoming) "ready" and combine it with data valid + sdv = self.stage.d_valid(self.n.ready_i) + m.d.comb += self.n.d_valid.eq(self.n.ready_i & sdv) + + return m + + +class BufferedHandshake(ControlBase): + """ buffered pipeline stage. data and strobe signals travel in sync. 
+ if ever the input is ready and the output is not, processed data + is shunted in a temporary register. + + Argument: stage. see Stage API above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + process --->----^ + | | + +-- r_data ->-+ + + input data p.data_i is read (only), is processed and goes into an + intermediate result store [process()]. this is updated combinatorially. + + in a non-stall condition, the intermediate result will go into the + output (update_output). however if ever there is a stall, it goes + into r_data instead [update_buffer()]. + + when the non-stall condition is released, r_data is the first + to be transferred to the output [flush_buffer()], and the stall + condition cleared. + + on the next cycle (as long as stall is not raised again) the + input may begin to be processed and transferred directly to output. + """ + + def elaborate(self, platform): + self.m = ControlBase.elaborate(self, platform) + + result = _spec(self.stage.ospec, "r_tmp") + r_data = _spec(self.stage.ospec, "r_data") + + # establish some combinatorial temporaries + o_n_validn = Signal(reset_less=True) + n_ready_i = Signal(reset_less=True, name="n_i_rdy_data") + nir_por = Signal(reset_less=True) + nir_por_n = Signal(reset_less=True) + p_valid_i = Signal(reset_less=True) + nir_novn = Signal(reset_less=True) + nirn_novn = Signal(reset_less=True) + por_pivn = Signal(reset_less=True) + npnn = Signal(reset_less=True) + self.m.d.comb += [p_valid_i.eq(self.p.valid_i_test), + o_n_validn.eq(~self.n.valid_o), + n_ready_i.eq(self.n.ready_i_test), + nir_por.eq(n_ready_i & self.p._ready_o), + nir_por_n.eq(n_ready_i & ~self.p._ready_o), + nir_novn.eq(n_ready_i | o_n_validn), + nirn_novn.eq(~n_ready_i & o_n_validn), + npnn.eq(nir_por | nirn_novn), + por_pivn.eq(self.p._ready_o & ~p_valid_i) + ] + + # store result of processing in combinatorial temporary + self.m.d.comb += nmoperator.eq(result, self.data_r) + + # if 
not in stall condition, update the temporary register + with self.m.If(self.p.ready_o): # not stalled + self.m.d.sync += nmoperator.eq(r_data, result) # update buffer + + # data pass-through conditions + with self.m.If(npnn): + data_o = self._postprocess(result) # XXX TBD, does nothing right now + self.m.d.sync += [self.n.valid_o.eq(p_valid_i), # valid if p_valid + nmoperator.eq(self.n.data_o, data_o), # update out + ] + # buffer flush conditions (NOTE: can override data passthru conditions) + with self.m.If(nir_por_n): # not stalled + # Flush the [already processed] buffer to the output port. + data_o = self._postprocess(r_data) # XXX TBD, does nothing right now + self.m.d.sync += [self.n.valid_o.eq(1), # reg empty + nmoperator.eq(self.n.data_o, data_o), # flush + ] + # output ready conditions + self.m.d.sync += self.p._ready_o.eq(nir_novn | por_pivn) + + return self.m + + +class MaskNoDelayCancellable(ControlBase): + """ Mask-activated Cancellable pipeline (that does not respect "ready") + + Based on (identical behaviour to) SimpleHandshake. + TODO: decide whether to merge *into* SimpleHandshake. + + Argument: stage. see Stage API above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + +--process->--^ + """ + def __init__(self, stage, maskwid, in_multi=None, stage_ctl=False): + ControlBase.__init__(self, stage, in_multi, stage_ctl, maskwid) + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + # store result of processing in combinatorial temporary + result = _spec(self.stage.ospec, "r_tmp") + m.d.comb += nmoperator.eq(result, self.data_r) + + # establish if the data should be passed on. cancellation is + # a global signal. + # XXX EXCEPTIONAL CIRCUMSTANCES: inspection of the data payload + # is NOT "normal" for the Stage API. 
+ p_valid_i = Signal(reset_less=True) + #print ("self.p.data_i", self.p.data_i) + maskedout = Signal(len(self.p.mask_i), reset_less=True) + m.d.comb += maskedout.eq(self.p.mask_i & ~self.p.stop_i) + m.d.comb += p_valid_i.eq(maskedout.bool()) + + # if idmask nonzero, mask gets passed on (and register set). + # register is left as-is if idmask is zero, but out-mask is set to zero + # note however: only the *uncancelled* mask bits get passed on + m.d.sync += self.n.valid_o.eq(p_valid_i) + m.d.sync += self.n.mask_o.eq(Mux(p_valid_i, maskedout, 0)) + with m.If(p_valid_i): + data_o = self._postprocess(result) # XXX TBD, does nothing right now + m.d.sync += nmoperator.eq(self.n.data_o, data_o) # update output + + # output valid if + # input always "ready" + #m.d.comb += self.p._ready_o.eq(self.n.ready_i_test) + m.d.comb += self.p._ready_o.eq(Const(1)) + + # always pass on stop (as combinatorial: single signal) + m.d.comb += self.n.stop_o.eq(self.p.stop_i) + + return self.m + + +class MaskCancellable(ControlBase): + """ Mask-activated Cancellable pipeline + + Arguments: + + * stage. see Stage API above + * maskwid - sets up cancellation capability (mask and stop). + * in_multi + * stage_ctl + * dynamic - allows switching from sync to combinatorial (passthrough) + USE WITH CARE. will need the entire pipe to be quiescent + before switching, otherwise data WILL be destroyed. 
+ + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + +--process->--^ + """ + def __init__(self, stage, maskwid, in_multi=None, stage_ctl=False, + dynamic=False): + ControlBase.__init__(self, stage, in_multi, stage_ctl, maskwid) + self.dynamic = dynamic + if dynamic: + self.latchmode = Signal() + else: + self.latchmode = Const(1) + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + mask_r = Signal(len(self.p.mask_i), reset_less=True) + data_r = _spec(self.stage.ospec, "data_r") + m.d.comb += nmoperator.eq(data_r, self._postprocess(self.data_r)) + + with m.If(self.latchmode): + r_busy = Signal() + r_latch = _spec(self.stage.ospec, "r_latch") + + # establish if the data should be passed on. cancellation is + # a global signal. + p_valid_i = Signal(reset_less=True) + #print ("self.p.data_i", self.p.data_i) + maskedout = Signal(len(self.p.mask_i), reset_less=True) + m.d.comb += maskedout.eq(self.p.mask_i & ~self.p.stop_i) + + # establish some combinatorial temporaries + n_ready_i = Signal(reset_less=True, name="n_i_rdy_data") + p_valid_i_p_ready_o = Signal(reset_less=True) + m.d.comb += [p_valid_i.eq(self.p.valid_i_test & maskedout.bool()), + n_ready_i.eq(self.n.ready_i_test), + p_valid_i_p_ready_o.eq(p_valid_i & self.p.ready_o), + ] + + # if idmask nonzero, mask gets passed on (and register set). 
+ # register is left as-is if idmask is zero, but out-mask is set to + # zero + # note however: only the *uncancelled* mask bits get passed on + m.d.sync += mask_r.eq(Mux(p_valid_i, maskedout, 0)) + m.d.comb += self.n.mask_o.eq(mask_r) + + # always pass on stop (as combinatorial: single signal) + m.d.comb += self.n.stop_o.eq(self.p.stop_i) + + stor = Signal(reset_less=True) + m.d.comb += stor.eq(p_valid_i_p_ready_o | n_ready_i) + with m.If(stor): + # store result of processing in combinatorial temporary + m.d.sync += nmoperator.eq(r_latch, data_r) + + # previous valid and ready + with m.If(p_valid_i_p_ready_o): + m.d.sync += r_busy.eq(1) # output valid + # previous invalid or not ready, however next is accepting + with m.Elif(n_ready_i): + m.d.sync += r_busy.eq(0) # ...so set output invalid + + # output set combinatorially from latch + m.d.comb += nmoperator.eq(self.n.data_o, r_latch) + + m.d.comb += self.n.valid_o.eq(r_busy) + # if next is ready, so is previous + m.d.comb += self.p._ready_o.eq(n_ready_i) + + with m.Else(): + # pass everything straight through. p connected to n: data, + # valid, mask, everything. this is "effectively" just a + # StageChain: MaskCancellable is doing "nothing" except + # combinatorially passing everything through + # (except now it's *dynamically selectable* whether to do that) + m.d.comb += self.n.valid_o.eq(self.p.valid_i_test) + m.d.comb += self.p._ready_o.eq(self.n.ready_i_test) + m.d.comb += self.n.stop_o.eq(self.p.stop_i) + m.d.comb += self.n.mask_o.eq(self.p.mask_i) + m.d.comb += nmoperator.eq(self.n.data_o, data_r) + + return self.m + + +class SimpleHandshake(ControlBase): + """ simple handshake control. data and strobe signals travel in sync. + implements the protocol used by Wishbone and AXI4. + + Argument: stage. 
see Stage API above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + +--process->--^ + Truth Table + + Inputs Temporary Output Data + ------- ---------- ----- ---- + P P N N PiV& ~NiR& N P + i o i o PoR NoV o o + V R R V V R + + ------- - - - - + 0 0 0 0 0 0 >0 0 reg + 0 0 0 1 0 1 >1 0 reg + 0 0 1 0 0 0 0 1 process(data_i) + 0 0 1 1 0 0 0 1 process(data_i) + ------- - - - - + 0 1 0 0 0 0 >0 0 reg + 0 1 0 1 0 1 >1 0 reg + 0 1 1 0 0 0 0 1 process(data_i) + 0 1 1 1 0 0 0 1 process(data_i) + ------- - - - - + 1 0 0 0 0 0 >0 0 reg + 1 0 0 1 0 1 >1 0 reg + 1 0 1 0 0 0 0 1 process(data_i) + 1 0 1 1 0 0 0 1 process(data_i) + ------- - - - - + 1 1 0 0 1 0 1 0 process(data_i) + 1 1 0 1 1 1 1 0 process(data_i) + 1 1 1 0 1 0 1 1 process(data_i) + 1 1 1 1 1 0 1 1 process(data_i) + ------- - - - - + """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + r_busy = Signal() + result = _spec(self.stage.ospec, "r_tmp") + + # establish some combinatorial temporaries + n_ready_i = Signal(reset_less=True, name="n_i_rdy_data") + p_valid_i_p_ready_o = Signal(reset_less=True) + p_valid_i = Signal(reset_less=True) + m.d.comb += [p_valid_i.eq(self.p.valid_i_test), + n_ready_i.eq(self.n.ready_i_test), + p_valid_i_p_ready_o.eq(p_valid_i & self.p.ready_o), + ] + + # store result of processing in combinatorial temporary + m.d.comb += nmoperator.eq(result, self.data_r) + + # previous valid and ready + with m.If(p_valid_i_p_ready_o): + data_o = self._postprocess(result) # XXX TBD, does nothing right now + m.d.sync += [r_busy.eq(1), # output valid + nmoperator.eq(self.n.data_o, data_o), # update output + ] + # previous invalid or not ready, however next is accepting + with m.Elif(n_ready_i): + data_o = self._postprocess(result) # XXX TBD, does nothing right now + m.d.sync += [nmoperator.eq(self.n.data_o, data_o)] + # TODO: could still send data here (if there was any) + #m.d.sync += 
self.n.valid_o.eq(0) # ...so set output invalid + m.d.sync += r_busy.eq(0) # ...so set output invalid + + m.d.comb += self.n.valid_o.eq(r_busy) + # if next is ready, so is previous + m.d.comb += self.p._ready_o.eq(n_ready_i) + + return self.m + + +class UnbufferedPipeline(ControlBase): + """ A simple pipeline stage with single-clock synchronisation + and two-way valid/ready synchronised signalling. + + Note that a stall in one stage will result in the entire pipeline + chain stalling. + + Also that unlike BufferedHandshake, the valid/ready signalling does NOT + travel synchronously with the data: the valid/ready signalling + combines in a *combinatorial* fashion. Therefore, a long pipeline + chain will lengthen propagation delays. + + Argument: stage. see Stage API, above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + r_data result + | | + +--process ->-+ + + Attributes: + ----------- + p.data_i : StageInput, shaped according to ispec + The pipeline input + p.data_o : StageOutput, shaped according to ospec + The pipeline output + r_data : input_shape according to ispec + A temporary (buffered) copy of a prior (valid) input. + This is HELD if the output is not ready. It is updated + SYNCHRONOUSLY. + result: output_shape according to ospec + The output of the combinatorial logic. it is updated + COMBINATORIALLY (no clock dependence). 
+ + Truth Table + + Inputs Temp Output Data + ------- - ----- ---- + P P N N ~NiR& N P + i o i o NoV o o + V R R V V R + + ------- - - - + 0 0 0 0 0 0 1 reg + 0 0 0 1 1 1 0 reg + 0 0 1 0 0 0 1 reg + 0 0 1 1 0 0 1 reg + ------- - - - + 0 1 0 0 0 0 1 reg + 0 1 0 1 1 1 0 reg + 0 1 1 0 0 0 1 reg + 0 1 1 1 0 0 1 reg + ------- - - - + 1 0 0 0 0 1 1 reg + 1 0 0 1 1 1 0 reg + 1 0 1 0 0 1 1 reg + 1 0 1 1 0 1 1 reg + ------- - - - + 1 1 0 0 0 1 1 process(data_i) + 1 1 0 1 1 1 0 process(data_i) + 1 1 1 0 0 1 1 process(data_i) + 1 1 1 1 0 1 1 process(data_i) + ------- - - - + + Note: PoR is *NOT* involved in the above decision-making. + """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + data_valid = Signal() # is data valid or not + r_data = _spec(self.stage.ospec, "r_tmp") # output type + + # some temporaries + p_valid_i = Signal(reset_less=True) + pv = Signal(reset_less=True) + buf_full = Signal(reset_less=True) + m.d.comb += p_valid_i.eq(self.p.valid_i_test) + m.d.comb += pv.eq(self.p.valid_i & self.p.ready_o) + m.d.comb += buf_full.eq(~self.n.ready_i_test & data_valid) + + m.d.comb += self.n.valid_o.eq(data_valid) + m.d.comb += self.p._ready_o.eq(~data_valid | self.n.ready_i_test) + m.d.sync += data_valid.eq(p_valid_i | buf_full) + + with m.If(pv): + m.d.sync += nmoperator.eq(r_data, self.data_r) + data_o = self._postprocess(r_data) # XXX TBD, does nothing right now + m.d.comb += nmoperator.eq(self.n.data_o, data_o) + + return self.m + + +class UnbufferedPipeline2(ControlBase): + """ A simple pipeline stage with single-clock synchronisation + and two-way valid/ready synchronised signalling. + + Note that a stall in one stage will result in the entire pipeline + chain stalling. + + Also that unlike BufferedHandshake, the valid/ready signalling does NOT + travel synchronously with the data: the valid/ready signalling + combines in a *combinatorial* fashion. Therefore, a long pipeline + chain will lengthen propagation delays. 
+ + Argument: stage. see Stage API, above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | | + +- process-> buf <-+ + Attributes: + ----------- + p.data_i : StageInput, shaped according to ispec + The pipeline input + p.data_o : StageOutput, shaped according to ospec + The pipeline output + buf : output_shape according to ospec + A temporary (buffered) copy of a valid output + This is HELD if the output is not ready. It is updated + SYNCHRONOUSLY. + + Inputs Temp Output Data + ------- - ----- + P P N N ~NiR& N P (buf_full) + i o i o NoV o o + V R R V V R + + ------- - - - + 0 0 0 0 0 0 1 process(data_i) + 0 0 0 1 1 1 0 reg (odata, unchanged) + 0 0 1 0 0 0 1 process(data_i) + 0 0 1 1 0 0 1 process(data_i) + ------- - - - + 0 1 0 0 0 0 1 process(data_i) + 0 1 0 1 1 1 0 reg (odata, unchanged) + 0 1 1 0 0 0 1 process(data_i) + 0 1 1 1 0 0 1 process(data_i) + ------- - - - + 1 0 0 0 0 1 1 process(data_i) + 1 0 0 1 1 1 0 reg (odata, unchanged) + 1 0 1 0 0 1 1 process(data_i) + 1 0 1 1 0 1 1 process(data_i) + ------- - - - + 1 1 0 0 0 1 1 process(data_i) + 1 1 0 1 1 1 0 reg (odata, unchanged) + 1 1 1 0 0 1 1 process(data_i) + 1 1 1 1 0 1 1 process(data_i) + ------- - - - + + Note: PoR is *NOT* involved in the above decision-making. 
+ """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + buf_full = Signal() # is data valid or not + buf = _spec(self.stage.ospec, "r_tmp") # output type + + # some temporaries + p_valid_i = Signal(reset_less=True) + m.d.comb += p_valid_i.eq(self.p.valid_i_test) + + m.d.comb += self.n.valid_o.eq(buf_full | p_valid_i) + m.d.comb += self.p._ready_o.eq(~buf_full) + m.d.sync += buf_full.eq(~self.n.ready_i_test & self.n.valid_o) + + data_o = Mux(buf_full, buf, self.data_r) + data_o = self._postprocess(data_o) # XXX TBD, does nothing right now + m.d.comb += nmoperator.eq(self.n.data_o, data_o) + m.d.sync += nmoperator.eq(buf, self.n.data_o) + + return self.m + + +class PassThroughHandshake(ControlBase): + """ A control block that delays by one clock cycle. + + Inputs Temporary Output Data + ------- ------------------ ----- ---- + P P N N PiV& PiV| NiR| pvr N P (pvr) + i o i o PoR ~PoR ~NoV o o + V R R V V R + + ------- - - - - - - + 0 0 0 0 0 1 1 0 1 1 odata (unchanged) + 0 0 0 1 0 1 0 0 1 0 odata (unchanged) + 0 0 1 0 0 1 1 0 1 1 odata (unchanged) + 0 0 1 1 0 1 1 0 1 1 odata (unchanged) + ------- - - - - - - + 0 1 0 0 0 0 1 0 0 1 odata (unchanged) + 0 1 0 1 0 0 0 0 0 0 odata (unchanged) + 0 1 1 0 0 0 1 0 0 1 odata (unchanged) + 0 1 1 1 0 0 1 0 0 1 odata (unchanged) + ------- - - - - - - + 1 0 0 0 0 1 1 1 1 1 process(in) + 1 0 0 1 0 1 0 0 1 0 odata (unchanged) + 1 0 1 0 0 1 1 1 1 1 process(in) + 1 0 1 1 0 1 1 1 1 1 process(in) + ------- - - - - - - + 1 1 0 0 1 1 1 1 1 1 process(in) + 1 1 0 1 1 1 0 0 1 0 odata (unchanged) + 1 1 1 0 1 1 1 1 1 1 process(in) + 1 1 1 1 1 1 1 1 1 1 process(in) + ------- - - - - - - + + """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + r_data = _spec(self.stage.ospec, "r_tmp") # output type + + # temporaries + p_valid_i = Signal(reset_less=True) + pvr = Signal(reset_less=True) + m.d.comb += p_valid_i.eq(self.p.valid_i_test) + m.d.comb += pvr.eq(p_valid_i & 
self.p.ready_o) + + m.d.comb += self.p.ready_o.eq(~self.n.valid_o | self.n.ready_i_test) + m.d.sync += self.n.valid_o.eq(p_valid_i | ~self.p.ready_o) + + odata = Mux(pvr, self.data_r, r_data) + m.d.sync += nmoperator.eq(r_data, odata) + r_data = self._postprocess(r_data) # XXX TBD, does nothing right now + m.d.comb += nmoperator.eq(self.n.data_o, r_data) + + return m + + +class RegisterPipeline(UnbufferedPipeline): + """ A pipeline stage that delays by one clock cycle, creating a + sync'd latch out of data_o and valid_o as an indirect byproduct + of using PassThroughStage + """ + def __init__(self, iospecfn): + UnbufferedPipeline.__init__(self, PassThroughStage(iospecfn)) + + +class FIFOControl(ControlBase): + """ FIFO Control. Uses Queue to store data, coincidentally + happens to have same valid/ready signalling as Stage API. + + data_i -> fifo.din -> FIFO -> fifo.dout -> data_o + """ + def __init__(self, depth, stage, in_multi=None, stage_ctl=False, + fwft=True, pipe=False): + """ FIFO Control + + * :depth: number of entries in the FIFO + * :stage: data processing block + * :fwft: first word fall-thru mode (non-fwft introduces delay) + * :pipe: specifies pipe mode. + + when fwft = True it indicates that transfers may occur + combinatorially through stage processing in the same clock cycle. + This requires that the Stage be a Moore FSM: + https://en.wikipedia.org/wiki/Moore_machine + + when fwft = False it indicates that all output signals are + produced only from internal registers or memory, i.e. that the + Stage is a Mealy FSM: + https://en.wikipedia.org/wiki/Mealy_machine + + data is processed (and located) as follows: + + self.p self.stage temp fn temp fn temp fp self.n + data_i->process()->result->cat->din.FIFO.dout->cat(data_o) + + yes, really: cat produces a Cat() which can be assigned to. 
+ this is how the FIFO gets de-catted without needing a de-cat + function + """ + self.fwft = fwft + self.pipe = pipe + self.fdepth = depth + ControlBase.__init__(self, stage, in_multi, stage_ctl) + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + # make a FIFO with a signal of equal width to the data_o. + (fwidth, _) = nmoperator.shape(self.n.data_o) + fifo = Queue(fwidth, self.fdepth, fwft=self.fwft, pipe=self.pipe) + m.submodules.fifo = fifo + + def processfn(data_i): + # store result of processing in combinatorial temporary + result = _spec(self.stage.ospec, "r_temp") + m.d.comb += nmoperator.eq(result, self.process(data_i)) + return nmoperator.cat(result) + + ## prev: make the FIFO (Queue object) "look" like a PrevControl... + m.submodules.fp = fp = PrevControl() + fp.valid_i, fp._ready_o, fp.data_i = fifo.we, fifo.writable, fifo.din + m.d.comb += fp._connect_in(self.p, fn=processfn) + + # next: make the FIFO (Queue object) "look" like a NextControl... + m.submodules.fn = fn = NextControl() + fn.valid_o, fn.ready_i, fn.data_o = fifo.readable, fifo.re, fifo.dout + connections = fn._connect_out(self.n, fn=nmoperator.cat) + valid_eq, ready_eq, data_o = connections + + # ok ok so we can't just do the ready/valid eqs straight: + # first 2 from connections are the ready/valid, 3rd is data. + if self.fwft: + m.d.comb += [valid_eq, ready_eq] # combinatorial on next ready/valid + else: + m.d.sync += [valid_eq, ready_eq] # non-fwft mode needs sync + data_o = self._postprocess(data_o) # XXX TBD, does nothing right now + m.d.comb += data_o + + return m + + +# aka "RegStage". 
+# NOTE(review): the three classes below REDEFINE (shadow) the hand-coded
+# UnbufferedPipeline / PassThroughHandshake / BufferedHandshake classes
+# declared earlier in this same module, replacing them with FIFOControl
+# (Queue-backed) equivalents.  only these later definitions survive at
+# import time; the earlier implementations become dead code.  the comments
+# ("aka ...") suggest this is deliberate -- confirm before relying on it.
+class UnbufferedPipeline(FIFOControl):
+    def __init__(self, stage, in_multi=None, stage_ctl=False):
+        # depth-1 FIFO, first-word-fall-through, non-pipe mode
+        FIFOControl.__init__(self, 1, stage, in_multi, stage_ctl,
+                             fwft=True, pipe=False)
+
+# aka "BreakReadyStage" XXX had to set fwft=True to get it to work
+class PassThroughHandshake(FIFOControl):
+    def __init__(self, stage, in_multi=None, stage_ctl=False):
+        # depth-1 FIFO, fwft, pipe mode (ready/valid registered)
+        FIFOControl.__init__(self, 1, stage, in_multi, stage_ctl,
+                             fwft=True, pipe=True)
+
+# this is *probably* BufferedHandshake, although test #997 now succeeds.
+class BufferedHandshake(FIFOControl):
+    def __init__(self, stage, in_multi=None, stage_ctl=False):
+        # depth-2 FIFO: the extra entry provides the "buffer" slot that
+        # absorbs one stall cycle, as the hand-coded version did
+        FIFOControl.__init__(self, 2, stage, in_multi, stage_ctl,
+                             fwft=True, pipe=False)
+
+
+"""
+# this is *probably* SimpleHandshake (note: memory cell size=0)
+class SimpleHandshake(FIFOControl):
+    def __init__(self, stage, in_multi=None, stage_ctl=False):
+        FIFOControl.__init__(self, 0, stage, in_multi, stage_ctl,
+                             fwft=True, pipe=False)
+"""
diff --git a/src/nmutil/stageapi.py b/src/nmutil/stageapi.py
new file mode 100644
index 0000000..b709abd
--- /dev/null
+++ b/src/nmutil/stageapi.py
@@ -0,0 +1,280 @@
+""" Stage API
+
+    Associated development bugs:
+    * http://bugs.libre-riscv.org/show_bug.cgi?id=148
+    * http://bugs.libre-riscv.org/show_bug.cgi?id=64
+    * http://bugs.libre-riscv.org/show_bug.cgi?id=57
+
+    Stage API:
+    ---------
+
+    stage requires compliance with a strict API that may be
+    implemented in several means, including as a static class.
+
+    Stages do not HOLD data, and they definitely do not contain
+    signalling (ready/valid).  They do however specify the FORMAT
+    of the incoming and outgoing data, and they provide a means to
+    PROCESS that data (from incoming format to outgoing format).
+
+    Stage Blocks really should be combinatorial blocks (Moore FSMs).
+    It would be ok to have input come in from sync'd sources
+    (clock-driven, Mealy FSMs), however by doing so they would no longer
+    be deterministic, and chaining such blocks with such side-effects
+    together could result in unexpected, unpredictable, unreproducible
+    behaviour.
+
+    So this is generally to be avoided, unless you know what you are doing.
+    https://en.wikipedia.org/wiki/Moore_machine
+    https://en.wikipedia.org/wiki/Mealy_machine
+
+    the methods of a stage instance must be as follows:
+
+    * ispec() - Input data format specification.  Takes a bit of explaining.
+                The requirements are: something that eventually derives from
+                nmigen Value must be returned *OR* an iterator or iterable
+                or sequence (list, tuple etc.) or generator must *yield*
+                thing(s) that (eventually) derive from the nmigen Value class.
+
+                Complex to state, very simple in practice:
+                see test_buf_pipe.py for over 25 worked examples.
+
+    * ospec() - Output data format specification.
+                format requirements identical to ispec.
+
+    * process(i) - Optional function for processing ispec-formatted data.
+                returns a combinatorial block of a result that
+                may be assigned to the output, by way of the "nmoperator.eq"
+                function.  Note that what is returned here can be
+                extremely flexible.  Even a dictionary can be returned
+                as long as it has fields that match precisely with the
+                Record into which its values are intended to be assigned.
+                Again: see example unit tests for details.
+
+    * setup(m, i) - Optional function for setting up submodules.
+                may be used for more complex stages, to link
+                the input (i) to submodules.  must take responsibility
+                for adding those submodules to the module (m).
+                the submodules must be combinatorial blocks and
+                must have their inputs and output linked combinatorially.
+ + Both StageCls (for use with non-static classes) and Stage (for use + by static classes) are abstract classes from which, for convenience + and as a courtesy to other developers, anything conforming to the + Stage API may *choose* to derive. See Liskov Substitution Principle: + https://en.wikipedia.org/wiki/Liskov_substitution_principle + + StageChain: + ---------- + + A useful combinatorial wrapper around stages that chains them together + and then presents a Stage-API-conformant interface. By presenting + the same API as the stages it wraps, it can clearly be used recursively. + + StageHelper: + ---------- + + A convenience wrapper around a Stage-API-compliant "thing" which + complies with the Stage API and provides mandatory versions of + all the optional bits. +""" + +from nmigen import Elaboratable +from abc import ABCMeta, abstractmethod +import inspect + +from nmutil import nmoperator + + +def _spec(fn, name=None): + """ useful function that determines if "fn" has an argument "name". + if so, fn(name) is called otherwise fn() is called. + + means that ispec and ospec can be declared with *or without* + a name argument. normally it would be necessary to have + "ispec(name=None)" to achieve the same effect. + """ + if name is None: + return fn() + varnames = dict(inspect.getmembers(fn.__code__))['co_varnames'] + if 'name' in varnames: + return fn(name=name) + return fn() + + +class StageCls(metaclass=ABCMeta): + """ Class-based "Stage" API. requires instantiation (after derivation) + + see "Stage API" above.. Note: python does *not* require derivation + from this class. All that is required is that the pipelines *have* + the functions listed in this class. Derivation from this class + is therefore merely a "courtesy" to maintainers. 
+ """ + @abstractmethod + def ispec(self): pass # REQUIRED + @abstractmethod + def ospec(self): pass # REQUIRED + #@abstractmethod + #def setup(self, m, i): pass # OPTIONAL + #@abstractmethod + #def process(self, i): pass # OPTIONAL + + +class Stage(metaclass=ABCMeta): + """ Static "Stage" API. does not require instantiation (after derivation) + + see "Stage API" above. Note: python does *not* require derivation + from this class. All that is required is that the pipelines *have* + the functions listed in this class. Derivation from this class + is therefore merely a "courtesy" to maintainers. + """ + @staticmethod + @abstractmethod + def ispec(): pass + + @staticmethod + @abstractmethod + def ospec(): pass + + #@staticmethod + #@abstractmethod + #def setup(m, i): pass + + #@staticmethod + #@abstractmethod + #def process(i): pass + + +class StageHelper(Stage): + """ a convenience wrapper around something that is Stage-API-compliant. + (that "something" may be a static class, for example). 
+ + StageHelper happens to also be compliant with the Stage API, + it differs from the stage that it wraps in that all the "optional" + functions are provided (hence the designation "convenience wrapper") + """ + def __init__(self, stage): + self.stage = stage + self._ispecfn = None + self._ospecfn = None + if stage is not None: + self.set_specs(self, self) + + def ospec(self, name=None): + assert self._ospecfn is not None + return _spec(self._ospecfn, name) + + def ispec(self, name=None): + assert self._ispecfn is not None + return _spec(self._ispecfn, name) + + def set_specs(self, p, n): + """ sets up the ispecfn and ospecfn for getting input and output data + """ + if hasattr(p, "stage"): + p = p.stage + if hasattr(n, "stage"): + n = n.stage + self._ispecfn = p.ispec + self._ospecfn = n.ospec + + def new_specs(self, name): + """ allocates new ispec and ospec pair + """ + return (_spec(self.ispec, "%s_i" % name), + _spec(self.ospec, "%s_o" % name)) + + def process(self, i): + if self.stage and hasattr(self.stage, "process"): + return self.stage.process(i) + return i + + def setup(self, m, i): + if self.stage is not None and hasattr(self.stage, "setup"): + self.stage.setup(m, i) + + def _postprocess(self, i): # XXX DISABLED + return i # RETURNS INPUT + if hasattr(self.stage, "postprocess"): + return self.stage.postprocess(i) + return i + + +class StageChain(StageHelper): + """ pass in a list of stages (combinatorial blocks), and they will + automatically be chained together via their input and output specs + into a combinatorial chain, to create one giant combinatorial + block. + + the end result conforms to the exact same Stage API. + + * input to this class will be the input of the first stage + * output of first stage goes into input of second + * output of second goes into input into third + * ... (etc. etc.) 
+        * the output of this class will be the output of the last stage
+
+        NOTE: whilst this is very similar to ControlBase.connect(), it is
+        *really* important to appreciate that StageChain is pure
+        combinatorial and bypasses (does not involve, at all, ready/valid
+        signalling OF ANY KIND).
+
+        ControlBase.connect on the other hand respects, connects, and uses
+        ready/valid signalling.
+
+        Arguments:
+
+        * :chain: a chain of combinatorial blocks conforming to the Stage API
+                  NOTE: StageChain.ispec and ospec have to have something
+                  to return (beginning and end specs of the chain),
+                  therefore the chain argument must be non-zero length
+
+        * :specallocate: if set, new input and output data will be allocated
+                         and connected (eq'd) to each chained Stage.
+                         in some cases if this is not done, the nmigen warning
+                         "driving from two sources, module is being flattened"
+                         will be issued.
+
+        NOTE: DO NOT use StageChain with combinatorial blocks that have
+        side-effects (state-based / clock-based input) or conditional
+        (inter-chain) dependencies, unless you really know what you are doing.
+ """ + def __init__(self, chain, specallocate=False): + assert len(chain) > 0, "stage chain must be non-zero length" + self.chain = chain + StageHelper.__init__(self, None) + if specallocate: + self.setup = self._sa_setup + else: + self.setup = self._na_setup + self.set_specs(self.chain[0], self.chain[-1]) + + def _sa_setup(self, m, i): + for (idx, c) in enumerate(self.chain): + if hasattr(c, "setup"): + c.setup(m, i) # stage may have some module stuff + ofn = self.chain[idx].ospec # last assignment survives + cname = 'chainin%d' % idx + o = _spec(ofn, cname) + if isinstance(o, Elaboratable): + setattr(m.submodules, cname, o) + m.d.comb += nmoperator.eq(o, c.process(i)) # process input into "o" + if idx == len(self.chain)-1: + break + ifn = self.chain[idx+1].ispec # new input on next loop + i = _spec(ifn, 'chainin%d' % (idx+1)) + m.d.comb += nmoperator.eq(i, o) # assign to next input + self.o = o + return self.o # last loop is the output + + def _na_setup(self, m, i): + for (idx, c) in enumerate(self.chain): + if hasattr(c, "setup"): + c.setup(m, i) # stage may have some module stuff + i = o = c.process(i) # store input into "o" + self.o = o + return self.o # last loop is the output + + def process(self, i): + return self.o # conform to Stage API: return last-loop output + + diff --git a/src/nmutil/test/__init__.py b/src/nmutil/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nmutil/test/example_buf_pipe.py b/src/nmutil/test/example_buf_pipe.py new file mode 100644 index 0000000..61e9b13 --- /dev/null +++ b/src/nmutil/test/example_buf_pipe.py @@ -0,0 +1,103 @@ +""" Pipeline and BufferedHandshake examples +""" + +from nmutil.nmoperator import eq +from nmutil.iocontrol import (PrevControl, NextControl) +from nmutil.singlepipe import (PrevControl, NextControl, ControlBase, + StageCls, Stage, StageChain, + BufferedHandshake, UnbufferedPipeline) + +from nmigen import Signal, Module +from nmigen.cli import verilog, rtlil + + +class 
ExampleAddStage(StageCls): + """ an example of how to use the buffered pipeline, as a class instance + """ + + def ispec(self): + """ returns a tuple of input signals which will be the incoming data + """ + return (Signal(16), Signal(16)) + + def ospec(self): + """ returns an output signal which will happen to contain the sum + of the two inputs + """ + return Signal(16) + + def process(self, i): + """ process the input data (sums the values in the tuple) and returns it + """ + return i[0] + i[1] + + +class ExampleBufPipeAdd(BufferedHandshake): + """ an example of how to use the buffered pipeline, using a class instance + """ + + def __init__(self): + addstage = ExampleAddStage() + BufferedHandshake.__init__(self, addstage) + + +class ExampleStage(Stage): + """ an example of how to use the buffered pipeline, in a static class + fashion + """ + + def ispec(): + return Signal(16, name="example_input_signal") + + def ospec(): + return Signal(16, name="example_output_signal") + + def process(i): + """ process the input data and returns it (adds 1) + """ + return i + 1 + + +class ExampleStageCls(StageCls): + """ an example of how to use the buffered pipeline, in a static class + fashion + """ + + def ispec(self): + return Signal(16, name="example_input_signal") + + def ospec(self): + return Signal(16, name="example_output_signal") + + def process(self, i): + """ process the input data and returns it (adds 1) + """ + return i + 1 + + +class ExampleBufPipe(BufferedHandshake): + """ an example of how to use the buffered pipeline. + """ + + def __init__(self): + BufferedHandshake.__init__(self, ExampleStage) + + +class ExamplePipeline(UnbufferedPipeline): + """ an example of how to use the unbuffered pipeline. 
+ """ + + def __init__(self): + UnbufferedPipeline.__init__(self, ExampleStage) + + +if __name__ == '__main__': + dut = ExampleBufPipe() + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_bufpipe.il", "w") as f: + f.write(vl) + + dut = ExamplePipeline() + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_combpipe.il", "w") as f: + f.write(vl) diff --git a/src/nmutil/test/test_buf_pipe.py b/src/nmutil/test/test_buf_pipe.py new file mode 100644 index 0000000..f0bacbb --- /dev/null +++ b/src/nmutil/test/test_buf_pipe.py @@ -0,0 +1,1563 @@ +""" Unit tests for Buffered and Unbuffered pipelines + + contains useful worked examples of how to use the Pipeline API, + including: + + * Combinatorial Stage "Chaining" + * class-based data stages + * nmigen module-based data stages + * special nmigen module-based data stage, where the stage *is* the module + * Record-based data stages + * static-class data stages + * multi-stage pipelines (and how to connect them) + * how to *use* the pipelines (see Test5) - how to get data in and out + +""" + +from nmigen import Module, Signal, Mux, Const, Elaboratable +from nmigen.hdl.rec import Record +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil + +from nmutil.test.example_buf_pipe import ExampleBufPipe, ExampleBufPipeAdd +from nmutil.test.example_buf_pipe import ExamplePipeline, UnbufferedPipeline +from nmutil.test.example_buf_pipe import ExampleStageCls +from nmutil.iocontrol import PrevControl, NextControl +from nmutil.stageapi import StageChain, StageCls +from nmutil.singlepipe import ControlBase +from nmutil.singlepipe import UnbufferedPipeline2 +from nmutil.singlepipe import SimpleHandshake +from nmutil.singlepipe import BufferedHandshake +from nmutil.singlepipe import PassThroughHandshake +from nmutil.singlepipe import PassThroughStage +from nmutil.singlepipe import FIFOControl +from nmutil.singlepipe import RecordObject +from nmutil.singlepipe import MaskCancellable + +from 
from random import randint, seed

#seed(4)


def check_o_n_valid(dut, val):
    """Assert that the next-stage 'valid' output equals *val*.

    Simulation-generator helper: yields dut.n.valid_o to the simulator,
    receives the sampled value back, and asserts it matches.
    """
    o_n_valid = yield dut.n.valid_o
    assert o_n_valid == val


def check_o_n_valid2(dut, val):
    """Same check as check_o_n_valid.

    Kept as a separate name because tbench2 uses it; the original was a
    verbatim copy-paste of check_o_n_valid, so delegate instead of
    duplicating the code.
    """
    yield from check_o_n_valid(dut, val)


def tbench(dut):
    """Hand-written testbench for a single-register buffered pipe.

    Drives data into dut.p (previous/input side) and watches dut.n
    (next/output side), checking that valid_o appears one clock after
    valid_i, and that buffered values drain out correctly after a stall.
    """
    #yield dut.i_p_rst.eq(1)
    yield dut.n.ready_i.eq(0)
    #yield dut.p.ready_o.eq(0)
    yield
    yield
    #yield dut.i_p_rst.eq(0)
    yield dut.n.ready_i.eq(1)
    yield dut.p.data_i.eq(5)
    yield dut.p.valid_i.eq(1)
    yield

    yield dut.p.data_i.eq(7)
    yield from check_o_n_valid(dut, 0)  # effects of i_p_valid delayed
    yield
    yield from check_o_n_valid(dut, 1)  # ok *now* i_p_valid effect is felt

    yield dut.p.data_i.eq(2)
    yield
    # begin going into "stall" (next stage says not-ready)
    yield dut.n.ready_i.eq(0)
    yield dut.p.data_i.eq(9)
    yield
    yield dut.p.valid_i.eq(0)
    yield dut.p.data_i.eq(12)
    yield
    yield dut.p.data_i.eq(32)
    yield dut.n.ready_i.eq(1)
    yield
    yield from check_o_n_valid(dut, 1)  # buffer still needs to output
    yield
    yield from check_o_n_valid(dut, 1)  # buffer still needs to output
    yield
    yield from check_o_n_valid(dut, 0)  # buffer outputted, *now* we're done.
    yield
+ yield + + +def tbench2(dut): + #yield dut.p.i_rst.eq(1) + yield dut.n.ready_i.eq(0) + #yield dut.p.ready_o.eq(0) + yield + yield + #yield dut.p.i_rst.eq(0) + yield dut.n.ready_i.eq(1) + yield dut.p.data_i.eq(5) + yield dut.p.valid_i.eq(1) + yield + + yield dut.p.data_i.eq(7) + yield from check_o_n_valid2(dut, 0) # effects of i_p_valid delayed 2 clocks + yield + yield from check_o_n_valid2(dut, 0) # effects of i_p_valid delayed 2 clocks + + yield dut.p.data_i.eq(2) + yield + yield from check_o_n_valid2(dut, 1) # ok *now* i_p_valid effect is felt + yield dut.n.ready_i.eq(0) # begin going into "stall" (next stage says ready) + yield dut.p.data_i.eq(9) + yield + yield dut.p.valid_i.eq(0) + yield dut.p.data_i.eq(12) + yield + yield dut.p.data_i.eq(32) + yield dut.n.ready_i.eq(1) + yield + yield from check_o_n_valid2(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid2(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid2(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid2(dut, 0) # buffer outputted, *now* we're done. 
class Test3:
    """Random-pacing send/receive driver for a single-data-word pipeline.

    Feeds sequential values 1..num_tests into dut.p with randomised gaps,
    pulls results out of dut.n with randomised stalls, and checks each
    output via resultfn.  ``num_tests`` is a module-level global.
    """

    def __init__(self, dut, resultfn):
        self.dut = dut
        self.resultfn = resultfn
        self.data = []
        for i in range(num_tests):
            #data.append(randint(0, (1 << 16) - 1))
            self.data.append(i + 1)
        self.i = 0  # index of next value to send
        self.o = 0  # index of next value expected out

    def send(self):
        """Generator: drive valid_i/data_i with random pacing."""
        while self.o != len(self.data):
            send_range = randint(0, 3)
            for j in range(randint(1, 10)):
                if send_range == 0:
                    send = True
                else:
                    send = randint(0, send_range) != 0
                o_p_ready = yield self.dut.p.ready_o
                if not o_p_ready:
                    # pipeline not accepting input: wait a cycle
                    yield
                    continue
                if send and self.i != len(self.data):
                    yield self.dut.p.valid_i.eq(1)
                    yield self.dut.p.data_i.eq(self.data[self.i])
                    self.i += 1
                else:
                    yield self.dut.p.valid_i.eq(0)
                yield

    def rcv(self):
        """Generator: randomly stall ready_i and check each output."""
        while self.o != len(self.data):
            ready_range = randint(0, 3)
            for j in range(randint(1, 10)):
                # the level driven onto ready_i (original misleadingly
                # called this "stall": True means ready, not stalled)
                ready = randint(0, ready_range) != 0
                yield self.dut.n.ready_i.eq(ready)
                yield
                o_n_valid = yield self.dut.n.valid_o
                i_n_ready = yield self.dut.n.ready_i_test
                if not o_n_valid or not i_n_ready:
                    continue
                data_o = yield self.dut.n.data_o
                self.resultfn(data_o, self.data[self.o], self.i, self.o)
                self.o += 1
                if self.o == len(self.data):
                    break


def resultfn_3(data_o, expected, i, o):
    """Check the pipeline (an add-1 stage) produced expected+1.

    i/o send/receive indices are included in the failure message.
    """
    assert data_o == expected + 1, \
        "%d-%d data %x not match %x\n" \
        % (i, o, data_o, expected)


def data_placeholder():
    """Return num_tests PlaceHolder objects with random 16-bit src1/src2.

    NOTE(review): the original used ``1<<16-1`` which, by operator
    precedence, is ``1 << 15`` (32768), not the clearly-intended 16-bit
    maximum; fixed to ``(1 << 16) - 1`` (65535).
    """
    data = []
    for i in range(num_tests):
        d = PlaceHolder()
        d.src1 = randint(0, (1 << 16) - 1)
        d.src2 = randint(0, (1 << 16) - 1)
        data.append(d)
    return data


def data_dict():
    """Return num_tests {'src1','src2'} dicts with random 16-bit values
    (same ``(1 << 16) - 1`` precedence fix as data_placeholder)."""
    data = []
    for i in range(num_tests):
        data.append({'src1': randint(0, (1 << 16) - 1),
                     'src2': randint(0, (1 << 16) - 1)})
    return data


class Test5:
    """Random-pacing driver for pipelines taking (src1, src2) operands.

    data may be supplied explicitly; otherwise num_tests random 16-bit
    pairs are generated (precedence fix applied as above).
    """

    def __init__(self, dut, resultfn, data=None, stage_ctl=False):
        self.dut = dut
        self.resultfn = resultfn
        self.stage_ctl = stage_ctl
        if data:
            self.data = data
        else:
            self.data = []
            for i in range(num_tests):
                self.data.append((randint(0, (1 << 16) - 1),
                                  randint(0, (1 << 16) - 1)))
        self.i = 0  # index of next operand pair to send
        self.o = 0  # index of next result expected out
def send(self): + while self.o != len(self.data): + send_range = randint(0, 3) + for j in range(randint(1,10)): + if send_range == 0: + send = True + else: + send = randint(0, send_range) != 0 + #send = True + o_p_ready = yield self.dut.p.ready_o + if not o_p_ready: + yield + continue + if send and self.i != len(self.data): + yield self.dut.p.valid_i.eq(1) + for v in self.dut.set_input(self.data[self.i]): + yield v + self.i += 1 + else: + yield self.dut.p.valid_i.eq(0) + yield + + def rcv(self): + while self.o != len(self.data): + stall_range = randint(0, 3) + for j in range(randint(1,10)): + ready = randint(0, stall_range) != 0 + #ready = True + yield self.dut.n.ready_i.eq(ready) + yield + o_n_valid = yield self.dut.n.valid_o + i_n_ready = yield self.dut.n.ready_i_test + if not o_n_valid or not i_n_ready: + continue + if isinstance(self.dut.n.data_o, Record): + data_o = {} + dod = self.dut.n.data_o + for k, v in dod.fields.items(): + data_o[k] = yield v + else: + data_o = yield self.dut.n.data_o + self.resultfn(data_o, self.data[self.o], self.i, self.o) + self.o += 1 + if self.o == len(self.data): + break + +class TestMask: + def __init__(self, dut, resultfn, maskwid, data=None, stage_ctl=False, + latching=False): + self.dut = dut + self.resultfn = resultfn + self.stage_ctl = stage_ctl + self.maskwid = maskwid + self.latching = latching + self.latchmode = 0 + if data: + self.data = data + else: + self.data = [] + for i in range(num_tests): + self.data.append((randint(0, 1<<16-1), randint(0, 1<<16-1))) + self.i = 0 + self.o = 0 + + def send(self): + while self.o != len(self.data): + send_range = randint(0, 3) + for j in range(randint(1,10)): + if send_range == 0: + send = True + else: + send = randint(0, send_range) != 0 + #send = True + o_p_ready = yield self.dut.p.ready_o + if not o_p_ready: + yield + continue + + if self.latching: + latchtest = randint(0, 3) == 0 + if latchtest: + yield self.dut.p.valid_i.eq(0) + yield self.dut.p.mask_i.eq(0) + # wait for 
pipeline to flush, then invert state + for i in range(10): + yield + self.latchmode = 1 - self.latchmode + yield self.dut.latchmode.eq(self.latchmode) + mode = yield self.dut.latchmode + print ("latching", mode) + + if send and self.i != len(self.data): + print ("send", self.i, self.data[self.i]) + yield self.dut.p.valid_i.eq(1) + yield self.dut.p.mask_i.eq(1<