X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fexperiment%2Falu_hier.py;h=459bbd951cb41a35e5f06089162e365fd8b03d9b;hb=08ed759bb115235238babcb11c584424b237abc8;hp=9fd21c493a486199e09d02cd97d18e816570ff98;hpb=4de4739d2e1cca5a84e888657f41fd335cdab9ce;p=soc.git

diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py
index 9fd21c49..459bbd95 100644
--- a/src/soc/experiment/alu_hier.py
+++ b/src/soc/experiment/alu_hier.py
@@ -14,85 +14,37 @@ from nmigen.hdl.rec import Record, Layout
 from nmigen.cli import main
 from nmigen.cli import verilog, rtlil
 from nmigen.compat.sim import run_simulation
+from nmutil.extend import exts
+from nmutil.gtkw import write_gtkw
 
-from soc.decoder.power_enums import InternalOp, CryIn
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
+                                        is_engine_pysim)
 
-import operator
+from openpower.decoder.decode2execute1 import Data
+from openpower.decoder.power_enums import MicrOp, Function, CryIn
 
+from soc.fu.alu.alu_input_record import CompALUOpSubset
+from soc.fu.cr.cr_input_record import CompCROpSubset
 
-class CompALUOpSubset(Record):
-    """CompALUOpSubset
-
-    a copy of the relevant subset information from Decode2Execute1Type
-    needed for ALU operations.
-    """
-    def __init__(self):
-        layout = (('insn_type', InternalOp),
-                  ('nia', 64),
-                  ('imm_data', Layout((("imm", 64), ("imm_ok", 1)))),
-                    #'cr = Signal(32, reset_less=True) # NO: this is from the CR SPR
-                    #'xerc = XerBits() # NO: this is from the XER SPR
-                  ('lk', 1),
-                  ('rc', Layout((("rc", 1), ("rc_ok", 1)))),
-                  ('oe', Layout((("oe", 1), ("oe_ok", 1)))),
-                  ('invert_a', 1),
-                  ('invert_out', 1),
-                  ('input_carry', CryIn),
-                  ('output_carry', 1),
-                  ('input_cr', 1),
-                  ('output_cr', 1),
-                  ('is_32bit', 1),
-                  ('is_signed', 1),
-                  ('byte_reverse', 1),
-                  ('sign_extend', 1))
-
-        Record.__init__(self, Layout(layout))
-
-        # grrr.  Record does not have kwargs
-        self.insn_type.reset_less = True
-        self.nia.reset_less = True
-        #self.cr = Signal(32, reset_less = True
-        #self.xerc = XerBits(
-        self.lk.reset_less = True
-        self.invert_a.reset_less = True
-        self.invert_out.reset_less = True
-        self.input_carry.reset_less = True
-        self.output_carry.reset_less = True
-        self.input_cr.reset_less = True
-        self.output_cr.reset_less = True
-        self.is_32bit.reset_less = True
-        self.is_signed.reset_less = True
-        self.byte_reverse.reset_less = True
-        self.sign_extend.reset_less = True
+from soc.fu.pipe_data import FUBaseData
+from soc.fu.alu.pipe_data import CommonPipeSpec
+from soc.fu.compunits.compunits import FunctionUnitBaseSingle
+
+import operator
 
-    def ports(self):
-        return [self.insn_type,
-                self.nia,
-                #self.cr,
-                #self.xerc,
-                self.lk,
-                self.invert_a,
-                self.invert_out,
-                self.input_carry,
-                self.output_carry,
-                self.input_cr,
-                self.output_cr,
-                self.is_32bit,
-                self.is_signed,
-                self.byte_reverse,
-                self.sign_extend,
-        ]
 
 class Adder(Elaboratable):
     def __init__(self, width):
-        self.invert_a = Signal()
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.invert_in = Signal()
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="add_o")
 
     def elaborate(self, platform):
         m = Module()
-        with m.If(self.invert_a):
+        with m.If(self.invert_in):
             m.d.comb += self.o.eq((~self.a) + self.b)
         with m.Else():
             m.d.comb += self.o.eq(self.a + self.b)
@@ -101,9 +53,9 @@ class Adder(Elaboratable):
 
 class Subtractor(Elaboratable):
     def __init__(self, width):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="sub_o")
 
     def elaborate(self, platform):
         m = Module()
@@ -113,9 +65,9 @@ class Subtractor(Elaboratable):
 
 class Multiplier(Elaboratable):
     def __init__(self, width):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="mul_o")
 
     def elaborate(self, platform):
         m = Module()
@@ -126,109 +78,321 @@ class Multiplier(Elaboratable):
 class Shifter(Elaboratable):
     def __init__(self, width):
         self.width = width
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="shf_o")
 
     def elaborate(self, platform):
         m = Module()
         btrunc = Signal(self.width)
-        m.d.comb += btrunc.eq(self.b & Const((1<<self.width)-1))
+        m.d.comb += btrunc.eq(self.b & Const((1 << self.width)-1))
         m.d.comb += self.o.eq(self.a >> btrunc)
         return m
 
 
+class SignExtend(Elaboratable):
+    def __init__(self, width):
+        self.width = width
+        self.a = Signal(width)
+        self.o = Signal(width, name="exts_o")
+
+    def elaborate(self, platform):
+        m = Module()
+        m.d.comb += self.o.eq(exts(self.a, 8, self.width))
+        return m
+
+
+class Dummy:
+    pass
+
+
+class DummyALU(Elaboratable):
+    def __init__(self, width):
+        self.p = Dummy()  # make look like nmutil pipeline API
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
+        self.n = Dummy()  # make look like nmutil pipeline API
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
+        self.counter = Signal(4)
+        self.op = CompCROpSubset()
+        i = []
+        i.append(Signal(width, name="i1"))
+        i.append(Signal(width, name="i2"))
+        i.append(Signal(width, name="i3"))
+        self.i = i
+        self.a, self.b, self.c = i[0], i[1], i[2]
+        self.out = tuple([Signal(width, name="alu_o")])
+        self.o = self.out[0]
+        self.width = width
+        # more "look like nmutil pipeline API"
+        self.p.i_data.ctx.op = self.op
+        self.p.i_data.a = self.a
+        self.p.i_data.b = self.b
+        self.p.i_data.c = self.c
+        self.n.o_data.o = self.o
+
+    def elaborate(self, platform):
+        m = Module()
+
+        go_now = Signal(reset_less=True)  # testing no-delay ALU
+
+        with m.If(self.p.i_valid):
+            # input is valid. next check, if we already said "ready" or not
+            with m.If(~self.p.o_ready):
+                # we didn't say "ready" yet, so say so and initialise
+                m.d.sync += self.p.o_ready.eq(1)
+
+                m.d.sync += self.o.eq(self.a)
+                m.d.comb += go_now.eq(1)
+                m.d.sync += self.counter.eq(1)
+
+        with m.Else():
+            # input says no longer valid, so drop ready as well.
+            # a "proper" ALU would have had to sync in the opcode and a/b ops
+            m.d.sync += self.p.o_ready.eq(0)
+
+        # ok so the counter's running: when it gets to 1, fire the output
+        with m.If((self.counter == 1) | go_now):
+            # set the output as valid if the recipient is ready for it
+            m.d.sync += self.n.o_valid.eq(1)
+        with m.If(self.n.i_ready & self.n.o_valid):
+            m.d.sync += self.n.o_valid.eq(0)
+            # recipient said it was ready: reset back to known-good.
+            m.d.sync += self.counter.eq(0)  # reset the counter
+            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
+
+        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
+        with m.If(self.counter > 1):
+            m.d.sync += self.counter.eq(self.counter - 1)
+
+        return m
+
+    def __iter__(self):
+        yield from self.op.ports()
+        yield self.a
+        yield self.b
+        yield self.c
+        yield self.o
+
+    def ports(self):
+        return list(self)
+
+#####################
+# converting even this dummy ALU over to the FunctionUnit RegSpecs API
+# which, errr, note that the regspecs are totally ignored below, but
+# at least the widths are all 64-bit so it's okay.
+#####################
+
+# input (and output) for logical initial stage (common input)
+
+
+class ALUInputData(FUBaseData):
+    regspec = [('INT', 'a', '0:63'),  # RA
+               ('INT', 'b', '0:63'),  # RB/immediate
+               ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, False)
+
+
+# output from ALU final stage
+class ALUOutputData(FUBaseData):
+    regspec = [('INT', 'o', '0:63'),        # RT
+               ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, True)
+
+
+# ALU pipe specification class
+class ALUPipeSpec(CommonPipeSpec):
+    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
+    opsubsetkls = CompALUOpSubset
+
+
+class ALUFunctionUnit(FunctionUnitBaseSingle):
+    # class ALUFunctionUnit(FunctionUnitBaseMulti):
+    fnunit = Function.ALU
+
+    def __init__(self, idx, parent_pspec):
+        super().__init__(ALUPipeSpec, ALU, 1, parent_pspec)
+
+
 class ALU(Elaboratable):
     def __init__(self, width):
-        self.p_valid_i = Signal()
-        self.p_ready_o = Signal()
-        self.n_ready_i = Signal()
-        self.n_valid_o = Signal()
-        self.counter   = Signal(4)
-        self.op  = CompALUOpSubset()
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        # XXX major temporary hack: attempting to convert
+        # ALU over to RegSpecs API, FunctionUnitBaseSingle passes in
+        # a regspec here which we can't cope with.  therefore, errr...
+        # just throw it away and set the width to 64
+        if not isinstance(width, int):
+            width = 64
+        # TODO, really this should just inherit from ControlBase it would
+        # be a lot less messy.
+        self.p = Dummy()  # make look like nmutil pipeline API
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
+        self.n = Dummy()  # make look like nmutil pipeline API
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
+        self.counter = Signal(4)
+        self.op = CompALUOpSubset(name="op")
+        i = []
+        i.append(Signal(width, name="i1"))
+        i.append(Signal(width, name="i2"))
+        self.i = i
+        self.a, self.b = i[0], i[1]
+        out = []
+        out.append(Data(width, name="alu_o"))
+        out.append(Data(width, name="alu_cr"))
+        self.out = tuple(out)
+        self.o = self.out[0]
+        self.cr = self.out[1]
         self.width = width
+        # more "look like nmutil ControlBase pipeline API" stuff
+        self.p.i_data.ctx.op = self.op
+        self.p.i_data.a = self.a
+        self.p.i_data.b = self.b
+        self.n.o_data.o = self.o
+        self.n.o_data.cr = self.cr
 
     def elaborate(self, platform):
         m = Module()
         add = Adder(self.width)
         mul = Multiplier(self.width)
         shf = Shifter(self.width)
+        sub = Subtractor(self.width)
+        ext_sign = SignExtend(self.width)
 
         m.submodules.add = add
         m.submodules.mul = mul
         m.submodules.shf = shf
+        m.submodules.sub = sub
+        m.submodules.ext_sign = ext_sign
 
         # really should not activate absolutely all ALU inputs like this
-        for mod in [add, mul, shf]:
+        for mod in [add, mul, shf, sub]:
             m.d.comb += [
                 mod.a.eq(self.a),
                 mod.b.eq(self.b),
             ]
+        # EXTS sign extends the first input
+        with m.If(self.op.insn_type == MicrOp.OP_EXTS):
+            m.d.comb += ext_sign.a.eq(self.a)
+        # EXTSWSLI sign extends the second input
+        with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
+            m.d.comb += ext_sign.a.eq(self.b)
 
         # pass invert (and carry later)
-        m.d.comb += add.invert_a.eq(self.op.invert_a)
+        m.d.comb += add.invert_in.eq(self.op.invert_in)
 
-        go_now = Signal(reset_less=True) # testing no-delay ALU
+        go_now = Signal(reset_less=True)  # testing no-delay ALU
 
-        with m.If(self.p_valid_i):
-            # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p_ready_o):
-                # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p_ready_o.eq(1)
+        # ALU sequencer is idle when the count is zero
+        alu_idle = Signal(reset_less=True)
+        m.d.comb += alu_idle.eq(self.counter == 0)
+
+        # ALU sequencer is done when the count is one
+        alu_done = Signal(reset_less=True)
+        m.d.comb += alu_done.eq(self.counter == 1)
+
+        # select handshake handling according to ALU type
+        with m.If(go_now):
+            # with a combinatorial, no-delay ALU, just pass through
+            # the handshake signals to the other side
+            m.d.comb += self.p.o_ready.eq(self.n.i_ready)
+            m.d.comb += self.n.o_valid.eq(self.p.i_valid)
+        with m.Else():
+            # sequential ALU handshake:
+            # o_ready responds to i_valid, but only if the ALU is idle
+            m.d.comb += self.p.o_ready.eq(alu_idle)
+            # select the internally generated o_valid, above
+            m.d.comb += self.n.o_valid.eq(alu_done)
+
+        # hold the ALU result until o_ready is asserted
+        alu_r = Signal(self.width)
+
+        # output masks
+        # NOP and ILLEGAL don't output anything
+        with m.If((self.op.insn_type != MicrOp.OP_NOP) &
+                  (self.op.insn_type != MicrOp.OP_ILLEGAL)):
+            m.d.comb += self.o.ok.eq(1)
+        # CR is output when rc bit is active
+        m.d.comb += self.cr.ok.eq(self.op.rc.rc)
+
+        with m.If(alu_idle):
+            with m.If(self.p.i_valid):
 
                 # as this is a "fake" pipeline, just grab the output right now
-                with m.If(self.op.insn_type == InternalOp.OP_ADD):
-                    m.d.sync += self.o.eq(add.o)
-                with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64):
-                    m.d.sync += self.o.eq(mul.o)
-                with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
-                    m.d.sync += self.o.eq(shf.o)
-                # TODO: SUB
+                with m.If(self.op.insn_type == MicrOp.OP_ADD):
+                    m.d.sync += alu_r.eq(add.o)
+                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
+                    m.d.sync += alu_r.eq(mul.o)
+                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
+                    m.d.sync += alu_r.eq(shf.o)
+                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
+                    m.d.sync += alu_r.eq(ext_sign.o)
+                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
+                    m.d.sync += alu_r.eq(ext_sign.o)
+                # SUB is zero-delay, no need to register
 
                 # NOTE: all of these are fake, just something to test
 
                 # MUL, to take 5 instructions
-                with m.If(self.op.insn_type == InternalOp.OP_MUL_L64.value):
+                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                     m.d.sync += self.counter.eq(5)
-                # SHIFT to take 7
-                with m.Elif(self.op.insn_type == InternalOp.OP_SHR.value):
-                    m.d.sync += self.counter.eq(7)
-                # SUB to take 1, straight away
-                with m.If(self.op.insn_type == InternalOp.OP_ADD.value):
+                # SHIFT to take 1, straight away
+                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                     m.d.sync += self.counter.eq(1)
-                    m.d.comb += go_now.eq(1)
-                # ADD to take 2
+                # ADD/SUB to take 3
+                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
+                    m.d.sync += self.counter.eq(3)
+                # EXTS to take 1
+                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
+                    m.d.sync += self.counter.eq(1)
+                # EXTSWSLI to take 1
+                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
+                    m.d.sync += self.counter.eq(1)
+                # others to take no delay
                 with m.Else():
-                    m.d.sync += self.counter.eq(2)
-        with m.Else():
-            # input says no longer valid, so drop ready as well.
-            # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p_ready_o.eq(0)
-
-        # ok so the counter's running: when it gets to 1, fire the output
-        with m.If((self.counter == 1) | go_now):
-            # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n_valid_o.eq(1)
-        with m.If(self.n_ready_i & self.n_valid_o):
-            m.d.sync += self.n_valid_o.eq(0)
-            # recipient said it was ready: reset back to known-good.
-            m.d.sync += self.counter.eq(0) # reset the counter
-            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
+                    m.d.comb += go_now.eq(1)
 
-        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
-        with m.If(self.counter > 1):
+        with m.Elif(~alu_done | self.n.i_ready):
+            # decrement the counter while the ALU is neither idle nor finished
             m.d.sync += self.counter.eq(self.counter - 1)
 
+        # choose between zero-delay output, or registered
+        with m.If(go_now):
+            m.d.comb += self.o.data.eq(sub.o)
+        # only present the result at the last computation cycle
+        with m.Elif(alu_done):
+            m.d.comb += self.o.data.eq(alu_r)
+
+        # determine condition register bits based on the data output value
+        with m.If(~self.o.data.any()):
+            m.d.comb += self.cr.data.eq(0b001)
+        with m.Elif(self.o.data[-1]):
+            m.d.comb += self.cr.data.eq(0b010)
+        with m.Else():
+            m.d.comb += self.cr.data.eq(0b100)
+
         return m
 
     def __iter__(self):
         yield from self.op.ports()
         yield self.a
         yield self.b
-        yield self.o
+        yield from self.o.ports()
+        yield self.p.i_valid
+        yield self.p.o_ready
+        yield self.n.o_valid
+        yield self.n.i_ready
 
     def ports(self):
         return list(self)
@@ -236,9 +400,9 @@ class ALU(Elaboratable):
 
 class BranchOp(Elaboratable):
     def __init__(self, width, op):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width)
         self.op = op
 
     def elaborate(self, platform):
@@ -249,15 +413,24 @@ class BranchOp(Elaboratable):
 
 class BranchALU(Elaboratable):
     def __init__(self, width):
-        self.p_valid_i = Signal()
-        self.p_ready_o = Signal()
-        self.n_ready_i = Signal()
-        self.n_valid_o = Signal()
-        self.counter   = Signal(4)
-        self.op  = Signal(2)
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.p = Dummy()  # make look like nmutil pipeline API
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
+        self.n = Dummy()  # make look like nmutil pipeline API
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
+        self.counter = Signal(4)
+        self.op = Signal(2)
+        i = []
+        i.append(Signal(width, name="i1"))
+        i.append(Signal(width, name="i2"))
+        self.i = i
+        self.a, self.b = i[0], i[1]
+        self.out = tuple([Signal(width)])
+        self.o = self.out[0]
         self.width = width
 
     def elaborate(self, platform):
@@ -277,34 +450,35 @@ class BranchALU(Elaboratable):
                 mod.b.eq(self.b),
             ]
 
-        go_now = Signal(reset_less=True) # testing no-delay ALU
-        with m.If(self.p_valid_i):
+        go_now = Signal(reset_less=True)  # testing no-delay ALU
+        with m.If(self.p.i_valid):
             # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p_ready_o):
+            with m.If(~self.p.o_ready):
                 # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p_ready_o.eq(1)
+                m.d.sync += self.p.o_ready.eq(1)
 
                 # as this is a "fake" pipeline, just grab the output right now
                 with m.Switch(self.op):
                     for i, mod in enumerate([bgt, blt, beq, bne]):
                         with m.Case(i):
                             m.d.sync += self.o.eq(mod.o)
-                m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
+                # branch to take 5 cycles (fake)
+                m.d.sync += self.counter.eq(5)
                 #m.d.comb += go_now.eq(1)
         with m.Else():
             # input says no longer valid, so drop ready as well.
             # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p_ready_o.eq(0)
+            m.d.sync += self.p.o_ready.eq(0)
 
         # ok so the counter's running: when it gets to 1, fire the output
         with m.If((self.counter == 1) | go_now):
             # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n_valid_o.eq(1)
-        with m.If(self.n_ready_i & self.n_valid_o):
-            m.d.sync += self.n_valid_o.eq(0)
+            m.d.sync += self.n.o_valid.eq(1)
+        with m.If(self.n.i_ready & self.n.o_valid):
+            m.d.sync += self.n.o_valid.eq(0)
             # recipient said it was ready: reset back to known-good.
-            m.d.sync += self.counter.eq(0) # reset the counter
-            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
+            m.d.sync += self.counter.eq(0)  # reset the counter
+            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
 
         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
         with m.If(self.counter > 1):
@@ -321,57 +495,236 @@ class BranchALU(Elaboratable):
     def ports(self):
         return list(self)
 
+
 def run_op(dut, a, b, op, inv_a=0):
     yield dut.a.eq(a)
     yield dut.b.eq(b)
     yield dut.op.insn_type.eq(op)
-    yield dut.op.invert_a.eq(inv_a)
-    yield dut.n_ready_i.eq(0)
-    yield dut.p_valid_i.eq(1)
+    yield dut.op.invert_in.eq(inv_a)
+    yield dut.n.i_ready.eq(0)
+    yield dut.p.i_valid.eq(1)
+    yield dut.n.i_ready.eq(1)
     yield
-    while True:
+
+    # wait for the ALU to accept our input data
+    while not (yield dut.p.o_ready):
         yield
-        n_valid_o = yield dut.n_valid_o
-        if n_valid_o:
-            break
-    yield
 
-    result = yield dut.o
-    yield dut.p_valid_i.eq(0)
-    yield dut.n_ready_i.eq(0)
-    yield
+    yield dut.p.i_valid.eq(0)
+    yield dut.a.eq(0)
+    yield dut.b.eq(0)
+    yield dut.op.insn_type.eq(0)
+    yield dut.op.invert_in.eq(0)
+
+    # wait for the ALU to present the output data
+    while not (yield dut.n.o_valid):
+        yield
+
+    # latch the result and lower read_i
+    result = yield dut.o.data
+    yield dut.n.i_ready.eq(0)
 
     return result
 
 
 def alu_sim(dut):
-    result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD)
-    print ("alu_sim add", result)
+    result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
+    print("alu_sim add", result)
     assert (result == 8)
 
-    result = yield from run_op(dut, 2, 3, InternalOp.OP_MUL_L64)
-    print ("alu_sim mul", result)
+    result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
+    print("alu_sim mul", result)
     assert (result == 6)
 
-    result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD, inv_a=1)
-    print ("alu_sim add-inv", result)
+    result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
+    print("alu_sim add-inv", result)
     assert (result == 65533)
 
+    # test zero-delay ALU
+    # don't have OP_SUB, so use any other
+    result = yield from run_op(dut, 5, 3, MicrOp.OP_CMP)
+    print("alu_sim sub", result)
+    assert (result == 2)
+
+    result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
+    print("alu_sim shr", result)
+    assert (result == 3)
+
 
 def test_alu():
     alu = ALU(width=16)
-    run_simulation(alu, alu_sim(alu), vcd_name='test_alusim.vcd')
+    write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
+    run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 
     vl = rtlil.convert(alu, ports=alu.ports())
     with open("test_alu.il", "w") as f:
         f.write(vl)
 
 
+def test_alu_parallel():
+    # Compare with the sequential test implementation, above.
+    m = Module()
+    m.submodules.alu = dut = ALU(width=16)
+    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
+                   pysim=is_engine_pysim())
+
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    def send(a, b, op, inv_a=0, rc=0):
+        # present input data and assert i_valid
+        yield dut.a.eq(a)
+        yield dut.b.eq(b)
+        yield dut.op.insn_type.eq(op)
+        yield dut.op.invert_in.eq(inv_a)
+        yield dut.op.rc.rc.eq(rc)
+        yield dut.p.i_valid.eq(1)
+        yield
+        # wait for o_ready to be asserted
+        while not (yield dut.p.o_ready):
+            yield
+        # clear input data and negate i_valid
+        # if send is called again immediately afterwards, there will be no
+        # visible transition (they will not be negated, after all)
+        yield dut.p.i_valid.eq(0)
+        yield dut.a.eq(0)
+        yield dut.b.eq(0)
+        yield dut.op.insn_type.eq(0)
+        yield dut.op.invert_in.eq(0)
+        yield dut.op.rc.rc.eq(0)
+
+    def receive():
+        # signal readiness to receive data
+        yield dut.n.i_ready.eq(1)
+        yield
+        # wait for o_valid to be asserted
+        while not (yield dut.n.o_valid):
+            yield
+        # read results
+        result = yield dut.o.data
+        cr = yield dut.cr.data
+        # negate i_ready
+        # if receive is called again immediately afterwards, there will be no
+        # visible transition (it will not be negated, after all)
+        yield dut.n.i_ready.eq(0)
+        return result, cr
+
+    def producer():
+        # send a few test cases, interspersed with wait states
+        # note that, for this test, we do not wait for the result to be ready,
+        # before presenting the next input
+        # 5 + 3
+        yield from send(5, 3, MicrOp.OP_ADD)
+        yield
+        yield
+        # 2 * 3
+        yield from send(2, 3, MicrOp.OP_MUL_L64, rc=1)
+        # (-6) + 3
+        yield from send(5, 3, MicrOp.OP_ADD, inv_a=1, rc=1)
+        yield
+        # 5 - 3
+        # note that this is a zero-delay operation
+        yield from send(5, 3, MicrOp.OP_CMP)
+        yield
+        yield
+        # NOP
+        yield from send(5, 3, MicrOp.OP_NOP)
+        # 13 >> 2
+        yield from send(13, 2, MicrOp.OP_SHR)
+        # sign extent 13
+        yield from send(13, 2, MicrOp.OP_EXTS)
+        # sign extend -128 (8 bits)
+        yield from send(0x80, 2, MicrOp.OP_EXTS, rc=1)
+        # sign extend -128 (8 bits)
+        yield from send(2, 0x80, MicrOp.OP_EXTSWSLI)
+        # 5 - 5
+        yield from send(5, 5, MicrOp.OP_CMP, rc=1)
+
+    def consumer():
+        # receive and check results, interspersed with wait states
+        # the consumer is not in step with the producer, but the
+        # order of the results are preserved
+        yield
+        # 5 + 3 = 8
+        result = yield from receive()
+        assert result[0] == 8
+        # 2 * 3 = 6
+        # 6 > 0 => CR = 0b100
+        result = yield from receive()
+        assert result == (6, 0b100)
+        yield
+        yield
+        # (-6) + 3 = -3
+        # -3 < 0 => CR = 0b010
+        result = yield from receive()
+        assert result == (65533, 0b010)  # unsigned equivalent to -2
+        # 5 - 3 = 2
+        # note that this is a zero-delay operation
+        # this, and the previous result, will be received back-to-back
+        # (check the output waveform to see this)
+        result = yield from receive()
+        assert result[0] == 2
+        yield
+        yield
+        # NOP
+        yield from receive()
+        # 13 >> 2 = 3
+        result = yield from receive()
+        assert result[0] == 3
+        # sign extent 13 = 13
+        result = yield from receive()
+        assert result[0] == 13
+        # sign extend -128 (8 bits) = -128 (16 bits)
+        # -128 < 0 => CR = 0b010
+        result = yield from receive()
+        assert result == (0xFF80, 0b010)
+        # sign extend -128 (8 bits) = -128 (16 bits)
+        result = yield from receive()
+        assert result[0] == 0xFF80
+        # 5 - 5 = 0
+        # 0 == 0 => CR = 0b001
+        result = yield from receive()
+        assert result == (0, 0b001)
+
+    sim.add_sync_process(producer)
+    sim.add_sync_process(consumer)
+    sim_writer = sim.write_vcd("test_alu_parallel.vcd")
+    with sim_writer:
+        sim.run()
+
+
+def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
+                   pysim=True):
+    """Common function to write the GTKWave documents for this module"""
+    gtkwave_desc = [
+        'clk',
+        'i1[15:0]',
+        'i2[15:0]',
+        'op__insn_type' if pysim else 'op__insn_type[6:0]',
+        'op__invert_in',
+        'i_valid',
+        'o_ready',
+        'o_valid',
+        'i_ready',
+        'alu_o[15:0]',
+        'alu_o_ok',
+        'alu_cr[15:0]',
+        'alu_cr_ok'
+    ]
+    # determine the module name of the DUT
+    module = 'top'
+    if sub_module is not None:
+        module = nmigen_sim_top_module + sub_module
+    vcd_name = gtkw_name.replace('.gtkw', '.vcd')
+    write_gtkw(gtkw_name, vcd_name, gtkwave_desc, module=module,
+               loc=__file__, clk_period=clk_period, base='signed')
+
+
 if __name__ == "__main__":
     test_alu()
+    test_alu_parallel()
 
-    alu = BranchALU(width=16)
-    vl = rtlil.convert(alu, ports=alu.ports())
-    with open("test_branch_alu.il", "w") as f:
-        f.write(vl)
-
+    # alu = BranchALU(width=16)
+    # vl = rtlil.convert(alu, ports=alu.ports())
+    # with open("test_branch_alu.il", "w") as f:
+    #     f.write(vl)