add cookie-cut version of branch, copied from Logical, name changes TODO
authorLuke Kenneth Casson Leighton <lkcl@lkcl.net>
Fri, 15 May 2020 16:29:57 +0000 (17:29 +0100)
committerLuke Kenneth Casson Leighton <lkcl@lkcl.net>
Fri, 15 May 2020 16:29:57 +0000 (17:29 +0100)
src/soc/branch/__init__.py [new file with mode: 0644]
src/soc/branch/formal/proof_input_stage.py [new file with mode: 0644]
src/soc/branch/formal/proof_main_stage.py [new file with mode: 0644]
src/soc/branch/input_stage.py [new file with mode: 0644]
src/soc/branch/main_stage.py [new file with mode: 0644]
src/soc/branch/pipe_data.py [new file with mode: 0644]
src/soc/branch/pipeline.py [new file with mode: 0644]
src/soc/branch/test/test_pipe_caller.py [new file with mode: 0644]

diff --git a/src/soc/branch/__init__.py b/src/soc/branch/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/soc/branch/formal/proof_input_stage.py b/src/soc/branch/formal/proof_input_stage.py
new file mode 100644 (file)
index 0000000..bb62fb6
--- /dev/null
@@ -0,0 +1,85 @@
+# Proof of correctness for the ALU input stage
+# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+
+from nmigen import Module, Signal, Elaboratable, Mux
+from nmigen.asserts import Assert, AnyConst, Assume, Cover
+from nmigen.test.utils import FHDLTestCase
+from nmigen.cli import rtlil
+
+from soc.alu.input_stage import ALUInputStage
+from soc.alu.pipe_data import ALUPipeSpec
+from soc.alu.alu_input_record import CompALUOpSubset
+from soc.decoder.power_enums import InternalOp
+import unittest
+
+
+# This defines a module to drive the device under test and assert
+# properties about its outputs
+class Driver(Elaboratable):
+    """Formal harness: feeds ALUInputStage unconstrained constants and
+    asserts properties of its outputs."""
+
+    def __init__(self):
+        # the driver declares no inputs or outputs of its own
+        pass
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # drive every field of the operation record with an arbitrary
+        # constant, totalling the field widths for the pipe spec
+        rec = CompALUOpSubset()
+        recwidth = 0
+        for field in rec.ports():
+            recwidth += field.width
+            comb += field.eq(AnyConst(field.width))
+
+        m.submodules.dut = dut = ALUInputStage(
+            ALUPipeSpec(id_wid=2, op_wid=recwidth))
+
+        # arbitrary 64-bit operands, wired to the DUT inputs
+        a = Signal(64)
+        b = Signal(64)
+        comb += a.eq(AnyConst(64))
+        comb += b.eq(AnyConst(64))
+        comb += dut.i.a.eq(a)
+        comb += dut.i.b.eq(b)
+        comb += dut.i.ctx.op.eq(rec)
+
+        # the operation record must reach the output unmodified
+        for field in rec.ports():
+            comb += Assert(getattr(dut.o.ctx.op, field.name) == field)
+
+        # operand A comes out inverted exactly when invert_a is set
+        with m.If(rec.invert_a):
+            comb += Assert(dut.o.a == ~a)
+        with m.Else():
+            comb += Assert(dut.o.a == a)
+
+        # operand B is the immediate when one is flagged (except for
+        # OP_RLC), otherwise the B input
+        use_imm = (rec.imm_data.imm_ok &
+                   (rec.insn_type != InternalOp.OP_RLC))
+        with m.If(use_imm):
+            comb += Assert(dut.o.b == rec.imm_data.imm)
+        with m.Else():
+            comb += Assert(dut.o.b == b)
+
+        return m
+
+class GTCombinerTestCase(FHDLTestCase):
+    """Runs the Driver through formal checks and dumps it as RTLIL."""
+
+    def test_formal(self):
+        # bounded model check first, then cover, both to depth 4
+        module = Driver()
+        for mode in ("bmc", "cover"):
+            self.assertFormal(module, mode=mode, depth=4)
+
+    def test_ilang(self):
+        # write the RTLIL text of the driver to input_stage.il
+        il_text = rtlil.convert(Driver(), ports=[])
+        with open("input_stage.il", "w") as il_file:
+            il_file.write(il_text)
+
+
+# run the test cases defined above when this file is executed as a script
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/soc/branch/formal/proof_main_stage.py b/src/soc/branch/formal/proof_main_stage.py
new file mode 100644 (file)
index 0000000..5ca9481
--- /dev/null
@@ -0,0 +1,92 @@
+# Proof of correctness for the Logical main stage
+# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+
+from nmigen import (Module, Signal, Elaboratable, Mux, Cat, Repl,
+                    signed)
+from nmigen.asserts import Assert, AnyConst, Assume, Cover
+from nmigen.test.utils import FHDLTestCase
+from nmigen.cli import rtlil
+
+from soc.logical.main_stage import LogicalMainStage
+from soc.alu.pipe_data import ALUPipeSpec
+from soc.alu.alu_input_record import CompALUOpSubset
+from soc.decoder.power_enums import InternalOp
+import unittest
+
+
+# This defines a module to drive the device under test and assert
+# properties about its outputs
+class Driver(Elaboratable):
+    """Formal harness: feeds LogicalMainStage unconstrained constants and
+    asserts the AND, OR and XOR results."""
+
+    def __init__(self):
+        # the driver declares no inputs or outputs of its own
+        pass
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # drive every field of the operation record with an arbitrary
+        # constant, totalling the field widths for the pipe spec
+        rec = CompALUOpSubset()
+        recwidth = 0
+        for field in rec.ports():
+            recwidth += field.width
+            comb += field.eq(AnyConst(field.width))
+
+        m.submodules.dut = dut = LogicalMainStage(
+            ALUPipeSpec(id_wid=2, op_wid=recwidth))
+
+        # shorthand for the DUT ports (carry_out is looked up but not
+        # referenced by the assertions below)
+        a = dut.i.a
+        b = dut.i.b
+        carry_in = dut.i.carry_in
+        so_in = dut.i.so
+        carry_out = dut.o.carry_out
+        o = dut.o.o
+
+        # arbitrary values on each data input
+        comb += a.eq(AnyConst(64))
+        comb += b.eq(AnyConst(64))
+        comb += carry_in.eq(AnyConst(1))
+        comb += so_in.eq(AnyConst(1))
+
+        comb += dut.i.ctx.op.eq(rec)
+
+        # the operation record must reach the output unmodified
+        for field in rec.ports():
+            comb += Assert(getattr(dut.o.ctx.op, field.name) == field)
+
+        # signed views of input a: full width and low 32 bits
+        a_signed = Signal(signed(64))
+        a_signed_32 = Signal(signed(32))
+        comb += [a_signed.eq(a),
+                 a_signed_32.eq(a[0:32])]
+
+        # expected result for each operation that is checked
+        expected = {
+            InternalOp.OP_AND: a & b,
+            InternalOp.OP_OR: a | b,
+            InternalOp.OP_XOR: a ^ b,
+        }
+        with m.Switch(rec.insn_type):
+            for opcode, result in expected.items():
+                with m.Case(opcode):
+                    comb += Assert(o == result)
+
+        return m
+
+
+class LogicalTestCase(FHDLTestCase):
+    """Runs the Driver through formal checks and dumps it as RTLIL."""
+
+    def test_formal(self):
+        # bounded model check first, then cover, both to depth 2
+        module = Driver()
+        for mode in ("bmc", "cover"):
+            self.assertFormal(module, mode=mode, depth=2)
+
+    def test_ilang(self):
+        # write the RTLIL text of the driver to main_stage.il
+        il_text = rtlil.convert(Driver(), ports=[])
+        with open("main_stage.il", "w") as il_file:
+            il_file.write(il_text)
+
+
+# run the test cases defined above when this file is executed as a script
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/soc/branch/input_stage.py b/src/soc/branch/input_stage.py
new file mode 100644 (file)
index 0000000..e6ab48e
--- /dev/null
@@ -0,0 +1,63 @@
+# This stage is intended to adjust the input data before sending it to
+# the actual ALU. Things like handling inverting the input, carry_in
+# generation for subtraction, and handling of immediates should happen
+# here
+from nmigen import (Module, Signal, Cat, Const, Mux, Repl, signed,
+                    unsigned)
+from nmutil.pipemodbase import PipeModBase
+from soc.decoder.power_enums import InternalOp
+from soc.alu.pipe_data import ALUInputData
+from soc.decoder.power_enums import CryIn
+
+
+class ALUInputStage(PipeModBase):
+    """Input-conditioning stage: optionally inverts operand A, selects
+    the carry-in, and forwards everything else unchanged."""
+
+    def __init__(self, pspec):
+        super().__init__(pspec, "input")
+
+    def ispec(self):
+        return ALUInputData(self.pspec)
+
+    def ospec(self):
+        return ALUInputData(self.pspec)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        op = self.i.ctx.op
+
+        # ---- pass-through: sticky overflow, context and operand B ----
+        # operand B is forwarded as-is; no immediate selection is done
+        # in this stage
+        # (see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43)
+        comb += self.o.so.eq(self.i.so)
+        comb += self.o.ctx.eq(self.i.ctx)
+        comb += self.o.b.eq(self.i.b)
+
+        # ---- operand A: as-is, or bit-inverted when requested ----
+        a = Signal.like(self.i.a)
+        with m.If(op.invert_a):
+            comb += a.eq(~self.i.a)
+        with m.Else():
+            comb += a.eq(self.i.a)
+        comb += self.o.a.eq(a)
+
+        # ---- carry-in: forced to 0 or 1, or copied from the input ----
+        carry_sources = (
+            (CryIn.ZERO, 0),
+            (CryIn.ONE, 1),
+            (CryIn.CA, self.i.carry_in),
+        )
+        with m.Switch(op.input_carry):
+            for selector, value in carry_sources:
+                with m.Case(selector):
+                    comb += self.o.carry_in.eq(value)
+
+        return m
diff --git a/src/soc/branch/main_stage.py b/src/soc/branch/main_stage.py
new file mode 100644 (file)
index 0000000..b50afc2
--- /dev/null
@@ -0,0 +1,128 @@
+# This stage is intended to do most of the work of executing Logical
+# instructions. This is OR, AND, XOR, POPCNT, PRTY, CMPB, BPERMD, CNTLZ
+# however input and output stages also perform bit-negation on input(s)
+# and output, as well as carry and overflow generation.
+# This module however should not gate the carry or overflow, that's up
+# to the output stage
+
+from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
+from nmutil.pipemodbase import PipeModBase
+from soc.logical.pipe_data import ALUInputData
+from soc.alu.pipe_data import ALUOutputData
+from ieee754.part.partsig import PartitionedSignal
+from soc.decoder.power_enums import InternalOp
+from soc.countzero.countzero import ZeroCounter
+
+from soc.decoder.power_fields import DecodeFields
+from soc.decoder.power_fieldsn import SignalBitRange
+
+
+def array_of(count, bitwidth):
+    """Return a list of `count` fresh reset-less Signals, each
+    `bitwidth` bits wide."""
+    return [Signal(bitwidth, reset_less=True) for _ in range(count)]
+
+
+class LogicalMainStage(PipeModBase):
+    """Combinatorial main stage of the Logical pipeline.
+
+    Switches on ``op.insn_type`` and computes AND, OR, XOR, cmpb,
+    popcount, parity or count-zeros of the incoming operands into
+    ``self.o.o``.  Sticky overflow and the pipeline context are passed
+    straight through from input to output.
+    """
+    def __init__(self, pspec):
+        super().__init__(pspec, "main")
+        # instruction-field decoder built over the instruction that
+        # travels with the operation context (used by the cntz case)
+        self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
+        self.fields.create_specs()
+
+    def ispec(self):
+        return ALUInputData(self.pspec)
+
+    def ospec(self):
+        return ALUOutputData(self.pspec) # TODO: ALUIntermediateData
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        op, a, b, o = self.i.ctx.op, self.i.a, self.i.b, self.o.o
+
+        ##########################
+        # main switch for logic ops AND, OR and XOR, cmpb, parity, and popcount
+
+        with m.Switch(op.insn_type):
+
+            ###### AND, OR, XOR #######
+            with m.Case(InternalOp.OP_AND):
+                comb += o.eq(a & b)
+            with m.Case(InternalOp.OP_OR):
+                comb += o.eq(a | b)
+            with m.Case(InternalOp.OP_XOR):
+                comb += o.eq(a ^ b)
+
+            ###### cmpb #######
+            with m.Case(InternalOp.OP_CMPB):
+                # each output byte is all-ones when the matching bytes of
+                # a and b are equal, all-zeros otherwise
+                eq_bytes = []
+                for i in range(8):
+                    slc = slice(i*8, (i+1)*8)
+                    eq_bytes.append(Repl(a[slc] == b[slc], 8))
+                comb += o.eq(Cat(*eq_bytes))
+
+            ###### popcount #######
+            with m.Case(InternalOp.OP_POPCNT):
+                # starting from a, perform successive addition-reductions
+                # creating arrays big enough to store the sum, each time
+                pc = [a]
+                # (quantity, bit-width) of each level of partial sums:
+                # 32x 2-bit (sum of two 1-bit values) and so on.  The
+                # last level must be 7 bits wide: a 64-bit input can
+                # have 64 bits set, which does not fit in 6 bits.
+                work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+                # loop names are deliberately distinct from operand "b",
+                # which must not be overwritten part-way through elaborate
+                for count, width in work:
+                    pc.append(array_of(count, width))
+                pc8 = pc[3]     # 8x 4-bit counts, one per byte (popcntb)
+                pc32 = pc[5]    # 2x 6-bit counts, one per word (popcntw)
+                popcnt = pc[-1] # 1x 7-bit count of all 64 bits (popcntd)
+                # cascade-tree of adds: each entry is the sum of two
+                # adjacent entries of the previous level, zero-extended
+                # by one bit before adding
+                for idx, (count, _) in enumerate(work):
+                    src, dst = pc[idx], pc[idx+1]
+                    for i in range(count):
+                        stt, end = i*2, i*2+1
+                        comb += dst[i].eq(Cat(src[stt], Const(0, 1)) +
+                                          Cat(src[end], Const(0, 1)))
+                # decode operation length
+                with m.If(op.data_len[2:4] == 0b00):
+                    # popcntb - one zero-extended count per output byte
+                    for i in range(8):
+                        comb += o[i*8:(i+1)*8].eq(pc8[i])
+                with m.Elif(op.data_len[3] == 0):
+                    # popcntw - one zero-extended count per output word
+                    # (a full word needs all 6 bits: the count can be 32)
+                    for i in range(2):
+                        comb += o[i*32:(i+1)*32].eq(pc32[i])
+                with m.Else():
+                    # popcntd - single 7-bit count into the output
+                    comb += o.eq(popcnt[0])
+
+            ###### parity #######
+            with m.Case(InternalOp.OP_PRTY):
+                # strange instruction which XORs together the LSBs of each byte
+                par0 = Signal(reset_less=True)
+                par1 = Signal(reset_less=True)
+                comb += par0.eq(Cat(a[0] , a[8] , a[16], a[24]).xor())
+                comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
+                with m.If(op.data_len[3] == 1):
+                    # one parity bit over all eight bytes
+                    comb += o.eq(par0 ^ par1)
+                with m.Else():
+                    # one parity bit per 32-bit word
+                    comb += o[0].eq(par0)
+                    comb += o[32].eq(par1)
+
+            ###### cntlz #######
+            with m.Case(InternalOp.OP_CNTZ):
+                x_fields = self.fields.instrs['X']
+                # NOTE(review): XO is a freshly created Signal that nothing
+                # in this class drives, so count_right_i below stays at
+                # XO's reset value.  Presumably the XO instruction field
+                # itself was meant to be used - confirm before enabling
+                # the cntz unit test.
+                XO = Signal(x_fields['XO'][0:-1].shape())
+                m.submodules.countz = countz = ZeroCounter()
+                comb += countz.rs_i.eq(a)
+                comb += countz.is_32bit_i.eq(op.is_32bit)
+                comb += countz.count_right_i.eq(XO[-1])
+                comb += o.eq(countz.result_o)
+
+            ###### bpermd #######
+            # TODO with m.Case(InternalOp.OP_BPERM): - not in microwatt
+
+        ###### sticky overflow and context, both pass-through #####
+
+        comb += self.o.so.eq(self.i.so)
+        comb += self.o.ctx.eq(self.i.ctx)
+
+        return m
diff --git a/src/soc/branch/pipe_data.py b/src/soc/branch/pipe_data.py
new file mode 100644 (file)
index 0000000..34d9c0a
--- /dev/null
@@ -0,0 +1,37 @@
+from nmigen import Signal, Const
+from ieee754.fpcommon.getop import FPPipeContext
+
+
+class IntegerData:
+    """Base pipeline data bundle: carries only the pipeline context."""
+
+    def __init__(self, pspec):
+        self.ctx = FPPipeContext(pspec)
+        # expose the context's muxid directly on the bundle
+        self.muxid = self.ctx.muxid
+
+    def __iter__(self):
+        # iterating the bundle iterates the context
+        for item in self.ctx:
+            yield item
+
+    def eq(self, i):
+        # a one-element list, so subclasses can add their own assignments
+        ctx_assign = self.ctx.eq(i.ctx)
+        return [ctx_assign]
+
+
+class ALUInputData(IntegerData):
+    """Input bundle: context plus operands a and b, sticky overflow and
+    carry-in."""
+
+    def __init__(self, pspec):
+        super().__init__(pspec)
+        self.a = Signal(64, reset_less=True) # RA
+        self.b = Signal(64, reset_less=True) # RB/immediate
+        self.so = Signal(reset_less=True)
+        self.carry_in = Signal(reset_less=True)
+
+    def __iter__(self):
+        # context first, then the data signals in a fixed order
+        yield from super().__iter__()
+        yield from (self.a, self.b, self.carry_in, self.so)
+
+    def eq(self, i):
+        # context assignment from the base class, followed by one
+        # assignment per data signal
+        assignments = super().eq(i)
+        for field in ("a", "b", "carry_in", "so"):
+            assignments.append(getattr(self, field).eq(getattr(i, field)))
+        return assignments
diff --git a/src/soc/branch/pipeline.py b/src/soc/branch/pipeline.py
new file mode 100644 (file)
index 0000000..f3c8327
--- /dev/null
@@ -0,0 +1,25 @@
+from nmutil.singlepipe import ControlBase
+from nmutil.pipemodbase import PipeModBaseChain
+from soc.alu.input_stage import ALUInputStage
+from soc.logical.main_stage import LogicalMainStage
+from soc.alu.output_stage import ALUOutputStage
+
+class LogicalStages(PipeModBaseChain):
+    """Chain of the three stages making up the logical pipeline."""
+
+    def get_chain(self):
+        # in order: input stage, logical main stage, output stage
+        stage_classes = (ALUInputStage, LogicalMainStage, ALUOutputStage)
+        return [stage(self.pspec) for stage in stage_classes]
+
+
+class LogicalBasePipe(ControlBase):
+    """Pipeline wrapper holding a single LogicalStages chain."""
+
+    def __init__(self, pspec):
+        super().__init__()
+        self.pipe1 = LogicalStages(pspec)
+        # connection statements, added to the module in elaborate()
+        self._eqs = self.connect([self.pipe1])
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+        m.submodules.pipe = self.pipe1
+        m.d.comb += self._eqs
+        return m
diff --git a/src/soc/branch/test/test_pipe_caller.py b/src/soc/branch/test/test_pipe_caller.py
new file mode 100644 (file)
index 0000000..d540e7a
--- /dev/null
@@ -0,0 +1,263 @@
+from nmigen import Module, Signal
+from nmigen.back.pysim import Simulator, Delay, Settle
+from nmigen.test.utils import FHDLTestCase
+from nmigen.cli import rtlil
+import unittest
+from soc.decoder.isa.caller import ISACaller, special_sprs
+from soc.decoder.power_decoder import (create_pdecode)
+from soc.decoder.power_decoder2 import (PowerDecode2)
+from soc.decoder.power_enums import (XER_bits, Function)
+from soc.decoder.selectable_int import SelectableInt
+from soc.simulator.program import Program
+from soc.decoder.isa.all import ISA
+
+
+from soc.logical.pipeline import LogicalBasePipe
+from soc.alu.alu_input_record import CompALUOpSubset
+from soc.alu.pipe_data import ALUPipeSpec
+import random
+
+
+class TestCase:
+    """Plain record of one queued test: program, initial registers,
+    initial SPRs and the name of the test that created it."""
+
+    def __init__(self, program, regs, sprs, name):
+        self.program, self.regs = program, regs
+        self.sprs, self.name = sprs, name
+
+def get_rec_width(rec):
+    """Return the sum of the widths of every port of `rec`."""
+    return sum(port.width for port in rec.ports())
+
+def set_alu_inputs(alu, dec2, sim):
+    """Simulator coroutine: load operands a and b of the ALU input.
+
+    Operand a comes from the register selected by read_reg3 or
+    read_reg1 (exactly one of the two must be flagged).  Operand b is
+    the decoded immediate if there is one, else the register selected
+    by read_reg2, else zero.  Register values are read from `sim`.
+    """
+    # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
+    # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
+    # and place it into data_i.b
+
+    decoded = dec2.e
+
+    # ---- operand a ----
+    reg3_ok = yield decoded.read_reg3.ok
+    reg1_ok = yield decoded.read_reg1.ok
+    assert reg3_ok != reg1_ok
+    if reg3_ok:
+        source_a = decoded.read_reg3
+    elif reg1_ok:
+        source_a = decoded.read_reg1
+    else:
+        source_a = None
+    if source_a is None:
+        data1 = 0
+    else:
+        regnum = yield source_a.data
+        data1 = sim.gpr(regnum).value
+    yield alu.p.data_i.a.eq(data1)
+
+    # ---- operand b: the immediate takes priority over the register ----
+    reg2_ok = yield decoded.read_reg2.ok
+    imm_ok = yield decoded.imm_data.imm_ok
+    data2 = 0
+    if imm_ok:
+        data2 = yield decoded.imm_data.imm
+    elif reg2_ok:
+        regnum = yield decoded.read_reg2.data
+        data2 = sim.gpr(regnum).value
+    yield alu.p.data_i.b.eq(data2)
+
+
+
+def set_extra_alu_inputs(alu, dec2, sim):
+    """Simulator coroutine: copy the CA and SO bits of the simulated
+    XER into the ALU's carry_in and so inputs (as 0 or 1)."""
+    data_i = alu.p.data_i
+    yield data_i.carry_in.eq(int(bool(sim.spr['XER'][XER_bits['CA']])))
+    yield data_i.so.eq(int(bool(sim.spr['XER'][XER_bits['SO']])))
+    
+
+# This test bench is a bit different than is usual. Initially when I
+# was writing it, I had all of the tests call a function to create a
+# device under test and simulator, initialize the dut, run the
+# simulation for ~2 cycles, and assert that the dut output what it
+# should have. However, this was really slow, since it needed to
+# create and tear down the dut and simulator for every test case.
+
+# Now, instead of doing that, every test case in LogicalTestCase puts some
+# data into the test_data list below, describing the instructions to
+# be tested and the initial state. Once all the tests have been run,
+# test_data gets passed to TestRunner which then sets up the DUT and
+# simulator once, runs all the data through it, and asserts that the
+# results match the pseudocode sim at every cycle.
+
+# By doing this, I've reduced the time it takes to run the test suite
+# massively. Before, it took around 1 minute on my computer, now it
+# takes around 3 seconds
+
+test_data = []
+
+
+class LogicalTestCase(FHDLTestCase):
+    """Queues logical-instruction test programs for TestRunner.
+
+    The test_* methods do not simulate anything themselves: each one
+    appends TestCase entries to the module-level test_data list
+    (test_ilang, which only writes an RTLIL dump, is the exception).
+    """
+    def __init__(self, name):
+        super().__init__(name)
+        self.test_name = name
+
+    def run_tst_program(self, prog, initial_regs=None, initial_sprs=None):
+        # Build the defaults per call: the register list and SPR dict
+        # are stored in the queued TestCase and handed on later, so a
+        # shared default object would leak state between test cases.
+        if initial_regs is None:
+            initial_regs = [0] * 32
+        if initial_sprs is None:
+            initial_sprs = {}
+        tc = TestCase(prog, initial_regs, initial_sprs, self.test_name)
+        test_data.append(tc)
+
+    def _run_rand_unary(self, insns, count=10):
+        """Queue `count` programs, each a randomly chosen instruction
+        from `insns` of the form "<insn> 3, 1" with random 64-bit r1."""
+        for _ in range(count):
+            choice = random.choice(insns)
+            lst = [f"{choice} 3, 1"]
+            print(lst)
+            initial_regs = [0] * 32
+            initial_regs[1] = random.randint(0, (1<<64)-1)
+            self.run_tst_program(Program(lst), initial_regs)
+
+    def test_rand(self):
+        # register-register and/or/xor on random 64-bit operands
+        insns = ["and", "or", "xor"]
+        for _ in range(40):
+            choice = random.choice(insns)
+            lst = [f"{choice} 3, 1, 2"]
+            initial_regs = [0] * 32
+            initial_regs[1] = random.randint(0, (1<<64)-1)
+            initial_regs[2] = random.randint(0, (1<<64)-1)
+            self.run_tst_program(Program(lst), initial_regs)
+
+    def test_rand_imm_logical(self):
+        # immediate forms, with a random 16-bit immediate
+        insns = ["andi.", "andis.", "ori", "oris", "xori", "xoris"]
+        for _ in range(10):
+            choice = random.choice(insns)
+            imm = random.randint(0, (1<<16)-1)
+            lst = [f"{choice} 3, 1, {imm}"]
+            print(lst)
+            initial_regs = [0] * 32
+            initial_regs[1] = random.randint(0, (1<<64)-1)
+            self.run_tst_program(Program(lst), initial_regs)
+
+    @unittest.skip("broken")
+    def test_cntz(self):
+        self._run_rand_unary(["cntlzd", "cnttzd"])
+
+    def test_parity(self):
+        self._run_rand_unary(["prtyw", "prtyd"])
+
+    @unittest.skip("broken")
+    def test_popcnt(self):
+        self._run_rand_unary(["popcntb", "popcntw", "popcntd"])
+
+    def test_cmpb(self):
+        # fixed operands in which some bytes match and some differ
+        lst = ["cmpb 3, 1, 2"]
+        initial_regs = [0] * 32
+        initial_regs[1] = 0xdeadbeefcafec0de
+        initial_regs[2] = 0xd0adb0000afec1de
+        self.run_tst_program(Program(lst), initial_regs)
+
+    def test_ilang(self):
+        # dump the whole logical pipeline as RTLIL
+        rec = CompALUOpSubset()
+
+        pspec = ALUPipeSpec(id_wid=2, op_wid=get_rec_width(rec))
+        alu = LogicalBasePipe(pspec)
+        vl = rtlil.convert(alu, ports=[])
+        with open("logical_pipeline.il", "w") as f:
+            f.write(vl)
+
+
+# Runs every queued TestCase through one decoder + logical pipeline
+# simulation, comparing the pipeline output against the ISA simulator.
+class TestRunner(FHDLTestCase):
+    def __init__(self, test_data):
+        # "run_all" is the method unittest will invoke for this case
+        super().__init__("run_all")
+        self.test_data = test_data
+
+    def run_all(self):
+        m = Module()
+        comb = m.d.comb
+        instruction = Signal(32)
+
+        pdecode = create_pdecode()
+
+        m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
+
+        rec = CompALUOpSubset()
+
+        pspec = ALUPipeSpec(id_wid=2, op_wid=get_rec_width(rec))
+        m.submodules.alu = alu = LogicalBasePipe(pspec)
+
+        # feed the decoded operation into the pipeline, hold its input
+        # valid and its output ready, and drive the decoder from the
+        # "instruction" signal
+        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.e)
+        comb += alu.p.valid_i.eq(1)
+        comb += alu.n.ready_i.eq(1)
+        comb += pdecode2.dec.raw_opcode_in.eq(instruction)
+        sim = Simulator(m)
+
+        sim.add_clock(1e-6)
+        # simulator coroutine: one pass over every queued test case
+        def process():
+            for test in self.test_data:
+                print(test.name)
+                program = test.program
+                # NOTE(review): the return value of subTest() is not used
+                # as a context manager here, so presumably this call has
+                # no effect - confirm
+                self.subTest(test.name)
+                simulator = ISA(pdecode2, test.regs, test.sprs)
+                gen = program.generate_instructions()
+                # pair each generated instruction with its assembly line
+                instructions = list(zip(gen, program.assembly.splitlines()))
+
+                # instruction index is taken from the ISA simulator's
+                # current instruction address, 4 bytes per instruction
+                index = simulator.pc.CIA.value//4
+                while index < len(instructions):
+                    ins, code = instructions[index]
+
+                    print("0x{:X}".format(ins & 0xffffffff))
+                    print(code)
+
+                    # ask the decoder to decode this binary data (endian'd)
+                    yield pdecode2.dec.bigendian.eq(0)  # little / big?
+                    yield instruction.eq(ins)          # raw binary instr.
+                    yield Settle()
+                    # only instructions decoded as LOGICAL belong here
+                    fn_unit = yield pdecode2.e.fn_unit
+                    self.assertEqual(fn_unit, Function.LOGICAL.value, code)
+                    yield from set_alu_inputs(alu, pdecode2, simulator)
+                    yield from set_extra_alu_inputs(alu, pdecode2, simulator)
+                    yield 
+                    # run the same instruction on the ISA simulator
+                    opname = code.split(' ')[0]
+                    yield from simulator.call(opname)
+                    index = simulator.pc.CIA.value//4
+
+                    # wait until the pipeline flags its output as valid
+                    vld = yield alu.n.valid_o
+                    while not vld:
+                        yield
+                        vld = yield alu.n.valid_o
+                    yield
+                    alu_out = yield alu.n.data_o.o
+                    # compare against the ISA simulator's register only
+                    # when the instruction writes one
+                    out_reg_valid = yield pdecode2.e.write_reg.ok
+                    if out_reg_valid:
+                        write_reg_idx = yield pdecode2.e.write_reg.data
+                        expected = simulator.gpr(write_reg_idx).value
+                        print(f"expected {expected:x}, actual: {alu_out:x}")
+                        self.assertEqual(expected, alu_out, code)
+                    yield from self.check_extra_alu_outputs(alu, pdecode2,
+                                                            simulator)
+
+        sim.add_sync_process(process)
+        # run the simulation, writing a VCD and GTKWave save file
+        with sim.write_vcd("simulator.vcd", "simulator.gtkw",
+                            traces=[]):
+            sim.run()
+    # Simulator coroutine: when the decoded rc.data is set, compare the
+    # pipeline's cr0 output with CR field 0 of the ISA simulator.
+    def check_extra_alu_outputs(self, alu, dec2, sim):
+        rc = yield dec2.e.rc.data
+        if rc:
+            cr_expected = sim.crl[0].get_range().value
+            cr_actual = yield alu.n.data_o.cr0
+            self.assertEqual(cr_expected, cr_actual)
+
+
+if __name__ == "__main__":
+    # first pass: run the regular test cases, which fill test_data;
+    # exit=False so that execution continues afterwards
+    unittest.main(exit=False)
+
+    # second pass: simulate everything that was queued
+    suite = unittest.TestSuite([TestRunner(test_data)])
+    runner = unittest.TextTestRunner()
+    runner.run(suite)