From 233300afbc3f7cc989706c7759d99914ad1082d5 Mon Sep 17 00:00:00 2001
From: Michael Nolan <mtnolan2640@gmail.com>
Date: Tue, 12 May 2020 13:37:42 -0400
Subject: [PATCH] Add new shift_rot FU for shifts and rotates

---
 src/soc/shift_rot/formal/proof_main_stage.py | 105 ++++++++
 src/soc/shift_rot/main_stage.py              | 126 ++++++++++
 src/soc/shift_rot/maskgen.py                 |  47 ++++
 src/soc/shift_rot/pipeline.py                |  25 ++
 src/soc/shift_rot/rotator.py                 | 154 ++++++++++++
 src/soc/shift_rot/rotl.py                    |  24 ++
 src/soc/shift_rot/test/test_maskgen.py       |  41 ++++
 src/soc/shift_rot/test/test_pipe_caller.py   | 243 +++++++++++++++++++
 8 files changed, 765 insertions(+)
 create mode 100644 src/soc/shift_rot/formal/proof_main_stage.py
 create mode 100644 src/soc/shift_rot/main_stage.py
 create mode 100644 src/soc/shift_rot/maskgen.py
 create mode 100644 src/soc/shift_rot/pipeline.py
 create mode 100644 src/soc/shift_rot/rotator.py
 create mode 100644 src/soc/shift_rot/rotl.py
 create mode 100644 src/soc/shift_rot/test/test_maskgen.py
 create mode 100644 src/soc/shift_rot/test/test_pipe_caller.py

diff --git a/src/soc/shift_rot/formal/proof_main_stage.py b/src/soc/shift_rot/formal/proof_main_stage.py
new file mode 100644
index 00000000..2cb2b0d1
--- /dev/null
+++ b/src/soc/shift_rot/formal/proof_main_stage.py
@@ -0,0 +1,105 @@
+# Proof of correctness for the shift/rotate main pipeline stage
+# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+
+from nmigen import (Module, Signal, Elaboratable, Mux, Cat, Repl,
+                    signed)
+from nmigen.asserts import Assert, AnyConst, Assume, Cover
+from nmigen.test.utils import FHDLTestCase
+from nmigen.cli import rtlil
+
+from soc.shift_rot.main_stage import ShiftRotMainStage
+from soc.alu.pipe_data import ALUPipeSpec
+from soc.alu.alu_input_record import CompALUOpSubset
+from soc.decoder.power_enums import InternalOp
+import unittest
+
+
+# This defines a module to drive the device under test and assert
+# properties about its outputs
+class Driver(Elaboratable):
+    def __init__(self):
+        """Nothing to set up: every signal is created in elaborate()."""
+        pass
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        rec = CompALUOpSubset()
+        recwidth = 0
+        # Drive each port of the op record with AnyConst, summing the widths
+        for p in rec.ports():
+            width = p.width
+            recwidth += width
+            comb += p.eq(AnyConst(width))
+
+        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+        m.submodules.dut = dut = ShiftRotMainStage(pspec)
+
+        # convenience variables
+        a = dut.i.a
+        b = dut.i.b
+        carry_in = dut.i.carry_in
+        so_in = dut.i.so
+        carry_out = dut.o.carry_out  # NOTE: no assertion below checks this
+        o = dut.o.o
+
+        # drive the data inputs with AnyConst as well
+        comb += [a.eq(AnyConst(64)),
+                 b.eq(AnyConst(64)),
+                 carry_in.eq(AnyConst(1)),
+                 so_in.eq(AnyConst(1))]
+
+        comb += dut.i.ctx.op.eq(rec)
+
+        # Assert that op gets copied from the input to output
+        for rec_sig in rec.ports():
+            name = rec_sig.name
+            dut_sig = getattr(dut.o.ctx.op, name)
+            comb += Assert(dut_sig == rec_sig)
+
+        # signed and signed/32 versions of input a
+        a_signed = Signal(signed(64))
+        a_signed_32 = Signal(signed(32))
+        comb += a_signed.eq(a)
+        comb += a_signed_32.eq(a[0:32])
+
+        # main assertions: dut output must equal a plain shift of a by b
+        with m.Switch(rec.insn_type):
+            with m.Case(InternalOp.OP_SHL):
+                with m.If(rec.is_32bit):
+                    comb += Assert(o[0:32] == ((a << b[0:6]) & 0xffffffff))
+                    comb += Assert(o[32:64] == 0)
+                with m.Else():
+                    comb += Assert(o == ((a << b[0:7]) & ((1 << 64)-1)))
+            with m.Case(InternalOp.OP_SHR):
+                with m.If(~rec.is_signed):
+                    with m.If(rec.is_32bit):
+                        comb += Assert(o[0:32] == (a[0:32] >> b[0:6]))
+                        comb += Assert(o[32:64] == 0)
+                    with m.Else():
+                        comb += Assert(o == (a >> b[0:7]))
+                with m.Else():
+                    with m.If(rec.is_32bit):
+                        comb += Assert(o[0:32] == (a_signed_32 >> b[0:6]))
+                        comb += Assert(o[32:64] == Repl(a[31], 32))
+                    with m.Else():
+                        comb += Assert(o == (a_signed >> b[0:7]))
+
+        return m
+
+
+class ALUTestCase(FHDLTestCase):
+    def test_formal(self):
+        driver = Driver()
+        for mode in ("bmc", "cover"): # same driver, both solver modes
+            self.assertFormal(driver, mode=mode, depth=2)
+    def test_ilang(self):
+        # write the driver out as RTLIL to main_stage.il
+        ilang = rtlil.convert(Driver(), ports=[])
+        with open("main_stage.il", "w") as f:
+            f.write(ilang)
+
+
+if __name__ == '__main__':
+    unittest.main()  # runs ALUTestCase: the formal proof and the RTLIL dump
diff --git a/src/soc/shift_rot/main_stage.py b/src/soc/shift_rot/main_stage.py
new file mode 100644
index 00000000..09c32855
--- /dev/null
+++ b/src/soc/shift_rot/main_stage.py
@@ -0,0 +1,126 @@
+# This stage is intended to do most of the work of executing the shift
+# and rotate instructions: shift left, shift right (logical and
+# arithmetic) and rotate-with-mask, as well as carry generation. This
+# module however should not gate the carry or overflow, that's up to
+# the output stage
+from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
+from nmutil.pipemodbase import PipeModBase
+from soc.alu.pipe_data import ALUInputData, ALUOutputData
+from ieee754.part.partsig import PartitionedSignal
+from soc.decoder.power_enums import InternalOp
+from soc.shift_rot.maskgen import MaskGen
+from soc.shift_rot.rotl import ROTL
+
+from soc.decoder.power_fields import DecodeFields
+from soc.decoder.power_fieldsn import SignalBitRange
+
+
+class ShiftRotMainStage(PipeModBase):
+    """Main stage of the shift/rotate FU: handles OP_SHL, OP_SHR, OP_RLC."""
+    def __init__(self, pspec):
+        super().__init__(pspec, "main")
+        self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
+        self.fields.create_specs()
+
+    def ispec(self):
+        return ALUInputData(self.pspec)
+
+    def ospec(self):
+        return ALUOutputData(self.pspec) # TODO: ALUIntermediateData
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        fields = self.fields.instrs['M']
+        mb = Signal(fields['MB'][0:-1].shape())
+        comb += mb.eq(fields['MB'][0:-1])
+        me = Signal(fields['ME'][0:-1].shape())
+        comb += me.eq(fields['ME'][0:-1])
+
+        # check if op is 32-bit, and get sign bit from operand a
+        is_32bit = Signal(reset_less=True)
+        sign_bit = Signal(reset_less=True)
+        comb += is_32bit.eq(self.i.ctx.op.is_32bit)
+        comb += sign_bit.eq(Mux(is_32bit, self.i.a[31], self.i.a[63]))
+
+        # Signals for rotates and shifts
+        rotl_out = Signal.like(self.i.a)
+        mask = Signal.like(self.i.a)
+        m.submodules.maskgen = maskgen = MaskGen(64)
+        m.submodules.rotl = rotl = ROTL(64)
+        m.submodules.rotl32 = rotl32 = ROTL(32)
+        rotate_amt = Signal.like(rotl.b)
+
+        comb += [
+            rotl.a.eq(self.i.a),
+            rotl.b.eq(rotate_amt),
+            rotl32.a.eq(self.i.a[0:32]),
+            rotl32.b.eq(rotate_amt)]
+
+        with m.If(is_32bit):
+            comb += rotl_out.eq(Cat(rotl32.o, Repl(0, 32)))
+        with m.Else():
+            comb += rotl_out.eq(rotl.o)
+
+        ##########################
+        # main switch-statement for handling shift and rotate operations
+
+        with m.Switch(self.i.ctx.op.insn_type):
+            #### shift left ####
+            with m.Case(InternalOp.OP_SHL):
+                comb += maskgen.mb.eq(Mux(is_32bit, 32, 0))
+                comb += maskgen.me.eq(63-self.i.b[0:6])
+                comb += rotate_amt.eq(self.i.b[0:6])
+                with m.If(is_32bit):
+                    with m.If(self.i.b[5]):
+                        comb += mask.eq(0)
+                    with m.Else():
+                        comb += mask.eq(maskgen.o)
+                with m.Else():
+                    with m.If(self.i.b[6]):
+                        comb += mask.eq(0)
+                    with m.Else():
+                        comb += mask.eq(maskgen.o)
+                comb += self.o.o.eq(rotl_out & mask)
+
+            #### shift right ####
+            with m.Case(InternalOp.OP_SHR):
+                comb += maskgen.mb.eq(Mux(is_32bit, 32, 0) + self.i.b[0:6])
+                comb += maskgen.me.eq(63)
+                comb += rotate_amt.eq(64-self.i.b[0:6])
+                with m.If(is_32bit):
+                    with m.If(self.i.b[5]):
+                        comb += mask.eq(0)
+                    with m.Else():
+                        comb += mask.eq(maskgen.o)
+                with m.Else():
+                    with m.If(self.i.b[6]):
+                        comb += mask.eq(0)
+                    with m.Else():
+                        comb += mask.eq(maskgen.o)
+                with m.If(self.i.ctx.op.is_signed):
+                    out = rotl_out & mask | Mux(sign_bit, ~mask, 0)
+                    # carry: negative input and a 1-bit shifted out (~mask)
+                    cout = sign_bit & ((rotl_out & ~mask) != 0)
+                    comb += self.o.o.eq(out)
+                    comb += self.o.carry_out.eq(cout)
+                with m.Else():
+                    comb += self.o.o.eq(rotl_out & mask)
+
+            #### rotate left, then mask ####
+            with m.Case(InternalOp.OP_RLC):
+                with m.If(self.i.ctx.op.imm_data.imm_ok):
+                    comb += rotate_amt.eq(self.i.ctx.op.imm_data.imm[0:5])
+                with m.Else():
+                    comb += rotate_amt.eq(self.i.b[0:5])
+                comb += maskgen.mb.eq(mb+32)
+                comb += maskgen.me.eq(me+32)
+                comb += mask.eq(maskgen.o)
+                comb += self.o.o.eq((rotl_out & mask) | (self.i.b & ~mask))
+
+        ###### sticky overflow and context, both pass-through #####
+        comb += self.o.so.eq(self.i.so)
+        comb += self.o.ctx.eq(self.i.ctx)
+
+        return m
diff --git a/src/soc/shift_rot/maskgen.py b/src/soc/shift_rot/maskgen.py
new file mode 100644
index 00000000..89246e0b
--- /dev/null
+++ b/src/soc/shift_rot/maskgen.py
@@ -0,0 +1,47 @@
+from nmigen import (Elaboratable, Signal, Module)
+import math
+
+class MaskGen(Elaboratable):
+    """MaskGen - create a diff mask
+
+    x = width - mb and y = width - 1 - me, both wrapping to shiftwidth bits
+    example: x=5 --> a=0b11111
+             y=3 --> b=0b00111
+             o:        0b11000
+             x=2 --> a=0b00011
+             y=4 --> b=0b01111
+             o:        0b10011
+    """
+    def __init__(self, width):
+        self.width = width
+        self.shiftwidth = math.ceil(math.log2(width))
+        self.mb = Signal(self.shiftwidth, reset_less=True)
+        self.me = Signal(self.shiftwidth, reset_less=True)
+
+        self.o = Signal(width, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        x = Signal.like(self.mb)
+        y = Signal.like(self.mb)
+
+        comb += x.eq(self.width - self.mb) # truncated to shiftwidth bits
+        comb += y.eq(self.width - 1 - self.me)
+
+        mask_a = Signal.like(self.o)
+        mask_b = Signal.like(self.o)
+
+        comb += mask_a.eq((1<<x) - 1)
+        comb += mask_b.eq((1<<y) - 1)
+
+        with m.If(x > y):
+            comb += self.o.eq(mask_a ^ mask_b)
+        with m.Else():
+            comb += self.o.eq(mask_a ^ ~mask_b)
+
+        return m
+
+    def ports(self):
+        return [self.mb, self.me, self.o]
diff --git a/src/soc/shift_rot/pipeline.py b/src/soc/shift_rot/pipeline.py
new file mode 100644
index 00000000..eb62013a
--- /dev/null
+++ b/src/soc/shift_rot/pipeline.py
@@ -0,0 +1,25 @@
+from nmutil.singlepipe import ControlBase
+from nmutil.pipemodbase import PipeModBaseChain
+from soc.alu.input_stage import ALUInputStage
+from soc.shift_rot.main_stage import ShiftRotMainStage
+from soc.alu.output_stage import ALUOutputStage
+
+class ShiftRotStages(PipeModBaseChain):
+    def get_chain(self):
+        # input stage, then the shift/rotate main stage, then output stage
+        stage_types = (ALUInputStage, ShiftRotMainStage, ALUOutputStage)
+        chain = [stage(self.pspec) for stage in stage_types]
+        return chain
+
+
+class ShiftRotBasePipe(ControlBase):
+    def __init__(self, pspec):
+        super().__init__()
+        self.pipe1 = ShiftRotStages(pspec)
+        self._eqs = self.connect([self.pipe1]) # added to comb in elaborate
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+        m.submodules.pipe = self.pipe1
+        m.d.comb += self._eqs
+        return m
diff --git a/src/soc/shift_rot/rotator.py b/src/soc/shift_rot/rotator.py
new file mode 100644
index 00000000..7681692e
--- /dev/null
+++ b/src/soc/shift_rot/rotator.py
@@ -0,0 +1,154 @@
+# Manual translation and adaptation of rotator.vhdl from microwatt into nmigen
+#
+
+from nmigen import (Elaboratable, Signal, Module, Const, Cat)
+from soc.alu.rotl import ROTL
+
+# note BE bit numbering
+def right_mask(m, mask_begin):
+    """Return a 64-bit mask with BE bits mask_begin..63 set (none if >63)"""
+    ret = Signal(64, name="right_mask", reset_less=True)
+    m.d.comb += ret.eq(0)
+    for i in range(64):
+        with m.If(mask_begin <= i): # BE bit i is inside the mask
+            m.d.comb += ret[63 - i].eq(1)
+    return ret
+
+def left_mask(m, mask_end):
+    """Return a 64-bit mask with BE bits 0..mask_end set.
+    The mask is all zeros when bit 6 of mask_end is set."""
+    ret = Signal(64, name="left_mask", reset_less=True)
+    m.d.comb += ret.eq(0)
+    with m.If(mask_end[6] == 0): # hardware condition, not a Python return
+        for i in range(64):
+            with m.If(mask_end >= i): # BE bit i is inside the mask
+                m.d.comb += ret[63 - i].eq(1)
+    return ret
+
+
+class Rotator(Elaboratable):
+    """Rotator: covers multiple POWER9 rotate functions
+
+        supported modes:
+
+        * sl[wd]
+        * rlw*, rldic, rldicr, rldimi
+        * rldicl, sr[wd]
+        * sra[wd][i]
+
+        use as follows:
+
+        * shift = RB[0:7]
+        * arith = 1 when is_signed
+        * right_shift = 1 when insn_type is OP_SHR
+        * clear_left = 1 when insn_type is OP_RLC or OP_RLCL
+        * clear_right = 1 when insn_type is OP_RLC or OP_RLCR
+    """
+    def __init__(self):
+        # input
+        self.rs = Signal(64, reset_less=True)       # RS
+        self.ra = Signal(64, reset_less=True)       # RA
+        self.shift = Signal(7, reset_less=True)     # RB[0:7]
+        self.insn = Signal(32, reset_less=True)     # for mb and me fields
+        self.is_32bit = Signal(reset_less=True)
+        self.right_shift = Signal(reset_less=True)
+        self.arith = Signal(reset_less=True)
+        self.clear_left = Signal(reset_less=True)
+        self.clear_right = Signal(reset_less=True)
+        # output
+        self.result_o = Signal(64, reset_less=True)
+        self.carry_out_o = Signal(reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        ra, rs, shift = self.ra, self.rs, self.shift
+
+        # temporaries
+        repl32 = Signal(64, reset_less=True)
+        rot_count = Signal(6, reset_less=True)
+        rot = Signal(64, reset_less=True)
+        sh = Signal(7, reset_less=True)
+        mb = Signal(7, reset_less=True)
+        me = Signal(7, reset_less=True)
+        mr = Signal(64, reset_less=True)
+        ml = Signal(64, reset_less=True)
+        output_mode = Signal(2, reset_less=True)
+
+        # First replicate bottom 32 bits to both halves if 32-bit
+        comb += repl32.eq(rs) # 64-bit case: rs passes through unchanged
+        with m.If(self.is_32bit):
+            comb += repl32[32:64].eq(rs[0:32])
+
+        # Negate shift count for right shifts
+        with m.If(self.right_shift):
+            comb += rot_count.eq(-shift[0:6]) # truncates to 6 bits
+        with m.Else():
+            comb += rot_count.eq(shift[0:6])
+
+        # ROTL submodule
+        m.submodules.rotl = rotl = ROTL(64)
+        comb += rotl.a.eq(repl32)
+        comb += rotl.b.eq(rot_count)
+        comb += rot.eq(rotl.o)
+
+        # Trim shift count to 6 bits for 32-bit shifts
+        comb += sh.eq(Cat(shift[0:6], shift[6] & ~self.is_32bit))
+
+        # XXX errr... we should already have these, in Fields?  oh well
+        # Work out mask begin/end indexes (caution, big-endian bit numbering)
+
+        # mask-begin (mb)
+        with m.If(self.clear_left):
+            with m.If(self.is_32bit):
+                comb += mb.eq(Cat(self.insn[6:11], Const(0b01, 2)))
+            with m.Else():
+                comb += mb.eq(Cat(self.insn[6:11], self.insn[5], Const(0b0, 1)))
+        with m.Elif(self.right_shift):
+            # this is basically mb = sh + (is_32bit? 32: 0);
+            with m.If(self.is_32bit):
+                comb += mb.eq(Cat(sh[0:5], ~sh[5], sh[5]))
+            with m.Else():
+                comb += mb.eq(sh)
+        with m.Else():
+            comb += mb.eq(Cat(Const(0b0, 5), self.is_32bit, Const(0b0, 1)))
+
+        # mask-end (me)
+        with m.If(self.clear_right & self.is_32bit):
+            comb += me.eq(Cat(self.insn[1:6], Const(0b01, 2)))
+        with m.Elif(self.clear_right & ~self.clear_left):
+            comb += me.eq(Cat(self.insn[6:11], self.insn[5], Const(0b0, 1)))
+        with m.Else():
+            # effectively, 63 - sh (the trimmed count, not the raw shift)
+            comb += me.eq(Cat(~sh[0:6], sh[6]))
+
+        # Calculate left and right masks
+        comb += mr.eq(right_mask(m, mb))
+        comb += ml.eq(left_mask(m, me))
+
+        # Work out output mode
+        # 00 for sl[wd]
+        # 0w for rlw*, rldic, rldicr, rldimi, where w = 1 iff mb > me
+        # 10 for rldicl, sr[wd]
+        # 1z for sra[wd][i], z = 1 if rs is negative
+        with m.If((self.clear_left & ~self.clear_right) | self.right_shift):
+            comb += output_mode.eq(Cat(self.arith & repl32[63], Const(1, 1)))
+        with m.Else():
+            mbgt = self.clear_right & (mb[0:6] > me[0:6])
+            comb += output_mode.eq(Cat(mbgt, Const(0, 1)))
+
+        # Generate output from rotated input and masks
+        with m.Switch(output_mode):
+            with m.Case(0b00):
+                comb += self.result_o.eq((rot & (mr & ml)) | (ra & ~(mr & ml)))
+            with m.Case(0b01):
+                comb += self.result_o.eq((rot & (mr | ml)) | (ra & ~(mr | ml)))
+            with m.Case(0b10):
+                comb += self.result_o.eq(rot & mr)
+            with m.Case(0b11):
+                comb += self.result_o.eq(rot | ~mr)
+                # Carry for arithmetic shift right of -ve value: OR of all bits
+                comb += self.carry_out_o.eq((rs & ~ml).bool())
+
+        return m
+
diff --git a/src/soc/shift_rot/rotl.py b/src/soc/shift_rot/rotl.py
new file mode 100644
index 00000000..d2ebfcf7
--- /dev/null
+++ b/src/soc/shift_rot/rotl.py
@@ -0,0 +1,24 @@
+from nmigen import (Elaboratable, Signal, Module)
+import math
+
+class ROTL(Elaboratable):
+    """Rotate input a left by b bit positions, giving o (all width bits)"""
+    def __init__(self, width):
+        self.width = width
+        self.shiftwidth = math.ceil(math.log2(width))
+        self.a = Signal(width, reset_less=True)
+        self.b = Signal(self.shiftwidth, reset_less=True)
+        self.o = Signal(width, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # the two parts of the rotation: bits moved up, bits wrapped round
+        shl = Signal.like(self.a)
+        shr = Signal.like(self.a)
+
+        m.d.comb += [
+            shl.eq(self.a << self.b),
+            shr.eq(self.a >> (self.width - self.b)),
+            self.o.eq(shl | shr)]
+        return m
diff --git a/src/soc/shift_rot/test/test_maskgen.py b/src/soc/shift_rot/test/test_maskgen.py
new file mode 100644
index 00000000..f9d28d70
--- /dev/null
+++ b/src/soc/shift_rot/test/test_maskgen.py
@@ -0,0 +1,41 @@
+from nmigen import Signal, Module
+from nmigen.back.pysim import Simulator, Delay, Settle
+from nmigen.test.utils import FHDLTestCase
+from soc.alu.maskgen import MaskGen
+from soc.decoder.helpers import MASK
+import random
+import unittest
+
+class MaskGenTestCase(FHDLTestCase):
+    def test_maskgen(self):
+        m = Module()
+        comb = m.d.comb
+        m.submodules.dut = dut = MaskGen(64)
+        mb = Signal.like(dut.mb)
+        me = Signal.like(dut.me)
+        o = Signal.like(dut.o)
+
+        comb += [
+            dut.mb.eq(mb),
+            dut.me.eq(me),
+            o.eq(dut.o)]
+
+        sim = Simulator(m)
+
+        def process():
+            for x in range(0, 64):  # exhaustively try every mb/me pair
+                for y in range(0, 64):
+                    yield mb.eq(x)
+                    yield me.eq(y)
+                    yield Delay(1e-6)
+
+                    expected = MASK(x, y)  # reference value
+                    result = yield o
+                    self.assertEqual(expected, result)
+
+        sim.add_process(process) # no clock is added: plain (non-sync) process
+        with sim.write_vcd("maskgen.vcd", "maskgen.gtkw", traces=dut.ports()):
+            sim.run()
+
+if __name__ == '__main__':
+    unittest.main()  # runs MaskGenTestCase
diff --git a/src/soc/shift_rot/test/test_pipe_caller.py b/src/soc/shift_rot/test/test_pipe_caller.py
new file mode 100644
index 00000000..24a03c12
--- /dev/null
+++ b/src/soc/shift_rot/test/test_pipe_caller.py
@@ -0,0 +1,243 @@
+from nmigen import Module, Signal
+from nmigen.back.pysim import Simulator, Delay, Settle
+from nmigen.test.utils import FHDLTestCase
+from nmigen.cli import rtlil
+import unittest
+from soc.decoder.isa.caller import ISACaller, special_sprs
+from soc.decoder.power_decoder import (create_pdecode)
+from soc.decoder.power_decoder2 import (PowerDecode2)
+from soc.decoder.power_enums import (XER_bits)
+from soc.decoder.selectable_int import SelectableInt
+from soc.simulator.program import Program
+from soc.decoder.isa.all import ISA
+
+
+from soc.shift_rot.pipeline import ShiftRotBasePipe
+from soc.alu.alu_input_record import CompALUOpSubset
+from soc.alu.pipe_data import ALUPipeSpec
+import random
+
+class TestCase:
+    """Bundle a program with its initial registers, SPRs and a test name"""
+    def __init__(self, program, regs, sprs, name):
+        self.program, self.regs = program, regs
+        self.sprs = sprs
+        self.name = name
+
+def get_rec_width(rec):
+    """Return the summed width of every port of the record rec.
+
+    The result is passed as op_wid when building the pipeline spec.
+    """
+    port_widths = (port.width for port in rec.ports())
+    return sum(port_widths)
+
+def set_alu_inputs(alu, dec2, sim):
+    """Drive the ALU's a and b operands from the decoded register reads.
+
+    The decoder's read_reg3, read_reg1 and read_reg2 are examined in
+    that order; each one flagged ok contributes the value of the
+    simulator GPR it selects.  Missing operands are driven as zero.
+    With more than two operands nothing is driven at all.
+    """
+
+    # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
+    # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
+    # and place it into data_i.b
+    inputs = []
+    for read_reg in (dec2.e.read_reg3, dec2.e.read_reg1, dec2.e.read_reg2):
+        selected = yield read_reg.ok
+        if selected:
+            reg_sel = yield read_reg.data
+            inputs.append(sim.gpr(reg_sel).value)
+
+    print(inputs)
+
+    # three operands: leave a and b alone
+    if len(inputs) > 2:
+        return
+
+    # pad with zeros so that a and b are always both driven
+    operand_a, operand_b = (inputs + [0, 0])[:2]
+    data_i = alu.p.data_i
+    yield data_i.a.eq(operand_a)
+    yield data_i.b.eq(operand_b)
+
+def set_extra_alu_inputs(alu, dec2, sim):
+    """Drive carry_in and so from the XER CA and SO bits held by sim."""
+    data_i = alu.p.data_i
+    yield data_i.carry_in.eq(int(bool(sim.spr['XER'][XER_bits['CA']])))
+    yield data_i.so.eq(int(bool(sim.spr['XER'][XER_bits['SO']])))
+    
+
+# This test bench is a bit different than is usual. Initially when I
+# was writing it, I had all of the tests call a function to create a
+# device under test and simulator, initialize the dut, run the
+# simulation for ~2 cycles, and assert that the dut output what it
+# should have. However, this was really slow, since it needed to
+# create and tear down the dut and simulator for every test case.
+
+# Now, instead of doing that, every test case in ALUTestCase puts some
+# data into the test_data list below, describing the instructions to
+# be tested and the initial state. Once all the tests have been run,
+# test_data gets passed to TestRunner which then sets up the DUT and
+# simulator once, runs all the data through it, and asserts that the
+# results match the pseudocode sim at every cycle.
+
+# By doing this, I've reduced the time it takes to run the test suite
+# massively. Before, it took around 1 minute on my computer, now it
+# takes around 3 seconds
+
+test_data = []
+
+
+class ALUTestCase(FHDLTestCase):
+    """Queues shift/rotate test programs in test_data for TestRunner."""
+    def __init__(self, name):
+        super().__init__(name)
+        self.test_name = name
+    def run_tst_program(self, prog, initial_regs=None, initial_sprs=None):
+        regs = [0] * 32 if initial_regs is None else initial_regs
+        sprs = {} if initial_sprs is None else initial_sprs
+        tc = TestCase(prog, regs, sprs, self.test_name)
+        test_data.append(tc)
+
+    def test_shift(self):
+        insns = ["slw", "sld", "srw", "srd", "sraw", "srad"]
+        for i in range(20):
+            choice = random.choice(insns)
+            lst = [f"{choice} 3, 1, 2"]
+            initial_regs = [0] * 32
+            initial_regs[1] = random.randint(0, (1<<64)-1)
+            initial_regs[2] = random.randint(0, 63)
+            print(initial_regs[1], initial_regs[2])
+            self.run_tst_program(Program(lst), initial_regs)
+
+    def test_shift_arith(self):
+        lst = ["sraw 3, 1, 2"]
+        initial_regs = [0] * 32
+        initial_regs[1] = random.randint(0, (1<<64)-1)
+        initial_regs[2] = random.randint(0, 63)
+        print(initial_regs[1], initial_regs[2])
+        self.run_tst_program(Program(lst), initial_regs)
+
+    def test_rlwinm(self):
+        for i in range(10):
+            mb = random.randint(0,31)
+            me = random.randint(0,31)
+            sh = random.randint(0,31)
+            lst = [f"rlwinm 3, 1, {sh}, {mb}, {me}"] # order: SH, MB, ME
+            initial_regs = [0] * 32
+            initial_regs[1] = random.randint(0, (1<<64)-1)
+            self.run_tst_program(Program(lst), initial_regs)
+
+    def test_rlwimi(self):
+        lst = ["rlwimi 3, 1, 5, 20, 6"]
+        initial_regs = [0] * 32
+        initial_regs[1] = 0xdeadbeef
+        initial_regs[3] = 0x12345678
+        self.run_tst_program(Program(lst), initial_regs)
+
+    def test_rlwnm(self):
+        lst = ["rlwnm 3, 1, 2, 20, 6"]
+        initial_regs = [0] * 32
+        initial_regs[1] = random.randint(0, (1<<64)-1)
+        initial_regs[2] = random.randint(0, 63)
+        self.run_tst_program(Program(lst), initial_regs)
+
+    def test_ilang(self):
+        rec = CompALUOpSubset()
+        pspec = ALUPipeSpec(id_wid=2, op_wid=get_rec_width(rec))
+        alu = ShiftRotBasePipe(pspec)
+        vl = rtlil.convert(alu, ports=[])
+        with open("pipeline.il", "w") as f:
+            f.write(vl)
+
+
+class TestRunner(FHDLTestCase):
+    def __init__(self, test_data):
+        super().__init__("run_all")
+        self.test_data = test_data
+
+    def run_all(self):
+        m = Module()
+        comb = m.d.comb
+        instruction = Signal(32)
+
+        pdecode = create_pdecode()
+
+        m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
+
+        rec = CompALUOpSubset()
+
+        pspec = ALUPipeSpec(id_wid=2, op_wid=get_rec_width(rec))
+        m.submodules.alu = alu = ShiftRotBasePipe(pspec)
+
+        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.e)
+        comb += alu.p.valid_i.eq(1)
+        comb += alu.n.ready_i.eq(1)
+        comb += pdecode2.dec.raw_opcode_in.eq(instruction)
+        sim = Simulator(m)
+
+        sim.add_clock(1e-6)
+        def process():
+            for test in self.test_data:
+                print(test.name)
+                program = test.program
+                self.subTest(test.name)  # NOTE(review): no "with" - unused?
+                simulator = ISA(pdecode2, test.regs, test.sprs)
+                gen = program.generate_instructions()
+                instructions = list(zip(gen, program.assembly.splitlines()))
+
+                index = simulator.pc.CIA.value//4  # PC/4 indexes instructions
+                while index < len(instructions):
+                    ins, code = instructions[index]
+
+                    print("0x{:X}".format(ins & 0xffffffff))
+                    print(code)
+
+                    # ask the decoder to decode this binary data (endian'd)
+                    yield pdecode2.dec.bigendian.eq(0)  # little / big?
+                    yield instruction.eq(ins)          # raw binary instr.
+                    yield Settle()
+                    yield from set_alu_inputs(alu, pdecode2, simulator)
+                    yield from set_extra_alu_inputs(alu, pdecode2, simulator)
+                    yield  # NOTE(review): presumably waits one clock - confirm
+                    opname = code.split(' ')[0]
+                    yield from simulator.call(opname)
+                    index = simulator.pc.CIA.value//4
+
+                    vld = yield alu.n.valid_o
+                    while not vld:  # spin until the ALU flags valid output
+                        yield
+                        vld = yield alu.n.valid_o
+                    yield
+                    alu_out = yield alu.n.data_o.o
+                    out_reg_valid = yield pdecode2.e.write_reg.ok
+                    if out_reg_valid:
+                        write_reg_idx = yield pdecode2.e.write_reg.data
+                        expected = simulator.gpr(write_reg_idx).value
+                        print(f"expected {expected:x}, actual: {alu_out:x}")
+                        self.assertEqual(expected, alu_out)
+                    yield from self.check_extra_alu_outputs(alu, pdecode2,
+                                                            simulator)
+
+        sim.add_sync_process(process)
+        with sim.write_vcd("simulator.vcd", "simulator.gtkw",
+                            traces=[]):
+            sim.run()
+    def check_extra_alu_outputs(self, alu, dec2, sim):
+        rc = yield dec2.e.rc.data  # cr0 is only compared when rc is set
+        if rc:
+            cr_expected = sim.crl[0].get_range().value
+            cr_actual = yield alu.n.data_o.cr0
+            self.assertEqual(cr_expected, cr_actual)
+
+
+if __name__ == "__main__":
+    unittest.main(exit=False)  # exit=False so the code below still runs
+    suite = unittest.TestSuite()
+    suite.addTest(TestRunner(test_data))  # test_data was filled in above
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
-- 
2.30.2