From 0ffe4f4ed1a544badb243da322ee8182e965ff05 Mon Sep 17 00:00:00 2001
From: Michael Nolan <mtnolan2640@gmail.com>
Date: Fri, 15 May 2020 13:40:08 -0400
Subject: [PATCH] Add rudimentary branch unit test bench

---
 src/soc/branch/main_stage.py            |  88 ++---------------
 src/soc/branch/pipe_data.py             |  45 ++++++---
 src/soc/branch/pipeline.py              |  16 ++--
 src/soc/branch/test/test_pipe_caller.py | 120 +-----------------------
 src/soc/decoder/power_enums.py          |   1 +
 5 files changed, 51 insertions(+), 219 deletions(-)

diff --git a/src/soc/branch/main_stage.py b/src/soc/branch/main_stage.py
index b50afc27..e4e522b6 100644
--- a/src/soc/branch/main_stage.py
+++ b/src/soc/branch/main_stage.py
@@ -7,11 +7,8 @@
 
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
-from soc.logical.pipe_data import ALUInputData
-from soc.alu.pipe_data import ALUOutputData
-from ieee754.part.partsig import PartitionedSignal
+from soc.branch.pipe_data import BranchInputData, BranchOutputData
 from soc.decoder.power_enums import InternalOp
-from soc.countzero.countzero import ZeroCounter
 
 from soc.decoder.power_fields import DecodeFields
 from soc.decoder.power_fieldsn import SignalBitRange
@@ -24,105 +21,32 @@ def array_of(count, bitwidth):
     return res
 
 
-class LogicalMainStage(PipeModBase):
+class BranchMainStage(PipeModBase):
     def __init__(self, pspec):
         super().__init__(pspec, "main")
         self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
         self.fields.create_specs()
 
     def ispec(self):
-        return ALUInputData(self.pspec)
+        return BranchInputData(self.pspec)
 
     def ospec(self):
-        return ALUOutputData(self.pspec) # TODO: ALUIntermediateData
+        return BranchOutputData(self.pspec) # TODO: ALUIntermediateData
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
-        op, a, b, o = self.i.ctx.op, self.i.a, self.i.b, self.o.o
+        op = self.i.ctx.op
 
         ##########################
         # main switch for logic ops AND, OR and XOR, cmpb, parity, and popcount
 
         with m.Switch(op.insn_type):
+            pass
 
-            ###### AND, OR, XOR #######
-            with m.Case(InternalOp.OP_AND):
-                comb += o.eq(a & b)
-            with m.Case(InternalOp.OP_OR):
-                comb += o.eq(a | b)
-            with m.Case(InternalOp.OP_XOR):
-                comb += o.eq(a ^ b)
-
-            ###### cmpb #######
-            with m.Case(InternalOp.OP_CMPB):
-                l = []
-                for i in range(8):
-                    slc = slice(i*8, (i+1)*8)
-                    l.append(Repl(a[slc] == b[slc], 8))
-                comb += o.eq(Cat(*l))
-
-            ###### popcount #######
-            with m.Case(InternalOp.OP_POPCNT):
-                # starting from a, perform successive addition-reductions
-                # creating arrays big enough to store the sum, each time
-                pc = [a]
-                # QTY32 2-bit (to take 2x 1-bit sums) etc.
-                work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 6)]
-                for l, b in work:
-                    pc.append(array_of(l, b))
-                pc8 = pc[3]     # array of 8 8-bit counts (popcntb)
-                pc32 = pc[5]    # array of 2 32-bit counts (popcntw)
-                popcnt = pc[-1] # array of 1 64-bit count (popcntd)
-                # cascade-tree of adds
-                for idx, (l, b) in enumerate(work):
-                    for i in range(l):
-                        stt, end = i*2, i*2+1
-                        src, dst = pc[idx], pc[idx+1]
-                        comb += dst[i].eq(Cat(src[stt], Const(0, 1)) +
-                                          Cat(src[end], Const(0, 1)))
-                # decode operation length
-                with m.If(op.data_len[2:4] == 0b00):
-                    # popcntb - pack 8x 4-bit answers into output
-                    for i in range(8):
-                        comb += o[i*8:i*8+4].eq(pc8[i])
-                with m.Elif(op.data_len[3] == 0):
-                    # popcntw - pack 2x 5-bit answers into output
-                    for i in range(2):
-                        comb += o[i*32:i*32+5].eq(pc32[i])
-                with m.Else():
-                    # popcntd - put 1x 6-bit answer into output
-                    comb += o.eq(popcnt[0])
-
-            ###### parity #######
-            with m.Case(InternalOp.OP_PRTY):
-                # strange instruction which XORs together the LSBs of each byte
-                par0 = Signal(reset_less=True)
-                par1 = Signal(reset_less=True)
-                comb += par0.eq(Cat(a[0] , a[8] , a[16], a[24]).xor())
-                comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
-                with m.If(op.data_len[3] == 1):
-                    comb += o.eq(par0 ^ par1)
-                with m.Else():
-                    comb += o[0].eq(par0)
-                    comb += o[32].eq(par1)
-
-            ###### cntlz #######
-            with m.Case(InternalOp.OP_CNTZ):
-                x_fields = self.fields.instrs['X']
-                XO = Signal(x_fields['XO'][0:-1].shape())
-                m.submodules.countz = countz = ZeroCounter()
-                comb += countz.rs_i.eq(a)
-                comb += countz.is_32bit_i.eq(op.is_32bit)
-                comb += countz.count_right_i.eq(XO[-1])
-                comb += o.eq(countz.result_o)
-
-            ###### bpermd #######
-            # TODO with m.Case(InternalOp.OP_BPERM): - not in microwatt
 
         ###### sticky overflow and context, both pass-through #####
 
-        comb += self.o.so.eq(self.i.so)
         comb += self.o.ctx.eq(self.i.ctx)
 
         return m
diff --git a/src/soc/branch/pipe_data.py b/src/soc/branch/pipe_data.py
index 34d9c0ae..26a4aae4 100644
--- a/src/soc/branch/pipe_data.py
+++ b/src/soc/branch/pipe_data.py
@@ -1,5 +1,6 @@
 from nmigen import Signal, Const
 from ieee754.fpcommon.getop import FPPipeContext
+from soc.decoder.power_decoder2 import Data
 
 
 class IntegerData:
@@ -15,23 +16,43 @@ class IntegerData:
         return [self.ctx.eq(i.ctx)]
 
 
-class ALUInputData(IntegerData):
+class BranchInputData(IntegerData):
     def __init__(self, pspec):
         super().__init__(pspec)
-        self.a = Signal(64, reset_less=True) # RA
-        self.b = Signal(64, reset_less=True) # RB/immediate
-        self.so = Signal(reset_less=True)
-        self.carry_in = Signal(reset_less=True)
+        # We need both lr and spr for bclr and bcctrl. Bclr can read
+        # from both ctr and lr, and bcctrl can write to both ctr and
+        # lr.
+        self.lr = Signal(64, reset_less=True)
+        self.spr = Signal(64, reset_less=True)
+        self.cr = Signal(32, reset_less=True)
+        # NIA not needed, it's already part of ctx
 
     def __iter__(self):
         yield from super().__iter__()
-        yield self.a
-        yield self.b
-        yield self.carry_in
-        yield self.so
+        yield self.lr
+        yield self.spr
+        yield self.cr
 
     def eq(self, i):
         lst = super().eq(i)
-        return lst + [self.a.eq(i.a), self.b.eq(i.b),
-                      self.carry_in.eq(i.carry_in),
-                      self.so.eq(i.so)]
+        return lst + [self.lr.eq(i.lr), self.spr.eq(i.spr),
+                      self.cr.eq(i.cr)]  # each field copies its namesake
+
+
+class BranchOutputData(IntegerData):  # returned by BranchMainStage.ospec()
+    def __init__(self, pspec):
+        super().__init__(pspec)
+        self.lr = Signal(64, reset_less=True)  # same width as input lr
+        self.spr = Signal(64, reset_less=True)  # same width as input spr
+        self.nia_out = Data(64, name="nia_out")  # presumably NIA; confirm
+
+    def __iter__(self):  # base-class fields first, then this class's own
+        yield from super().__iter__()
+        yield self.lr
+        yield self.spr
+        yield from self.nia_out  # Data is iterated, not yielded whole
+
+    def eq(self, i):  # returns a list of assignments copying i into self
+        lst = super().eq(i)
+        return lst + [self.lr.eq(i.lr), self.spr.eq(i.spr),
+                      self.nia_out.eq(i.nia_out)]
diff --git a/src/soc/branch/pipeline.py b/src/soc/branch/pipeline.py
index f3c83276..ac132f74 100644
--- a/src/soc/branch/pipeline.py
+++ b/src/soc/branch/pipeline.py
@@ -1,21 +1,17 @@
 from nmutil.singlepipe import ControlBase
 from nmutil.pipemodbase import PipeModBaseChain
-from soc.alu.input_stage import ALUInputStage
-from soc.logical.main_stage import LogicalMainStage
-from soc.alu.output_stage import ALUOutputStage
+from soc.branch.main_stage import BranchMainStage
 
-class LogicalStages(PipeModBaseChain):
+class BranchStages(PipeModBaseChain):
     def get_chain(self):
-        inp = ALUInputStage(self.pspec)
-        main = LogicalMainStage(self.pspec)
-        out = ALUOutputStage(self.pspec)
-        return [inp, main, out]
+        main = BranchMainStage(self.pspec)
+        return [main]
 
 
-class LogicalBasePipe(ControlBase):
+class BranchBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
-        self.pipe1 = LogicalStages(pspec)
+        self.pipe1 = BranchStages(pspec)
         self._eqs = self.connect([self.pipe1])
 
     def elaborate(self, platform):
diff --git a/src/soc/branch/test/test_pipe_caller.py b/src/soc/branch/test/test_pipe_caller.py
index d540e7a7..dbc96b11 100644
--- a/src/soc/branch/test/test_pipe_caller.py
+++ b/src/soc/branch/test/test_pipe_caller.py
@@ -12,7 +12,7 @@ from soc.simulator.program import Program
 from soc.decoder.isa.all import ISA
 
 
-from soc.logical.pipeline import LogicalBasePipe
+from soc.branch.pipeline import BranchBasePipe
 from soc.alu.alu_input_record import CompALUOpSubset
 from soc.alu.pipe_data import ALUPipeSpec
 import random
@@ -33,45 +33,6 @@ def get_rec_width(rec):
         recwidth += width
     return recwidth
 
-def set_alu_inputs(alu, dec2, sim):
-    # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
-    # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
-
-    reg3_ok = yield dec2.e.read_reg3.ok
-    reg1_ok = yield dec2.e.read_reg1.ok
-    assert reg3_ok != reg1_ok
-    if reg3_ok:
-        data1 = yield dec2.e.read_reg3.data
-        data1 = sim.gpr(data1).value
-    elif reg1_ok:
-        data1 = yield dec2.e.read_reg1.data
-        data1 = sim.gpr(data1).value
-    else:
-        data1 = 0
-
-    yield alu.p.data_i.a.eq(data1)
-
-    # If there's an immediate, set the B operand to that
-    reg2_ok = yield dec2.e.read_reg2.ok
-    imm_ok = yield dec2.e.imm_data.imm_ok
-    if imm_ok:
-        data2 = yield dec2.e.imm_data.imm
-    elif reg2_ok:
-        data2 = yield dec2.e.read_reg2.data
-        data2 = sim.gpr(data2).value
-    else:
-        data2 = 0
-    yield alu.p.data_i.b.eq(data2)
-
-
-
-def set_extra_alu_inputs(alu, dec2, sim):
-    carry = 1 if sim.spr['XER'][XER_bits['CA']] else 0
-    yield alu.p.data_i.carry_in.eq(carry)
-    so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
-    yield alu.p.data_i.so.eq(so)
-    
 
 # This test bench is a bit different than is usual. Initially when I
 # was writing it, I had all of the tests call a function to create a
@@ -102,71 +63,16 @@ class LogicalTestCase(FHDLTestCase):
         tc = TestCase(prog, initial_regs, initial_sprs, self.test_name)
         test_data.append(tc)
 
-    def test_rand(self):
-        insns = ["and", "or", "xor"]
-        for i in range(40):
-            choice = random.choice(insns)
-            lst = [f"{choice} 3, 1, 2"]
-            initial_regs = [0] * 32
-            initial_regs[1] = random.randint(0, (1<<64)-1)
-            initial_regs[2] = random.randint(0, (1<<64)-1)
-            self.run_tst_program(Program(lst), initial_regs)
-
-    def test_rand_imm_logical(self):
-        insns = ["andi.", "andis.", "ori", "oris", "xori", "xoris"]
-        for i in range(10):
-            choice = random.choice(insns)
-            imm = random.randint(0, (1<<16)-1)
-            lst = [f"{choice} 3, 1, {imm}"]
-            print(lst)
-            initial_regs = [0] * 32
-            initial_regs[1] = random.randint(0, (1<<64)-1)
-            self.run_tst_program(Program(lst), initial_regs)
-
-    @unittest.skip("broken")
-    def test_cntz(self):
-        insns = ["cntlzd", "cnttzd"]
-        for i in range(10):
-            choice = random.choice(insns)
-            lst = [f"{choice} 3, 1"]
-            print(lst)
-            initial_regs = [0] * 32
-            initial_regs[1] = random.randint(0, (1<<64)-1)
-            self.run_tst_program(Program(lst), initial_regs)
-
-    def test_parity(self):
-        insns = ["prtyw", "prtyd"]
-        for i in range(10):
-            choice = random.choice(insns)
-            lst = [f"{choice} 3, 1"]
-            print(lst)
-            initial_regs = [0] * 32
-            initial_regs[1] = random.randint(0, (1<<64)-1)
-            self.run_tst_program(Program(lst), initial_regs)
-
-    @unittest.skip("broken")
-    def test_popcnt(self):
-        insns = ["popcntb", "popcntw", "popcntd"]
-        for i in range(10):
-            choice = random.choice(insns)
-            lst = [f"{choice} 3, 1"]
-            print(lst)
-            initial_regs = [0] * 32
-            initial_regs[1] = random.randint(0, (1<<64)-1)
-            self.run_tst_program(Program(lst), initial_regs)
-
     def test_cmpb(self):
-        lst = ["cmpb 3, 1, 2"]
+        lst = ["b 0x1234"]
         initial_regs = [0] * 32
-        initial_regs[1] = 0xdeadbeefcafec0de
-        initial_regs[2] = 0xd0adb0000afec1de
         self.run_tst_program(Program(lst), initial_regs)
 
     def test_ilang(self):
         rec = CompALUOpSubset()
 
         pspec = ALUPipeSpec(id_wid=2, op_wid=get_rec_width(rec))
-        alu = LogicalBasePipe(pspec)
+        alu = BranchBasePipe(pspec)
         vl = rtlil.convert(alu, ports=[])
         with open("logical_pipeline.il", "w") as f:
             f.write(vl)
@@ -189,7 +95,7 @@ class TestRunner(FHDLTestCase):
         rec = CompALUOpSubset()
 
         pspec = ALUPipeSpec(id_wid=2, op_wid=get_rec_width(rec))
-        m.submodules.alu = alu = LogicalBasePipe(pspec)
+        m.submodules.alu = alu = BranchBasePipe(pspec)
 
         comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.e)
         comb += alu.p.valid_i.eq(1)
@@ -219,28 +125,12 @@ class TestRunner(FHDLTestCase):
                     yield instruction.eq(ins)          # raw binary instr.
                     yield Settle()
                     fn_unit = yield pdecode2.e.fn_unit
-                    self.assertEqual(fn_unit, Function.LOGICAL.value, code)
-                    yield from set_alu_inputs(alu, pdecode2, simulator)
-                    yield from set_extra_alu_inputs(alu, pdecode2, simulator)
+                    self.assertEqual(fn_unit, Function.BRANCH.value, code)
                     yield 
                     opname = code.split(' ')[0]
                     yield from simulator.call(opname)
                     index = simulator.pc.CIA.value//4
 
-                    vld = yield alu.n.valid_o
-                    while not vld:
-                        yield
-                        vld = yield alu.n.valid_o
-                    yield
-                    alu_out = yield alu.n.data_o.o
-                    out_reg_valid = yield pdecode2.e.write_reg.ok
-                    if out_reg_valid:
-                        write_reg_idx = yield pdecode2.e.write_reg.data
-                        expected = simulator.gpr(write_reg_idx).value
-                        print(f"expected {expected:x}, actual: {alu_out:x}")
-                        self.assertEqual(expected, alu_out, code)
-                    yield from self.check_extra_alu_outputs(alu, pdecode2,
-                                                            simulator)
 
         sim.add_sync_process(process)
         with sim.write_vcd("simulator.vcd", "simulator.gtkw",
diff --git a/src/soc/decoder/power_enums.py b/src/soc/decoder/power_enums.py
index 50a91855..b1e9b326 100644
--- a/src/soc/decoder/power_enums.py
+++ b/src/soc/decoder/power_enums.py
@@ -49,6 +49,7 @@ class Function(Enum):
     LDST = 2
     SHIFT_ROT = 3
     LOGICAL = 4
+    BRANCH = 5
 
 
 @unique
-- 
2.30.2