From f2fe0c00f38b95ab76df6110f9d0d868d7200e96 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Mon, 6 Jul 2020 16:34:31 +0100
Subject: [PATCH] add first cut at fu mul pipeline

---
 src/soc/fu/mul/main_stage.py | 74 +++++++-------------------------
 src/soc/fu/mul/pipe_data.py  | 27 +++++++++---
 src/soc/fu/mul/pipeline.py   | 20 ++++++---
 src/soc/fu/mul/post_stage.py | 81 ++++++++++++++++++++++++++++++++++++
 src/soc/fu/mul/pre_stage.py  | 53 +++++++++++++++++++++++
 5 files changed, 187 insertions(+), 68 deletions(-)
 create mode 100644 src/soc/fu/mul/post_stage.py
 create mode 100644 src/soc/fu/mul/pre_stage.py

diff --git a/src/soc/fu/mul/main_stage.py b/src/soc/fu/mul/main_stage.py
index ea40da35..97ba81d7 100644
--- a/src/soc/fu/mul/main_stage.py
+++ b/src/soc/fu/mul/main_stage.py
@@ -1,79 +1,37 @@
-# This stage is intended to do most of the work of executing multiply
-# instructions, as well as carry and overflow generation. This module
-# however should not gate the carry or overflow, that's up to the
-# output stage
-from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
+# This stage is intended to do the main work of an actual multiply
+
+from nmigen import Module
 from nmutil.pipemodbase import PipeModBase
-from soc.fu.alu.pipe_data import ALUOutputData
-from soc.fu.mul.pipe_data import MulInputData
+from soc.fu.mul.pipe_data import MulIntermediateData, MulOutputData
 from ieee754.part.partsig import PartitionedSignal
-from soc.decoder.power_enums import InternalOp
-from soc.fu.shift_rot.rotator import Rotator
-
-from soc.decoder.power_fields import DecodeFields
-from soc.decoder.power_fieldsn import SignalBitRange
 
 
-class ShiftRotMainStage(PipeModBase):
+class MulMainStage2(PipeModBase):
+    """Middle multiply stage: performs the actual a * b multiply."""
     def __init__(self, pspec):
-        super().__init__(pspec, "main")
-        self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
-        self.fields.create_specs()
+        super().__init__(pspec, "mul2")
 
     def ispec(self):
-        return MulInputData(self.pspec)
+        return MulIntermediateData(self.pspec) # pipeline stage input format
 
     def ospec(self):
-        return ALUOutputData(self.pspec)
+        return MulOutputData(self.pspec) # pipeline stage output format
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
 
-        # obtain me and mb fields from instruction.
-        m_fields = self.fields.instrs['M']
-        md_fields = self.fields.instrs['MD']
-        mb = Signal(m_fields['MB'][0:-1].shape())
-        me = Signal(m_fields['ME'][0:-1].shape())
-        mb_extra = Signal(1, reset_less=True)
-        comb += mb.eq(m_fields['MB'][0:-1])
-        comb += me.eq(m_fields['ME'][0:-1])
-        comb += mb_extra.eq(md_fields['mb'][0:-1][0])
-
-        # set up microwatt rotator module
-        m.submodules.rotator = rotator = Rotator()
-        comb += [
-            rotator.me.eq(me),
-            rotator.mb.eq(mb),
-            rotator.mb_extra.eq(mb_extra),
-            rotator.rs.eq(self.i.rs),
-            rotator.ra.eq(self.i.ra),
-            rotator.shift.eq(self.i.rb),
-            rotator.is_32bit.eq(self.i.ctx.op.is_32bit),
-            rotator.arith.eq(self.i.ctx.op.is_signed),
-        ]
+        # convenience variables
+        a, b, o = self.i.a, self.i.b, self.o.o
 
-        # instruction rotate type
-        mode = Signal(3, reset_less=True)
-        with m.Switch(self.i.ctx.op.insn_type):
-            with m.Case(InternalOp.OP_SHL):  comb += mode.eq(0b000)
-            with m.Case(InternalOp.OP_SHR):  comb += mode.eq(0b001) # R-shift
-            with m.Case(InternalOp.OP_RLC):  comb += mode.eq(0b110) # clear LR
-            with m.Case(InternalOp.OP_RLCL): comb += mode.eq(0b010) # clear L
-            with m.Case(InternalOp.OP_RLCR): comb += mode.eq(0b100) # clear R
+        # actual multiply (TODO: split into stages)
+        comb += o.eq(a * b)
 
-        comb += Cat(rotator.right_shift,
-                    rotator.clear_left,
-                    rotator.clear_right).eq(mode)
-                
-        # outputs from the microwatt rotator module
-        # XXX TODO: carry32
-        comb += [self.o.o.eq(rotator.result_o),
-                 self.o.xer_ca[0].eq(rotator.carry_out_o)]
-
-        ###### sticky overflow and context, both pass-through #####
+        ###### xer and context, all pass-through #####
 
+        comb += self.o.xer_ca.data.eq(self.i.xer_ca)
+        comb += self.o.neg_res.eq(self.i.neg_res) # plain Signal: no .data
         comb += self.o.xer_so.data.eq(self.i.xer_so)
         comb += self.o.ctx.eq(self.i.ctx)
 
         return m
diff --git a/src/soc/fu/mul/pipe_data.py b/src/soc/fu/mul/pipe_data.py
index 495d503b..429be008 100644
--- a/src/soc/fu/mul/pipe_data.py
+++ b/src/soc/fu/mul/pipe_data.py
@@ -1,10 +1,28 @@
+from nmigen import Signal
 from soc.fu.alu.alu_input_record import CompALUOpSubset
 from soc.fu.pipe_data import IntegerData, CommonPipeSpec
-from soc.fu.alu.pipe_data import ALUOutputData
-from soc.fu.shift_rot.pipe_data import ShoftRotInputData
+from soc.fu.alu.pipe_data import ALUOutputData, ALUInputData
 
 
-# TODO: replace CompALUOpSubset with CompShiftRotOpSubset
-class ShiftRotPipeSpec(CommonPipeSpec):
-    regspec = (ShiftRotInputData.regspec, ALUOutputData.regspec)
+class MulIntermediateData(ALUInputData):
+    def __init__(self, pspec):
+        super().__init__(pspec)
+        # sign of the final result: the pipeline stages access it as .neg_res
+        self.neg_res = Signal(reset_less=True)
+        self.data.append(self.neg_res)
+
+
+class MulOutputData(IntegerData):
+    regspec = [('INT', 'o', '0:128'),
+               ('XER', 'xer_so', '32'), # XER bit 32: SO
+               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+    def __init__(self, pspec):
+        super().__init__(pspec, False)
+        # sign of the final result: the pipeline stages access it as .neg_res
+        self.neg_res = Signal(reset_less=True)
+        self.data.append(self.neg_res)
+
+
+class MulPipeSpec(CommonPipeSpec):
+    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
     opsubsetkls = CompALUOpSubset
diff --git a/src/soc/fu/mul/pipeline.py b/src/soc/fu/mul/pipeline.py
index e726d170..d32d7529 100644
--- a/src/soc/fu/mul/pipeline.py
+++ b/src/soc/fu/mul/pipeline.py
@@ -3,18 +3,29 @@ from nmutil.pipemodbase import PipeModBaseChain
 from soc.fu.shift_rot.input_stage import ShiftRotInputStage
 from soc.fu.shift_rot.main_stage import ShiftRotMainStage
 from soc.fu.alu.output_stage import ALUOutputStage
+from soc.fu.mul.pre_stage import MulMainStage1
+from soc.fu.mul.main_stage import MulMainStage2
+from soc.fu.mul.post_stage import MulMainStage3
+
 
 class MulStages1(PipeModBaseChain):
     def get_chain(self):
-        inp = ALUInputStage(self.pspec)
-        main = MulMainStage1(self.pspec)
+        inp = ALUInputStage(self.pspec)   # a-invert, carry etc
+        main = MulMainStage1(self.pspec)  # detect signed/32-bit
         return [inp, main]
 
+
 class MulStages2(PipeModBaseChain):
     def get_chain(self):
-        main2 = MulMainStage2(self.pspec)
-        out = ALUOutputStage(self.pspec)
-        return [main2, out]
+        main2 = MulMainStage2(self.pspec) # actual multiply
+        return [main2]
+
+
+class MulStages3(PipeModBaseChain):
+    def get_chain(self):
+        main3 = MulMainStage3(self.pspec) # select output bits, invert, set ov
+        out = ALUOutputStage(self.pspec)  # do CR, XER and out-invert etc.
+        return [main3, out]
 
 
 class ShiftRotBasePipe(ControlBase):
@@ -23,6 +32,7 @@ class ShiftRotBasePipe(ControlBase):
         self.pspec = pspec
         self.pipe1 = MulStages1(pspec)
         self.pipe2 = MulStages2(pspec)
+        self.pipe2 = MulStages3(pspec)
         self._eqs = self.connect([self.pipe1, self.pipe2])
 
     def elaborate(self, platform):
diff --git a/src/soc/fu/mul/post_stage.py b/src/soc/fu/mul/post_stage.py
new file mode 100644
index 00000000..501b4ed5
--- /dev/null
+++ b/src/soc/fu/mul/post_stage.py
@@ -0,0 +1,81 @@
+# This stage is intended to do most of the work of analysing the multiply result
+
+from nmigen import (Module, Signal, Cat, Repl, Mux, signed)
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.alu.pipe_data import ALUOutputData
+from soc.fu.mul.pipe_data import MulOutputData
+from ieee754.part.partsig import PartitionedSignal
+from soc.decoder.power_enums import InternalOp
+
+
+class MulMainStage3(PipeModBase):
+    """Final multiply stage: restore sign, select result bits, set OV."""
+    def __init__(self, pspec):
+        super().__init__(pspec, "mul3")
+
+    def ispec(self):
+        return MulOutputData(self.pspec) # pipeline stage input format
+
+    def ospec(self):
+        return ALUOutputData(self.pspec) # defines pipeline stage output format
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # convenience variables
+        o, ov_o = self.o.o, self.o.xer_ov
+        o_i, op = self.i.o, self.i.ctx.op
+
+        # check if op is 32-bit
+        is_32bit = Signal(reset_less=True)
+        comb += is_32bit.eq(op.is_32bit)
+
+        # check negate: select signed/unsigned.  widths come from the input
+        # product, as self.o.o is a data/ok pair (see o.data / o.ok below)
+        o_s = Signal(signed(len(o_i)), reset_less=True)
+        mul_o = Signal(len(o_i), reset_less=True)
+        comb += o_s.eq(-o_i)
+        comb += mul_o.eq(Mux(self.i.neg_res, o_s, o_i))
+        comb += o.ok.eq(1)
+
+        with m.Switch(op.insn_type):
+            # hi-32 replicated twice
+            with m.Case(InternalOp.OP_MUL_H32):
+                comb += o.data.eq(Repl(mul_o[32:64], 2))
+            # hi-64
+            with m.Case(InternalOp.OP_MUL_H64):
+                comb += o.data.eq(mul_o[64:128])
+            # lo-64 - overflow
+            with m.Default():
+                comb += o.data.eq(mul_o[0:64])
+
+                # compute overflow: the result fits only if every bit from
+                # the sign bit of the low half upwards is identical, i.e.
+                # the top half is the sign-extension of the bottom half
+                mul_ov = Signal(reset_less=True)
+                with m.If(is_32bit):
+                    m31 = mul_o[31:64] # bits 31 to 63 inclusive
+                    comb += mul_ov.eq(m31.bool() & ~m31.all())
+                with m.Else():
+                    m63 = mul_o[63:128] # bits 63 to 127 inclusive
+                    comb += mul_ov.eq(m63.bool() & ~m63.all())
+
+                # 32-bit (ov[1]) and 64-bit (ov[0]) overflow
+                ov = Signal(2, reset_less=True)
+                comb += ov[0].eq(mul_ov)
+                comb += ov[1].eq(mul_ov)
+                comb += ov_o.data.eq(ov)
+                comb += ov_o.ok.eq(1)
+
+        # XER.CA / CA32 are not driven by this stage: it has no adder result
+        # or a/b operands from which a carry could be derived.
+        # NOTE(review): confirm multiply ops never need to write XER.CA
+
+        ###### sticky overflow and context, both pass-through #####
+
+        comb += self.o.xer_so.data.eq(self.i.xer_so)
+        comb += self.o.ctx.eq(self.i.ctx)
+
+        return m
+
diff --git a/src/soc/fu/mul/pre_stage.py b/src/soc/fu/mul/pre_stage.py
new file mode 100644
index 00000000..ff1e3220
--- /dev/null
+++ b/src/soc/fu/mul/pre_stage.py
@@ -0,0 +1,53 @@
+# This stage prepares the multiply operands: result sign, absolute values
+from nmigen import (Module, Signal, Mux)
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.alu.pipe_data import ALUInputData
+from soc.fu.mul.pipe_data import MulIntermediateData
+from ieee754.part.partsig import PartitionedSignal
+
+
+class MulMainStage1(PipeModBase):
+    """First multiply stage: work out result sign, make operands positive."""
+    def __init__(self, pspec):
+        super().__init__(pspec, "mul1")
+
+    def ispec(self):
+        return ALUInputData(self.pspec) # defines pipeline stage input format
+
+    def ospec(self):
+        return MulIntermediateData(self.pspec) # pipeline stage output format
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # convenience variables
+        a, b, op = self.i.a, self.i.b, self.i.ctx.op
+        a_o, b_o, neg_res_o = self.o.a, self.o.b, self.o.neg_res
+
+        # check if op is 32-bit, and set up the operand sign bits
+        is_32bit = Signal(reset_less=True)
+        sign_a = Signal(reset_less=True)
+        sign_b = Signal(reset_less=True)
+        comb += is_32bit.eq(op.is_32bit)
+
+        # work out if a/b are negative (check 32-bit / signed)
+        comb += sign_a.eq(Mux(is_32bit, a[31], a[63]) & op.is_signed)
+        comb += sign_b.eq(Mux(is_32bit, b[31], b[63]) & op.is_signed)
+
+        # work out if result is negative sign
+        comb += neg_res_o.eq(sign_a ^ sign_b)
+
+        # negation of a 64-bit value produces the same lower 32-bit
+        # result as negation of just the lower 32-bits, so we don't
+        # need to do anything special before negating
+        comb += a_o.eq(Mux(sign_a, -a, a))
+        comb += b_o.eq(Mux(sign_b, -b, b))
+
+        ###### XER and context, both pass-through #####
+        comb += self.o.xer_ca.data.eq(self.i.xer_ca)
+        comb += self.o.xer_so.data.eq(self.i.xer_so)
+        comb += self.o.ctx.eq(self.i.ctx)
+
+        return m
+
-- 
2.30.2