From c25d3e23de63d0cab676a76a1bb3e497640cc2a6 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Thu, 9 Jul 2020 11:49:51 +0100
Subject: [PATCH] remove xer_ca from DIV pipeline (took a bit of messing about)

---
 src/soc/fu/div/core_stages.py           |   6 +-
 src/soc/fu/div/input_stage.py           |   3 +
 src/soc/fu/div/output_stage.py          | 126 ++++++++++++++++++++++--
 src/soc/fu/div/pipeline.py              |  10 +-
 src/soc/fu/div/setup_stage.py           |   1 -
 src/soc/fu/div/test/test_pipe_caller.py |   5 -
 src/soc/fu/mul/output_stage.py          |  12 +++
 src/soc/fu/mul/pipeline.py              |   2 +-
 src/soc/fu/pipe_data.py                 |   7 +-
 9 files changed, 147 insertions(+), 25 deletions(-)
 create mode 100644 src/soc/fu/mul/output_stage.py

diff --git a/src/soc/fu/div/core_stages.py b/src/soc/fu/div/core_stages.py
index 3bbde7db..fdbe8659 100644
--- a/src/soc/fu/div/core_stages.py
+++ b/src/soc/fu/div/core_stages.py
@@ -3,14 +3,14 @@
 
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
-from soc.fu.logical.pipe_data import LogicalInputData
-from soc.fu.alu.pipe_data import ALUOutputData
 from ieee754.part.partsig import PartitionedSignal
 from soc.decoder.power_enums import InternalOp
 
 from soc.decoder.power_fields import DecodeFields
 from soc.decoder.power_fieldsn import SignalBitRange
-from soc.fu.div.pipe_data import CoreInputData, CoreInterstageData, CoreOutputData
+from soc.fu.div.pipe_data import (CoreInputData,
+                                  CoreInterstageData,
+                                  CoreOutputData)
 from ieee754.div_rem_sqrt_rsqrt.core import (DivPipeCoreSetupStage,
                                              DivPipeCoreCalculateStage,
                                              DivPipeCoreFinalStage)
diff --git a/src/soc/fu/div/input_stage.py b/src/soc/fu/div/input_stage.py
index 0849aded..a9ad6652 100644
--- a/src/soc/fu/div/input_stage.py
+++ b/src/soc/fu/div/input_stage.py
@@ -7,6 +7,9 @@ from soc.fu.div.pipe_data import DIVInputData
 
 # simply over-ride ALUInputStage ispec / ospec
 class DivMulInputStage(ALUInputStage):
+    def __init__(self, pspec):
+        super().__init__(pspec)
+
     def ispec(self): return DIVInputData(self.pspec)
     def ospec(self): return DIVInputData(self.pspec)
 
diff --git a/src/soc/fu/div/output_stage.py b/src/soc/fu/div/output_stage.py
index 67848dbe..9eb16f6c 100644
--- a/src/soc/fu/div/output_stage.py
+++ b/src/soc/fu/div/output_stage.py
@@ -1,12 +1,120 @@
-# This stage is intended to adjust the input data before sending it to
-# the actual ALU. Things like handling inverting the input, xer_ca
-# generation for subtraction, and handling of immediates should happen
-# in the base class (CommonOutputStage.elaborate).
-from soc.fu.alu.output_stage import ALUOutputStage
+# This stage is the output stage that converts the results produced
+# by DivPipeCore into the final DIV/MOD result and XER overflow flag
+
+from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.logical.pipe_data import LogicalInputData
 from soc.fu.div.pipe_data import DivMulOutputData
+from ieee754.part.partsig import PartitionedSignal
+from soc.decoder.power_enums import InternalOp
+
+from soc.decoder.power_fields import DecodeFields
+from soc.decoder.power_fieldsn import SignalBitRange
+from soc.fu.div.pipe_data import CoreOutputData
+
+
+class DivOutputStage(PipeModBase):
+    def __init__(self, pspec):
+        super().__init__(pspec, "output_stage")
+        self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
+        self.fields.create_specs()
+        self.quotient_neg = Signal()
+        self.remainder_neg = Signal()
+        self.quotient_64 = Signal(64)
+        self.remainder_64 = Signal(64)
+
+    def ispec(self):
+        return CoreOutputData(self.pspec)
+
+    def ospec(self):
+        return DivMulOutputData(self.pspec)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        op = self.i.ctx.op
+        abs_quotient = self.i.core.quotient_root
+        fract_width = self.pspec.core_config.fract_width
+        # fract width of `DivPipeCoreOutputData.remainder`
+        remainder_fract_width = fract_width * 3
+        # fract width of `DivPipeCoreInputData.dividend`
+        dividend_fract_width = fract_width * 2
+        rem_start = remainder_fract_width - dividend_fract_width
+        abs_remainder = self.i.core.remainder[rem_start:rem_start+64]
+        dividend_neg = self.i.dividend_neg
+        divisor_neg = self.i.divisor_neg
+        quotient_64 = self.quotient_64
+        remainder_64 = self.remainder_64
+
+        comb += self.quotient_neg.eq(dividend_neg ^ divisor_neg)
+        # follows rules for truncating division
+        comb += self.remainder_neg.eq(dividend_neg)
+
+        # negation of a 64-bit value produces the same lower 32-bit
+        # result as negation of just the lower 32-bits, so we don't
+        # need to do anything special before negating
+        comb += [
+            quotient_64.eq(Mux(self.quotient_neg,
+                               -abs_quotient, abs_quotient)),
+            remainder_64.eq(Mux(self.remainder_neg,
+                                -abs_remainder, abs_remainder))
+        ]
+
+        xer_ov = self.o.xer_ov.data
+
+        def calc_overflow(dive_abs_overflow, sign_bit_mask):
+            nonlocal comb
+            overflow = dive_abs_overflow | self.i.div_by_zero
+            with m.If(op.is_signed):
+                comb += xer_ov.eq(overflow
+                                  | (abs_quotient > sign_bit_mask)
+                                  | ((abs_quotient == sign_bit_mask)
+                                     & ~self.quotient_neg))
+            with m.Else():
+                comb += xer_ov.eq(overflow)
+
+        with m.If(op.is_32bit):
+            calc_overflow(self.i.dive_abs_ov32, 0x80000000)
+        with m.Else():
+            calc_overflow(self.i.dive_abs_ov64, 0x8000000000000000)
+
+        ##########################
+        # main switch for DIV
+
+        o = self.o.o.data
+
+        with m.Switch(op.insn_type):
+            with m.Case(InternalOp.OP_DIVE):
+                with m.If(op.is_32bit):
+                    with m.If(op.is_signed):
+                        # matches POWER9's divweo behavior
+                        comb += o.eq(quotient_64[0:32].as_unsigned())
+                    with m.Else():
+                        comb += o.eq(quotient_64[0:32].as_unsigned())
+                with m.Else():
+                    comb += o.eq(quotient_64)
+            with m.Case(InternalOp.OP_DIV):
+                with m.If(op.is_32bit):
+                    with m.If(op.is_signed):
+                        # matches POWER9's divwo behavior
+                        comb += o.eq(quotient_64[0:32].as_unsigned())
+                    with m.Else():
+                        comb += o.eq(quotient_64[0:32].as_unsigned())
+                with m.Else():
+                    comb += o.eq(quotient_64)
+            with m.Case(InternalOp.OP_MOD):
+                with m.If(op.is_32bit):
+                    with m.If(op.is_signed):
+                        # matches POWER9's modsw behavior
+                        comb += o.eq(remainder_64[0:32].as_signed())
+                    with m.Else():
+                        comb += o.eq(remainder_64[0:32].as_unsigned())
+                with m.Else():
+                    comb += o.eq(remainder_64)
+
+        ###### sticky overflow and context, both pass-through #####
 
-# simply over-ride ALUOutputStage ispec / ospec
-class DivMulOutputStage(ALUOutputStage):
-    def ispec(self): return DivMulOutputData(self.pspec)
-    def ospec(self): return DivMulOutputData(self.pspec)
+        comb += self.o.xer_so.data.eq(self.i.xer_so)
+        comb += self.o.ctx.eq(self.i.ctx)
 
+        return m
diff --git a/src/soc/fu/div/pipeline.py b/src/soc/fu/div/pipeline.py
index a7355dd5..d72083e5 100644
--- a/src/soc/fu/div/pipeline.py
+++ b/src/soc/fu/div/pipeline.py
@@ -1,16 +1,16 @@
 from nmutil.singlepipe import ControlBase
 from nmutil.pipemodbase import PipeModBaseChain
-from soc.fu.alu.input_stage import ALUInputStage
-from soc.fu.alu.output_stage import ALUOutputStage
+from soc.fu.mul.output_stage import DivMulOutputStage
+from soc.fu.div.input_stage import DivMulInputStage
+from soc.fu.div.output_stage import DivOutputStage
 from soc.fu.div.setup_stage import DivSetupStage
 from soc.fu.div.core_stages import (DivCoreSetupStage, DivCoreCalculateStage,
                                     DivCoreFinalStage)
-from soc.fu.div.output_stage import DivOutputStage
 
 
 class DivStagesStart(PipeModBaseChain):
     def get_chain(self):
-        alu_input = ALUInputStage(self.pspec)
+        alu_input = DivMulInputStage(self.pspec)
         div_setup = DivSetupStage(self.pspec)
         core_setup = DivCoreSetupStage(self.pspec)
         return [alu_input, div_setup, core_setup]
@@ -33,7 +33,7 @@ class DivStagesEnd(PipeModBaseChain):
     def get_chain(self):
         core_final = DivCoreFinalStage(self.pspec)
         div_out = DivOutputStage(self.pspec)
-        alu_out = ALUOutputStage(self.pspec)
+        alu_out = DivMulOutputStage(self.pspec)
         return [core_final, div_out, alu_out]
 
 
diff --git a/src/soc/fu/div/setup_stage.py b/src/soc/fu/div/setup_stage.py
index 9b0455be..25daa201 100644
--- a/src/soc/fu/div/setup_stage.py
+++ b/src/soc/fu/div/setup_stage.py
@@ -4,7 +4,6 @@
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
 from soc.fu.div.pipe_data import DIVInputData
-from soc.fu.alu.pipe_data import ALUOutputData
 from ieee754.part.partsig import PartitionedSignal
 from soc.decoder.power_enums import InternalOp
 
diff --git a/src/soc/fu/div/test/test_pipe_caller.py b/src/soc/fu/div/test/test_pipe_caller.py
index 8ae19d5a..fb6a4025 100644
--- a/src/soc/fu/div/test/test_pipe_caller.py
+++ b/src/soc/fu/div/test/test_pipe_caller.py
@@ -25,7 +25,6 @@ def get_cu_inputs(dec2, sim):
 
     yield from ALUHelpers.get_sim_int_ra(res, sim, dec2) # RA
     yield from ALUHelpers.get_sim_int_rb(res, sim, dec2) # RB
-    yield from ALUHelpers.get_rd_sim_xer_ca(res, sim, dec2) # XER.ca
     yield from ALUHelpers.get_sim_xer_so(res, sim, dec2) # XER.so
 
     print ("alu get_cu_inputs", res)
@@ -43,7 +42,6 @@ def set_alu_inputs(alu, dec2, sim):
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
     yield from ALUHelpers.set_int_rb(alu, dec2, inp)
 
-    yield from ALUHelpers.set_xer_ca(alu, dec2, inp)
     yield from ALUHelpers.set_xer_so(alu, dec2, inp)
 
 
@@ -190,19 +188,16 @@ class TestRunner(FHDLTestCase):
 
         yield from ALUHelpers.get_cr_a(res, alu, dec2)
         yield from ALUHelpers.get_xer_ov(res, alu, dec2)
-        yield from ALUHelpers.get_xer_ca(res, alu, dec2)
         yield from ALUHelpers.get_int_o(res, alu, dec2)
         yield from ALUHelpers.get_xer_so(res, alu, dec2)
 
         yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
         yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
-        yield from ALUHelpers.get_wr_sim_xer_ca(sim_o, sim, dec2)
         yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
 
         ALUHelpers.check_cr_a(self, res, sim_o, "CR%d %s" % (cridx, code))
         ALUHelpers.check_xer_ov(self, res, sim_o, code)
-        ALUHelpers.check_xer_ca(self, res, sim_o, code)
         ALUHelpers.check_int_o(self, res, sim_o, code)
         ALUHelpers.check_xer_so(self, res, sim_o, code)
 
diff --git a/src/soc/fu/mul/output_stage.py b/src/soc/fu/mul/output_stage.py
new file mode 100644
index 00000000..67848dbe
--- /dev/null
+++ b/src/soc/fu/mul/output_stage.py
@@ -0,0 +1,12 @@
+# This stage post-processes the output data of the DIV and MUL pipelines.
+# No logic is added here: the behaviour is inherited unchanged from
+# ALUOutputStage (presumably CommonOutputStage.elaborate - TODO confirm),
+# and only the ispec / ospec data formats are over-ridden.
+from soc.fu.alu.output_stage import ALUOutputStage
+from soc.fu.div.pipe_data import DivMulOutputData
+
+# simply over-ride ALUOutputStage ispec / ospec
+class DivMulOutputStage(ALUOutputStage):
+    def ispec(self): return DivMulOutputData(self.pspec)
+    def ospec(self): return DivMulOutputData(self.pspec)
+
diff --git a/src/soc/fu/mul/pipeline.py b/src/soc/fu/mul/pipeline.py
index 3816435d..f5a0f069 100644
--- a/src/soc/fu/mul/pipeline.py
+++ b/src/soc/fu/mul/pipeline.py
@@ -1,7 +1,7 @@
 from nmutil.singlepipe import ControlBase
 from nmutil.pipemodbase import PipeModBaseChain
 from soc.fu.div.input_stage import DivMulInputStage
-from soc.fu.div.output_stage import DivMulOutputStage
+from soc.fu.mul.output_stage import DivMulOutputStage
 from soc.fu.mul.pre_stage import MulMainStage1
 from soc.fu.mul.main_stage import MulMainStage2
 from soc.fu.mul.post_stage import MulMainStage3
diff --git a/src/soc/fu/pipe_data.py b/src/soc/fu/pipe_data.py
index 4201d400..7d6ac539 100644
--- a/src/soc/fu/pipe_data.py
+++ b/src/soc/fu/pipe_data.py
@@ -26,8 +26,13 @@ class IntegerData:
 
     def eq(self, i):
         eqs = [self.ctx.eq(i.ctx)]
+        assert len(self.data) == len(i.data), \
+               "length of %s mismatch against %s: %s %s" % \
+                   (repr(self), repr(i), repr(self.data), repr(i.data))
         for j in range(len(self.data)):
-            assert type(self.data[j]) == type(i.data[j])
+            assert type(self.data[j]) == type(i.data[j]), \
+                   "type mismatch in IntegerData %s %s" % \
+                   (repr(self.data[j]), repr(i.data[j]))
             eqs.append(self.data[j].eq(i.data[j]))
         return eqs
 
-- 
2.30.2