From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Thu, 9 Jul 2020 09:52:46 +0000 (+0100)
Subject: add new stages etc. to get multiply working without xer_ca
X-Git-Tag: div_pipeline~140
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=512e2d72912ba57913ab1b1297a085d5fae67181;p=soc.git

add new stages etc. to get multiply working without xer_ca
---

diff --git a/src/soc/fu/div/input_stage.py b/src/soc/fu/div/input_stage.py
new file mode 100644
index 00000000..0849aded
--- /dev/null
+++ b/src/soc/fu/div/input_stage.py
@@ -0,0 +1,12 @@
+# This stage is intended to adjust the input data before sending it to
+# the actual ALU. Things like handling inverting the input, xer_ca
+# generation for subtraction, and handling of immediates should happen
+# in the base class (CommonInputStage.elaborate).
+from soc.fu.alu.input_stage import ALUInputStage
+from soc.fu.div.pipe_data import DIVInputData
+
+# simply over-ride ALUInputStage ispec / ospec
+class DivMulInputStage(ALUInputStage):
+    def ispec(self): return DIVInputData(self.pspec)
+    def ospec(self): return DIVInputData(self.pspec)
+
diff --git a/src/soc/fu/div/output_stage.py b/src/soc/fu/div/output_stage.py
index 1db0bbb2..67848dbe 100644
--- a/src/soc/fu/div/output_stage.py
+++ b/src/soc/fu/div/output_stage.py
@@ -1,120 +1,12 @@
-# This stage is the setup stage that converts the inputs
-# into the values expected by DivPipeCore
+# This stage is intended to adjust the output data after it leaves the
+# main pipeline stages. Things like CR0 generation, XER flag handling
+# and inverting the output should happen in the base class
+# (CommonOutputStage.elaborate).
+from soc.fu.alu.output_stage import ALUOutputStage
+from soc.fu.div.pipe_data import DivMulOutputData
+
+# simply over-ride ALUOutputStage ispec / ospec
+class DivMulOutputStage(ALUOutputStage):
+    def ispec(self): return DivMulOutputData(self.pspec)
+    def ospec(self): return DivMulOutputData(self.pspec)
 
-from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
-from nmutil.pipemodbase import PipeModBase
-from soc.fu.logical.pipe_data import LogicalInputData
-from soc.fu.alu.pipe_data import ALUOutputData
-from ieee754.part.partsig import PartitionedSignal
-from soc.decoder.power_enums import InternalOp
-
-from soc.decoder.power_fields import DecodeFields
-from soc.decoder.power_fieldsn import SignalBitRange
-from soc.fu.div.pipe_data import CoreOutputData
-
-
-class DivOutputStage(PipeModBase):
-    def __init__(self, pspec):
-        super().__init__(pspec, "output_stage")
-        self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
-        self.fields.create_specs()
-        self.quotient_neg = Signal()
-        self.remainder_neg = Signal()
-        self.quotient_64 = Signal(64)
-        self.remainder_64 = Signal(64)
-
-    def ispec(self):
-        return CoreOutputData(self.pspec)
-
-    def ospec(self):
-        return ALUOutputData(self.pspec)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        op = self.i.ctx.op
-        abs_quotient = self.i.core.quotient_root
-        fract_width = self.pspec.core_config.fract_width
-        # fract width of `DivPipeCoreOutputData.remainder`
-        remainder_fract_width = fract_width * 3
-        # fract width of `DivPipeCoreInputData.dividend`
-        dividend_fract_width = fract_width * 2
-        rem_start = remainder_fract_width - dividend_fract_width
-        abs_remainder = self.i.core.remainder[rem_start:rem_start+64]
-        dividend_neg = self.i.dividend_neg
-        divisor_neg = self.i.divisor_neg
-        quotient_64 = self.quotient_64
-        remainder_64 = self.remainder_64
-
-        comb += self.quotient_neg.eq(dividend_neg ^ divisor_neg)
-        # follows rules for truncating division
-        comb += self.remainder_neg.eq(dividend_neg)
-
-        # negation of a 64-bit value produces the same lower 32-bit
-        # result as negation of just the lower 32-bits, so we don't
-        # need to do anything special before negating
-        comb += [
-            quotient_64.eq(Mux(self.quotient_neg,
-                               -abs_quotient, abs_quotient)),
-            remainder_64.eq(Mux(self.remainder_neg,
-                                -abs_remainder, abs_remainder))
-        ]
-
-        xer_ov = self.o.xer_ov.data
-
-        def calc_overflow(dive_abs_overflow, sign_bit_mask):
-            nonlocal comb
-            overflow = dive_abs_overflow | self.i.div_by_zero
-            with m.If(op.is_signed):
-                comb += xer_ov.eq(overflow
-                                  | (abs_quotient > sign_bit_mask)
-                                  | ((abs_quotient == sign_bit_mask)
-                                     & ~self.quotient_neg))
-            with m.Else():
-                comb += xer_ov.eq(overflow)
-
-        with m.If(op.is_32bit):
-            calc_overflow(self.i.dive_abs_ov32, 0x80000000)
-        with m.Else():
-            calc_overflow(self.i.dive_abs_ov64, 0x8000000000000000)
-
-        ##########################
-        # main switch for DIV
-
-        o = self.o.o.data
-
-        with m.Switch(op.insn_type):
-            with m.Case(InternalOp.OP_DIVE):
-                with m.If(op.is_32bit):
-                    with m.If(op.is_signed):
-                        # matches POWER9's divweo behavior
-                        comb += o.eq(quotient_64[0:32].as_unsigned())
-                    with m.Else():
-                        comb += o.eq(quotient_64[0:32].as_unsigned())
-                with m.Else():
-                    comb += o.eq(quotient_64)
-            with m.Case(InternalOp.OP_DIV):
-                with m.If(op.is_32bit):
-                    with m.If(op.is_signed):
-                        # matches POWER9's divwo behavior
-                        comb += o.eq(quotient_64[0:32].as_unsigned())
-                    with m.Else():
-                        comb += o.eq(quotient_64[0:32].as_unsigned())
-                with m.Else():
-                    comb += o.eq(quotient_64)
-            with m.Case(InternalOp.OP_MOD):
-                with m.If(op.is_32bit):
-                    with m.If(op.is_signed):
-                        # matches POWER9's modsw behavior
-                        comb += o.eq(remainder_64[0:32].as_signed())
-                    with m.Else():
-                        comb += o.eq(remainder_64[0:32].as_unsigned())
-                with m.Else():
-                    comb += o.eq(remainder_64)
-
-        ###### sticky overflow and context, both pass-through #####
-
-        comb += self.o.xer_so.data.eq(self.i.xer_so)
-        comb += self.o.ctx.eq(self.i.ctx)
-
-        return m
diff --git a/src/soc/fu/mul/main_stage.py b/src/soc/fu/mul/main_stage.py
index ccdd0d35..3d620367 100644
--- a/src/soc/fu/mul/main_stage.py
+++ b/src/soc/fu/mul/main_stage.py
@@ -28,7 +28,6 @@ class MulMainStage2(PipeModBase):
 
         ###### xer and context, all pass-through #####
 
-        comb += self.o.xer_ca.eq(self.i.xer_ca)
         comb += self.o.neg_res.eq(self.i.neg_res)
         comb += self.o.neg_res32.eq(self.i.neg_res32)
         comb += self.o.xer_so.eq(self.i.xer_so)
diff --git a/src/soc/fu/mul/mul_input_record.py b/src/soc/fu/mul/mul_input_record.py
index 8554c536..51e7352e 100644
--- a/src/soc/fu/mul/mul_input_record.py
+++ b/src/soc/fu/mul/mul_input_record.py
@@ -20,8 +20,6 @@ class CompMULOpSubset(Record):
                   ('zero_a', 1),
                   ('invert_out', 1),
                   ('write_cr0', 1),
-                  ('input_carry', CryIn),
-                  ('output_carry', 1),
                   ('is_32bit', 1),
                   ('is_signed', 1),
                   ('insn', 32),
@@ -35,8 +33,6 @@ class CompMULOpSubset(Record):
         self.zero_a.reset_less = True
         self.invert_a.reset_less = True
         self.invert_out.reset_less = True
-        self.input_carry.reset_less = True
-        self.output_carry.reset_less = True
         self.is_32bit.reset_less = True
         self.is_signed.reset_less = True
 
@@ -53,8 +49,6 @@ class CompMULOpSubset(Record):
         return [self.insn_type,
                 self.invert_a,
                 self.invert_out,
-                self.input_carry,
-                self.output_carry,
                 self.is_32bit,
                 self.is_signed,
         ]
diff --git a/src/soc/fu/mul/pipe_data.py b/src/soc/fu/mul/pipe_data.py
index 38741f61..eef6cd83 100644
--- a/src/soc/fu/mul/pipe_data.py
+++ b/src/soc/fu/mul/pipe_data.py
@@ -1,10 +1,10 @@
 from soc.fu.mul.mul_input_record import CompMULOpSubset
 from soc.fu.pipe_data import IntegerData, CommonPipeSpec
-from soc.fu.alu.pipe_data import ALUOutputData, ALUInputData
+from soc.fu.div.pipe_data import DIVInputData, DivMulOutputData
 from nmigen import Signal
 
 
-class MulIntermediateData(ALUInputData):
+class MulIntermediateData(DIVInputData):
     def __init__(self, pspec):
         super().__init__(pspec)
 
@@ -28,5 +28,5 @@ class MulOutputData(IntegerData):
 
 
 class MulPipeSpec(CommonPipeSpec):
-    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
+    regspec = (DIVInputData.regspec, DivMulOutputData.regspec)
     opsubsetkls = CompMULOpSubset
diff --git a/src/soc/fu/mul/pipeline.py b/src/soc/fu/mul/pipeline.py
index a557c90e..3816435d 100644
--- a/src/soc/fu/mul/pipeline.py
+++ b/src/soc/fu/mul/pipeline.py
@@ -1,7 +1,7 @@
 from nmutil.singlepipe import ControlBase
 from nmutil.pipemodbase import PipeModBaseChain
-from soc.fu.alu.input_stage import ALUInputStage
-from soc.fu.alu.output_stage import ALUOutputStage
+from soc.fu.div.input_stage import DivMulInputStage
+from soc.fu.div.output_stage import DivMulOutputStage
 from soc.fu.mul.pre_stage import MulMainStage1
 from soc.fu.mul.main_stage import MulMainStage2
 from soc.fu.mul.post_stage import MulMainStage3
@@ -9,7 +9,7 @@ from soc.fu.mul.post_stage import MulMainStage3
 
 class MulStages1(PipeModBaseChain):
     def get_chain(self):
-        inp = ALUInputStage(self.pspec)   # a-invert, carry etc
+        inp = DivMulInputStage(self.pspec)   # a-invert (no carry)
         main = MulMainStage1(self.pspec)  # detect signed/32-bit
         return [inp, main]
 
@@ -23,7 +23,7 @@ class MulStages2(PipeModBaseChain):
 class MulStages3(PipeModBaseChain):
     def get_chain(self):
         main3 = MulMainStage3(self.pspec) # select output bits, invert, set ov
-        out = ALUOutputStage(self.pspec)  # do CR, XER and out-invert etc.
+        out = DivMulOutputStage(self.pspec)  # do CR, XER and out-invert etc.
         return [main3, out]
 
 
diff --git a/src/soc/fu/mul/post_stage.py b/src/soc/fu/mul/post_stage.py
index bdee2ec5..b200aa8f 100644
--- a/src/soc/fu/mul/post_stage.py
+++ b/src/soc/fu/mul/post_stage.py
@@ -2,7 +2,7 @@
 
 from nmigen import (Module, Signal, Cat, Repl, Mux, signed)
 from nmutil.pipemodbase import PipeModBase
-from soc.fu.alu.pipe_data import ALUOutputData
+from soc.fu.div.pipe_data import DivMulOutputData
 from soc.fu.mul.pipe_data import MulOutputData
 from ieee754.part.partsig import PartitionedSignal
 from soc.decoder.power_enums import InternalOp
@@ -16,16 +16,15 @@ class MulMainStage3(PipeModBase):
         return MulOutputData(self.pspec) # pipeline stage output format
 
     def ospec(self):
-        return ALUOutputData(self.pspec) # defines pipeline stage output format
+        return DivMulOutputData(self.pspec) # defines stage output format
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
 
         # convenience variables
-        cry_o, o, cr0 = self.o.xer_ca, self.o.o, self.o.cr0
-        ov_o = self.o.xer_ov
-        o_i, cry_i, op = self.i.o, self.i.xer_ca, self.i.ctx.op
+        o, cr0 = self.o.o, self.o.cr0
+        ov_o, o_i, op = self.o.xer_ov, self.i.o, self.i.ctx.op
 
         # check if op is 32-bit, and get sign bit from operand a
         is_32bit = Signal(reset_less=True)
@@ -64,13 +63,6 @@ class MulMainStage3(PipeModBase):
                 comb += ov_o.data.eq(ov)
                 comb += ov_o.ok.eq(1)
 
-        # https://bugs.libre-soc.org/show_bug.cgi?id=319#c5
-        ca = Signal(2, reset_less=True)
-        comb += ca[0].eq(mul_o[-1])                      # XER.CA - XXX more?
-        comb += ca[1].eq(mul_o[32] ^ (self.i.neg_res32)) # XER.CA32
-        comb += cry_o.data.eq(ca)
-        comb += cry_o.ok.eq(1)
-
         ###### sticky overflow and context, both pass-through #####
 
         comb += self.o.xer_so.data.eq(self.i.xer_so)
diff --git a/src/soc/fu/mul/pre_stage.py b/src/soc/fu/mul/pre_stage.py
index 84363090..94563874 100644
--- a/src/soc/fu/mul/pre_stage.py
+++ b/src/soc/fu/mul/pre_stage.py
@@ -2,7 +2,7 @@
 
 from nmigen import (Module, Signal, Mux)
 from nmutil.pipemodbase import PipeModBase
-from soc.fu.alu.pipe_data import ALUInputData
+from soc.fu.div.pipe_data import DIVInputData
 from soc.fu.mul.pipe_data import MulIntermediateData
 from ieee754.part.partsig import PartitionedSignal
 from nmutil.util import eq32
@@ -12,7 +12,7 @@ class MulMainStage1(PipeModBase):
         super().__init__(pspec, "mul1")
 
     def ispec(self):
-        return ALUInputData(self.pspec) # defines pipeline stage input format
+        return DIVInputData(self.pspec) # defines pipeline stage input format
 
     def ospec(self):
         return MulIntermediateData(self.pspec) # pipeline stage output format
@@ -58,7 +58,6 @@ class MulMainStage1(PipeModBase):
 
         ###### XER and context, both pass-through #####
 
-        comb += self.o.xer_ca.eq(self.i.xer_ca)
         comb += self.o.xer_so.eq(self.i.xer_so)
         comb += self.o.ctx.eq(self.i.ctx)
 
diff --git a/src/soc/fu/mul/test/test_pipe_caller.py b/src/soc/fu/mul/test/test_pipe_caller.py
index cd93e129..cda81076 100644
--- a/src/soc/fu/mul/test/test_pipe_caller.py
+++ b/src/soc/fu/mul/test/test_pipe_caller.py
@@ -25,7 +25,6 @@ def get_cu_inputs(dec2, sim):
 
     yield from ALUHelpers.get_sim_int_ra(res, sim, dec2) # RA
     yield from ALUHelpers.get_sim_int_rb(res, sim, dec2) # RB
-    yield from ALUHelpers.get_rd_sim_xer_ca(res, sim, dec2) # XER.ca
     yield from ALUHelpers.get_sim_xer_so(res, sim, dec2) # XER.so
 
     print ("alu get_cu_inputs", res)
@@ -44,7 +43,6 @@ def set_alu_inputs(alu, dec2, sim):
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
     yield from ALUHelpers.set_int_rb(alu, dec2, inp)
 
-    yield from ALUHelpers.set_xer_ca(alu, dec2, inp)
     yield from ALUHelpers.set_xer_so(alu, dec2, inp)
 
 
@@ -245,19 +243,16 @@ class TestRunner(FHDLTestCase):
 
         yield from ALUHelpers.get_cr_a(res, alu, dec2)
         yield from ALUHelpers.get_xer_ov(res, alu, dec2)
-        yield from ALUHelpers.get_xer_ca(res, alu, dec2)
         yield from ALUHelpers.get_int_o(res, alu, dec2)
         yield from ALUHelpers.get_xer_so(res, alu, dec2)
 
         yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
         yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
-        yield from ALUHelpers.get_wr_sim_xer_ca(sim_o, sim, dec2)
         yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
 
         ALUHelpers.check_int_o(self, res, sim_o, code)
         ALUHelpers.check_xer_ov(self, res, sim_o, code)
-        ALUHelpers.check_xer_ca(self, res, sim_o, code)
         ALUHelpers.check_xer_so(self, res, sim_o, code)
         ALUHelpers.check_cr_a(self, res, sim_o, "CR%d %s" % (cridx, code))