From 17bae1e0a1697afa6da10b1be28c69b27f4ef47b Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Sun, 28 Jul 2019 13:14:19 +0100
Subject: [PATCH] get fpdiv/fsqrt/frsqrt up and running

---
 src/ieee754/fpdiv/div0.py     | 136 ++++++++++++++++------------------
 src/ieee754/fpdiv/div2.py     |  89 +++++++++++++++-------
 src/ieee754/fpdiv/pipeline.py |  43 ++++++-----
 3 files changed, 146 insertions(+), 122 deletions(-)

diff --git a/src/ieee754/fpdiv/div0.py b/src/ieee754/fpdiv/div0.py
index c93ba723..1197c267 100644
--- a/src/ieee754/fpdiv/div0.py
+++ b/src/ieee754/fpdiv/div0.py
@@ -3,7 +3,7 @@
 Relevant bugreport: http://bugs.libre-riscv.org/show_bug.cgi?id=99
 """
 
-from nmigen import Module, Signal, Cat, Elaboratable, Const
+from nmigen import Module, Signal, Cat, Elaboratable, Const, Mux
 from nmigen.cli import main, verilog
 
 from ieee754.fpcommon.fpbase import (FPNumBaseRecord, Overflow)
@@ -11,6 +11,7 @@ from ieee754.fpcommon.fpbase import FPState
 from ieee754.fpcommon.denorm import FPSCData
 from ieee754.fpcommon.getop import FPPipeContext
 from ieee754.div_rem_sqrt_rsqrt.div_pipe import DivPipeInputData
+from ieee754.div_rem_sqrt_rsqrt.core import DivPipeCoreOperation as DPCOp
 
 
 class FPDivStage0Mod(Elaboratable):
@@ -50,84 +51,75 @@ class FPDivStage0Mod(Elaboratable):
         # it is PURELY the *ENTRY* point into the chain, performing
         # "preparation" work.
 
-        with m.If(~self.i.out_do_z):
-            # do conversion here, of both self.i.a and self.i.b,
-            # into DivPipeInputData dividend and divisor.
-
-            # XXX *sigh* magic constants...
-            if self.pspec.width == 16:
-                if self.pspec.log2_radix == 1:
-                    extra = 2
-                elif self.pspec.log2_radix == 3:
-                    extra = 2
-                else:
-                    extra = 3
-            elif self.pspec.width == 32:
-                if self.pspec.log2_radix == 1:
-                    extra = 3
-                else:
-                    extra = 4
-            elif self.pspec.width == 64:
-                if self.pspec.log2_radix == 1:
-                    extra = 2
-                elif self.pspec.log2_radix == 3:
-                    extra = 2
-                else:
-                    extra = 3
-
-            # the mantissas, having been de-normalised (and containing
-            # a "1" in the MSB) represent numbers in the range 0.5 to
-            # 0.9999999-recurring.  the min and max range of the
-            # result is therefore 0.4999999 (0.5/0.99999) and 1.9999998
-            # (0.99999/0.5).
+        # mantissas start in the range [1.0, 2.0)
+
+        is_div = Signal(reset_less=True)
+        need_exp_adj = Signal(reset_less=True)
+
+        # ``self.i.a.rmw`` fractional bits and 2 integer bits
+        adj_a_m_fract_width = self.i.a.rmw
+        adj_a_m = Signal(self.i.a.rmw + 2, reset_less=True)
+
+        adj_a_e = Signal((len(self.i.a.e), True), reset_less=True)
+
+        m.d.comb += [is_div.eq(self.i.ctx.op == int(DPCOp.UDivRem)),
+                     need_exp_adj.eq(~is_div & self.i.a.e[0]),
+                     adj_a_m.eq(self.i.a.m << need_exp_adj),
+                     adj_a_e.eq(self.i.a.e - need_exp_adj)]
+
+        # adj_a_m now in the range [1.0, 4.0) for sqrt/rsqrt
+        # and [1.0, 2.0) for div
+
+        dividend_fract_width = self.pspec.core_config.fract_width * 2
+        dividend = Signal(len(self.o.dividend),
+                          reset_less=True)
 
+        divr_rad_fract_width = self.pspec.core_config.fract_width
+        divr_rad = Signal(len(self.o.divisor_radicand),
+                          reset_less=True)
+
+        a_m_fract_width = self.i.a.rmw
+        b_m_fract_width = self.i.b.rmw
+
+        m.d.comb += [
+            dividend.eq(self.i.a.m << (
+                dividend_fract_width - a_m_fract_width)),
+            divr_rad.eq(Mux(is_div,
+                            self.i.b.m << (
+                                divr_rad_fract_width - b_m_fract_width),
+                            adj_a_m << (
+                                divr_rad_fract_width - adj_a_m_fract_width))),
+        ]
+
+        m.d.comb += [
+            self.o.dividend.eq(dividend),
+            self.o.divisor_radicand.eq(divr_rad),
+        ]
+
+        # set default since it's not always set; non-zero value for debugging
+        m.d.comb += self.o.operation.eq(1)
+
+        with m.If(~self.i.out_do_z):
             # DIV
-            with m.If(self.i.ctx.op == 0):
-                am0 = Signal(len(self.i.a.m)+1, reset_less=True)
-                bm0 = Signal(len(self.i.b.m)+1, reset_less=True)
-                m.d.comb += [
-                             am0.eq(Cat(self.i.a.m, 0)),
-                             bm0.eq(Cat(self.i.b.m, 0)),
-                            ]
-
-                # zero-extend the mantissas (room for sticky/round/guard)
-                # plus the extra MSB.
-                m.d.comb += [self.o.z.e.eq(self.i.a.e - self.i.b.e + 1),
+            with m.If(self.i.ctx.op == int(DPCOp.UDivRem)):
+                m.d.comb += [self.o.z.e.eq(self.i.a.e - self.i.b.e),
                              self.o.z.s.eq(self.i.a.s ^ self.i.b.s),
-                             self.o.dividend[len(self.i.a.m)+extra:].eq(am0),
-                             self.o.divisor_radicand.eq(bm0),
-                             self.o.operation.eq(Const(0)) # XXX DIV operation
-                    ]
+                             self.o.operation.eq(int(DPCOp.UDivRem))
+                             ]
 
             # SQRT
-            with m.Elif(self.i.ctx.op == 1):
-                am0 = Signal(len(self.i.a.m)+3, reset_less=True)
-                with m.If(self.i.a.e[0]):
-                    m.d.comb += am0.eq(Cat(self.i.a.m, 0)<<(extra-2))
-                    m.d.comb += self.o.z.e.eq(((self.i.a.e+1) >> 1)+1)
-                with m.Else():
-                    m.d.comb += am0.eq(Cat(0, self.i.a.m)<<(extra-2))
-                    m.d.comb += self.o.z.e.eq((self.i.a.e >> 1)+1)
-
-                m.d.comb += [self.o.z.s.eq(self.i.a.s),
-                             self.o.divisor_radicand.eq(am0),
-                             self.o.operation.eq(Const(1)) # XXX SQRT operation
-                    ]
+            with m.Elif(self.i.ctx.op == int(DPCOp.SqrtRem)):
+                m.d.comb += [self.o.z.e.eq(adj_a_e >> 1),
+                             self.o.z.s.eq(self.i.a.s),
+                             self.o.operation.eq(int(DPCOp.SqrtRem))
+                             ]
 
             # RSQRT
-            with m.Elif(self.i.ctx.op == 2):
-                am0 = Signal(len(self.i.a.m)+3, reset_less=True)
-                with m.If(self.i.a.e[0]):
-                    m.d.comb += am0.eq(Cat(self.i.a.m, 0)<<(extra-3))
-                    m.d.comb += self.o.z.e.eq(-((self.i.a.e+1) >> 1)+4)
-                with m.Else():
-                    m.d.comb += am0.eq(Cat(self.i.a.m)<<(extra-2))
-                    m.d.comb += self.o.z.e.eq(-(self.i.a.e >> 1)+4)
-
-                m.d.comb += [self.o.z.s.eq(self.i.a.s),
-                             self.o.divisor_radicand.eq(am0),
-                             self.o.operation.eq(Const(2)) # XXX RSQRT operation
-                    ]
+            with m.Elif(self.i.ctx.op == int(DPCOp.RSqrtRem)):
+                m.d.comb += [self.o.z.e.eq(-(adj_a_e >> 1)),
+                             self.o.z.s.eq(self.i.a.s),
+                             self.o.operation.eq(int(DPCOp.RSqrtRem))
+                             ]
 
         # these are required and must not be touched
         m.d.comb += self.o.oz.eq(self.i.oz)
diff --git a/src/ieee754/fpdiv/div2.py b/src/ieee754/fpdiv/div2.py
index 0261528b..f8d98f4e 100644
--- a/src/ieee754/fpdiv/div2.py
+++ b/src/ieee754/fpdiv/div2.py
@@ -21,12 +21,12 @@ class FPDivStage2Mod(FPState, Elaboratable):
         self.o = self.ospec()
 
     def ispec(self):
-        return DivPipeOutputData(self.pspec) # Q/Rem in...
+        return DivPipeOutputData(self.pspec)  # Q/Rem in...
 
     def ospec(self):
         # XXX REQUIRED.  MUST NOT BE CHANGED.  this is the format
         # required for ongoing processing (normalisation, correction etc.)
-        return FPAddStage1Data(self.pspec) # out to post-process
+        return FPAddStage1Data(self.pspec)  # out to post-process
 
     def process(self, i):
         return self.o
@@ -40,8 +40,8 @@ class FPDivStage2Mod(FPState, Elaboratable):
     def elaborate(self, platform):
         m = Module()
 
-        # copies sign and exponent and mantissa (mantissa to be overridden
-        # below)
+        # copies sign and exponent and mantissa (mantissa and exponent to be
+        # overridden below)
         m.d.comb += self.o.z.eq(self.i.z)
 
         # TODO: this is "phase 3" of divide (the very end of the pipeline)
@@ -52,6 +52,54 @@ class FPDivStage2Mod(FPState, Elaboratable):
         # NOTE: this phase does NOT do ACTUAL DIV processing, it ONLY
         # does "conversion" *out* of the Q/REM last stage
 
+        # Operations and input/output mantissa ranges:
+        # fdiv:
+        #   dividend [1.0, 2.0)
+        #   divisor [1.0, 2.0)
+        #   result (0.5, 2.0)
+        #
+        # fsqrt:
+        #   radicand [1.0, 4.0)
+        #   result [1.0, 2.0)
+        #
+        # frsqrt:
+        #   radicand [1.0, 4.0)
+        #   result (0.5, 1.0]
+
+        # following section partially normalizes result to the range [1.0, 2.0)
+
+        qr_int_part = Signal(2, reset_less=True)
+        m.d.comb += qr_int_part.eq(
+            self.i.quotient_root[self.pspec.core_config.fract_width:][:2])
+
+        need_shift = Signal(reset_less=True)
+
+        # shift left when result is less than 2.0 since result_m has 1 more
+        # fraction bit, making assigning to it the equivalent of dividing by 2.
+        # this all comes out to:
+        # if quotient_root < 2.0:
+        #     # div by 2 from assign; mul by 2 from shift left
+        #     result = (quotient_root * 2) / 2
+        # else:
+        #     # div by 2 from assign
+        #     result = quotient_root / 2
+        m.d.comb += need_shift.eq(qr_int_part < 2)
+
+        # one extra fraction bit to accommodate the result when not shifting
+        # and for effective div by 2
+        result_m_fract_width = self.pspec.core_config.fract_width + 1
+        # 1 integer bit since the numbers are less than 2.0
+        result_m = Signal(1 + result_m_fract_width, reset_less=True)
+        result_e = Signal(len(self.i.z.e), reset_less=True)
+
+        m.d.comb += [
+            result_m.eq(self.i.quotient_root << need_shift),
+            result_e.eq(self.i.z.e + (1 - need_shift))
+        ]
+
+        # result_m is now in the range [1.0, 2.0)
+
+        # FIXME: below comment block out of date
         # NOTE: see FPDivStage0Mod comment.  the quotient is assumed
         # to be in the range 0.499999-recurring to 1.999998.  normalisation
         # will take care of that, *however*, it *might* be necessary to
@@ -59,30 +107,16 @@ class FPDivStage2Mod(FPState, Elaboratable):
         # mantissa to compensate.  this is pretty much exactly what's
         # done in FPMUL, due to 0.5-0.9999 * 0.5-0.9999 also producing
         # values within the range 0.5 to 1.999998
+        # FIXME: above comment block out of date
 
-        with m.If(~self.i.out_do_z):
-            mw = self.o.z.m_width
-            # TODO: compensate for answer being in range 0.49999 to 1.99998
-            pl = len(self.i.quotient_root) + 1
-            pt = Signal(pl, reset_less=True)
-            m.d.comb += pt.eq(Cat(0, self.i.quotient_root))
-            p = Signal(pl-1, reset_less=True) # drop top bit
-            with m.If(self.i.quotient_root[-1]):
-                m.d.comb += p.eq(pt[1:])
-            with m.Else():
-                # get 1 bit of extra accuracy if the mantissa top bit is zero
-                m.d.comb += p.eq(pt)
-                m.d.comb += self.o.z.e.eq(self.i.z.e-1)
-
-            # TODO: use p here instead of quotient_root, direct.
-            # XXX what to do about remainder? shift that as well?
-            # hmm, how about concatenate remainder and quotient...
+        with m.If(~self.i.out_do_z):  # FIXME: does this need to be conditional?
             m.d.comb += [
-                self.o.z.m.eq(p[-mw:]),
-                self.o.of.m0.eq(p[-mw]), # copy of LSB
-                self.o.of.guard.eq(p[-mw-1]),
-                self.o.of.round_bit.eq(p[-mw-2]),
-                self.o.of.sticky.eq(p[:-mw-2].bool() | self.i.remainder.bool())
+                self.o.z.m.eq(result_m[3:]),
+                self.o.of.m0.eq(result_m[3]),  # copy of LSB
+                self.o.of.guard.eq(result_m[2]),
+                self.o.of.round_bit.eq(result_m[1]),
+                self.o.of.sticky.eq(result_m[0] | self.i.remainder.bool()),
+                self.o.z.e.eq(result_e),
             ]
 
         m.d.comb += self.o.out_do_z.eq(self.i.out_do_z)
@@ -106,7 +140,7 @@ class FPDivStage2(FPState):
         """
         self.mod.setup(m, i)
 
-        m.d.sync += self.norm_stb.eq(0) # sets to zero when not in div1 state
+        m.d.sync += self.norm_stb.eq(0)  # sets to zero when not in div1 state
 
         m.d.sync += self.out_of.eq(self.mod.out_of)
         m.d.sync += self.out_z.eq(self.mod.out_z)
@@ -114,4 +148,3 @@ class FPDivStage2(FPState):
 
     def action(self, m):
         m.next = "normalise_1"
-
diff --git a/src/ieee754/fpdiv/pipeline.py b/src/ieee754/fpdiv/pipeline.py
index 18375ef8..68d07eef 100644
--- a/src/ieee754/fpdiv/pipeline.py
+++ b/src/ieee754/fpdiv/pipeline.py
@@ -88,7 +88,7 @@ class FPDIVBasePipe(ControlBase):
         # get number of stages, set up loop.
         n_stages = pspec.core_config.n_stages
         max_n_comb_stages = self.pspec.n_comb_stages
-        print ("n_stages", n_stages)
+        print("n_stages", n_stages)
         stage_idx = 0
 
         end = False
@@ -98,22 +98,22 @@ class FPDIVBasePipe(ControlBase):
             # needs to convert input from pipestart ospec
             if stage_idx == 0:
                 n_comb_stages -= 1
-                kls = FPDivStagesSetup # does n_comb_stages-1 calcs as well
+                kls = FPDivStagesSetup  # does n_comb_stages-1 calcs as well
 
             # needs to convert output to pipeend ispec
             elif stage_idx + n_comb_stages >= n_stages:
-                kls = FPDivStagesFinal # does n_comb_stages-1 calcs as well
+                kls = FPDivStagesFinal  # does n_comb_stages-1 calcs as well
                 end = True
                 n_comb_stages = n_stages - stage_idx
 
             # intermediary stage
             else:
-                kls = FPDivStagesIntermediate # does n_comb_stages calcs
+                kls = FPDivStagesIntermediate  # does n_comb_stages calcs
 
             # create (in each pipe) a StageChain n_comb_stages in length
             pipechain.append(kls(self.pspec, n_comb_stages, stage_idx))
-            stage_idx += n_comb_stages # increment so that each CalcStage
-                                       # gets a (correct) unique index
+            stage_idx += n_comb_stages  # increment so that each CalcStage
+            # ...so that it gets a (correct) unique index
 
         self.pipechain = pipechain
 
@@ -137,6 +137,7 @@ class FPDIVBasePipe(ControlBase):
 
         return m
 
+
 def roundup(x, mod):
     return x if x % mod == 0 else x + mod - x % mod
 
@@ -160,24 +161,22 @@ class FPDIVMuxInOut(ReservationStations):
         # get the standard mantissa width, store in the pspec HOWEVER...
         fmt = FPFormat.standard(width)
         log2_radix = 3     # tested options so far: 1, 2 and 3.
-        n_comb_stages = 3  # TODO (depends on how many RS's we want)
-
-        # ...5 extra bits on the mantissa: MSB is zero, MSB-1 is 1
-        # then there is guard, round and sticky at the LSB end.
-        # also: round up to nearest radix
-        if width == 16:
-            extra = 5
-        elif width == 32:
-            extra = 6
-        elif width == 64:
-            extra = 5
-        fmt.m_width = roundup(fmt.m_width + extra, log2_radix)
-        print ("width", fmt.m_width)
-
-        cfg = DivPipeCoreConfig(fmt.m_width, fmt.fraction_width, log2_radix)
+
+        # TODO (depends on how many RS's we want)
+        #n_comb_stages = width // (2 * log2_radix)  # 2 compute steps per stage
+        n_comb_stages = 2  # FIXME: switch back
+
+        fraction_width = fmt.fraction_width
+
+        # extra bits needed: guard + round
+        fraction_width += 2
+
+        # rounding width to a multiple of log2_radix is not needed,
+        # DivPipeCoreCalculateStage just internally reduces log2_radix on
+        # the last stage
+        cfg = DivPipeCoreConfig(fmt.width, fraction_width, log2_radix)
 
         self.pspec.fpformat = fmt
-        self.pspec.log2_radix = log2_radix
         self.pspec.n_comb_stages = n_comb_stages
         self.pspec.core_config = cfg
 
-- 
2.30.2