From 17bae1e0a1697afa6da10b1be28c69b27f4ef47b Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Sun, 28 Jul 2019 13:14:19 +0100 Subject: [PATCH] get fpdiv/fsqrt/frsqrt up and running --- src/ieee754/fpdiv/div0.py | 136 ++++++++++++++++------------------ src/ieee754/fpdiv/div2.py | 89 +++++++++++++++------- src/ieee754/fpdiv/pipeline.py | 43 ++++++----- 3 files changed, 146 insertions(+), 122 deletions(-) diff --git a/src/ieee754/fpdiv/div0.py b/src/ieee754/fpdiv/div0.py index c93ba723..1197c267 100644 --- a/src/ieee754/fpdiv/div0.py +++ b/src/ieee754/fpdiv/div0.py @@ -3,7 +3,7 @@ Relevant bugreport: http://bugs.libre-riscv.org/show_bug.cgi?id=99 """ -from nmigen import Module, Signal, Cat, Elaboratable, Const +from nmigen import Module, Signal, Cat, Elaboratable, Const, Mux from nmigen.cli import main, verilog from ieee754.fpcommon.fpbase import (FPNumBaseRecord, Overflow) @@ -11,6 +11,7 @@ from ieee754.fpcommon.fpbase import FPState from ieee754.fpcommon.denorm import FPSCData from ieee754.fpcommon.getop import FPPipeContext from ieee754.div_rem_sqrt_rsqrt.div_pipe import DivPipeInputData +from ieee754.div_rem_sqrt_rsqrt.core import DivPipeCoreOperation as DPCOp class FPDivStage0Mod(Elaboratable): @@ -50,84 +51,75 @@ class FPDivStage0Mod(Elaboratable): # it is PURELY the *ENTRY* point into the chain, performing # "preparation" work. - with m.If(~self.i.out_do_z): - # do conversion here, of both self.i.a and self.i.b, - # into DivPipeInputData dividend and divisor. - - # XXX *sigh* magic constants... - if self.pspec.width == 16: - if self.pspec.log2_radix == 1: - extra = 2 - elif self.pspec.log2_radix == 3: - extra = 2 - else: - extra = 3 - elif self.pspec.width == 32: - if self.pspec.log2_radix == 1: - extra = 3 - else: - extra = 4 - elif self.pspec.width == 64: - if self.pspec.log2_radix == 1: - extra = 2 - elif self.pspec.log2_radix == 3: - extra = 2 - else: - extra = 3 - - # the mantissas, having been de-normalised (and containing - # a "1" in the MSB) represent numbers in the range 0.5 to - # 0.9999999-recurring. the min and max range of the - # result is therefore 0.4999999 (0.5/0.99999) and 1.9999998 - # (0.99999/0.5). + # mantissas start in the range [1.0, 2.0) + + is_div = Signal(reset_less=True) + need_exp_adj = Signal(reset_less=True) + + # ``self.i.a.rmw`` fractional bits and 2 integer bits + adj_a_m_fract_width = self.i.a.rmw + adj_a_m = Signal(self.i.a.rmw + 2, reset_less=True) + + adj_a_e = Signal((len(self.i.a.e), True), reset_less=True) + + m.d.comb += [is_div.eq(self.i.ctx.op == int(DPCOp.UDivRem)), + need_exp_adj.eq(~is_div & self.i.a.e[0]), + adj_a_m.eq(self.i.a.m << need_exp_adj), + adj_a_e.eq(self.i.a.e - need_exp_adj)] + + # adj_a_m now in the range [1.0, 4.0) for sqrt/rsqrt + # and [1.0, 2.0) for div + + dividend_fract_width = self.pspec.core_config.fract_width * 2 + dividend = Signal(len(self.o.dividend), + reset_less=True) + divr_rad_fract_width = self.pspec.core_config.fract_width + divr_rad = Signal(len(self.o.divisor_radicand), + reset_less=True) + + a_m_fract_width = self.i.a.rmw + b_m_fract_width = self.i.b.rmw + + m.d.comb += [ + dividend.eq(self.i.a.m << ( + dividend_fract_width - a_m_fract_width)), + divr_rad.eq(Mux(is_div, + self.i.b.m << ( + divr_rad_fract_width - b_m_fract_width), + adj_a_m << ( + divr_rad_fract_width - adj_a_m_fract_width))), + ] + + m.d.comb += [ + self.o.dividend.eq(dividend), + self.o.divisor_radicand.eq(divr_rad), + ] + + # set default since it's not always set; non-zero value for debugging + m.d.comb += self.o.operation.eq(1) + + with m.If(~self.i.out_do_z): # DIV - with m.If(self.i.ctx.op == 0): - am0 = Signal(len(self.i.a.m)+1, reset_less=True) - bm0 = Signal(len(self.i.b.m)+1, reset_less=True) - m.d.comb += [ - am0.eq(Cat(self.i.a.m, 0)), - bm0.eq(Cat(self.i.b.m, 0)), - ] - - # zero-extend the mantissas (room for sticky/round/guard) - # plus the extra MSB. - m.d.comb += [self.o.z.e.eq(self.i.a.e - self.i.b.e + 1), + with m.If(self.i.ctx.op == int(DPCOp.UDivRem)): + m.d.comb += [self.o.z.e.eq(self.i.a.e - self.i.b.e), self.o.z.s.eq(self.i.a.s ^ self.i.b.s), - self.o.dividend[len(self.i.a.m)+extra:].eq(am0), - self.o.divisor_radicand.eq(bm0), - self.o.operation.eq(Const(0)) # XXX DIV operation - ] + self.o.operation.eq(int(DPCOp.UDivRem)) + ] # SQRT - with m.Elif(self.i.ctx.op == 1): - am0 = Signal(len(self.i.a.m)+3, reset_less=True) - with m.If(self.i.a.e[0]): - m.d.comb += am0.eq(Cat(self.i.a.m, 0)<<(extra-2)) - m.d.comb += self.o.z.e.eq(((self.i.a.e+1) >> 1)+1) - with m.Else(): - m.d.comb += am0.eq(Cat(0, self.i.a.m)<<(extra-2)) - m.d.comb += self.o.z.e.eq((self.i.a.e >> 1)+1) - - m.d.comb += [self.o.z.s.eq(self.i.a.s), - self.o.divisor_radicand.eq(am0), - self.o.operation.eq(Const(1)) # XXX SQRT operation - ] + with m.Elif(self.i.ctx.op == int(DPCOp.SqrtRem)): + m.d.comb += [self.o.z.e.eq(adj_a_e >> 1), + self.o.z.s.eq(self.i.a.s), + self.o.operation.eq(int(DPCOp.SqrtRem)) + ] # RSQRT - with m.Elif(self.i.ctx.op == 2): - am0 = Signal(len(self.i.a.m)+3, reset_less=True) - with m.If(self.i.a.e[0]): - m.d.comb += am0.eq(Cat(self.i.a.m, 0)<<(extra-3)) - m.d.comb += self.o.z.e.eq(-((self.i.a.e+1) >> 1)+4) - with m.Else(): - m.d.comb += am0.eq(Cat(self.i.a.m)<<(extra-2)) - m.d.comb += self.o.z.e.eq(-(self.i.a.e >> 1)+4) - - m.d.comb += [self.o.z.s.eq(self.i.a.s), - self.o.divisor_radicand.eq(am0), - self.o.operation.eq(Const(2)) # XXX RSQRT operation - ] + with m.Elif(self.i.ctx.op == int(DPCOp.RSqrtRem)): + m.d.comb += [self.o.z.e.eq(-(adj_a_e >> 1)), + self.o.z.s.eq(self.i.a.s), + self.o.operation.eq(int(DPCOp.RSqrtRem)) + ] # these are required and must not be touched m.d.comb += self.o.oz.eq(self.i.oz) diff --git a/src/ieee754/fpdiv/div2.py b/src/ieee754/fpdiv/div2.py index 0261528b..f8d98f4e 100644 --- a/src/ieee754/fpdiv/div2.py +++ b/src/ieee754/fpdiv/div2.py @@ -21,12 +21,12 @@ class FPDivStage2Mod(FPState, Elaboratable): self.o = self.ospec() def ispec(self): - return DivPipeOutputData(self.pspec) # Q/Rem in... + return DivPipeOutputData(self.pspec) # Q/Rem in... def ospec(self): # XXX REQUIRED. MUST NOT BE CHANGED. this is the format # required for ongoing processing (normalisation, correction etc.) - return FPAddStage1Data(self.pspec) # out to post-process + return FPAddStage1Data(self.pspec) # out to post-process def process(self, i): return self.o @@ -40,8 +40,8 @@ class FPDivStage2Mod(FPState, Elaboratable): def elaborate(self, platform): m = Module() - # copies sign and exponent and mantissa (mantissa to be overridden - # below) + # copies sign and exponent and mantissa (mantissa and exponent to be + # overridden below) m.d.comb += self.o.z.eq(self.i.z) # TODO: this is "phase 3" of divide (the very end of the pipeline) @@ -52,6 +52,54 @@ class FPDivStage2Mod(FPState, Elaboratable): # NOTE: this phase does NOT do ACTUAL DIV processing, it ONLY # does "conversion" *out* of the Q/REM last stage + # Operations and input/output mantissa ranges: + # fdiv: + # dividend [1.0, 2.0) + # divisor [1.0, 2.0) + # result (0.5, 2.0) + # + # fsqrt: + # radicand [1.0, 4.0) + # result [1.0, 2.0) + # + # frsqrt: + # radicand [1.0, 4.0) + # result (0.5, 1.0] + + # following section partially normalizes result to the range [1.0, 2.0) + + qr_int_part = Signal(2, reset_less=True) + m.d.comb += qr_int_part.eq( + self.i.quotient_root[self.pspec.core_config.fract_width:][:2]) + + need_shift = Signal(reset_less=True) + + # shift left when result is less than 2.0 since result_m has 1 more + # fraction bit, making assigning to it the equivalent of dividing by 2. + # this all comes out to: + # if quotient_root < 2.0: + # # div by 2 from assign; mul by 2 from shift left + # result = (quotient_root * 2) / 2 + # else: + # # div by 2 from assign + # result = quotient_root / 2 + m.d.comb += need_shift.eq(qr_int_part < 2) + + # one extra fraction bit to accommodate the result when not shifting + # and for effective div by 2 + result_m_fract_width = self.pspec.core_config.fract_width + 1 + # 1 integer bit since the numbers are less than 2.0 + result_m = Signal(1 + result_m_fract_width, reset_less=True) + result_e = Signal(len(self.i.z.e), reset_less=True) + + m.d.comb += [ + result_m.eq(self.i.quotient_root << need_shift), + result_e.eq(self.i.z.e + (1 - need_shift)) + ] + + # result_m is now in the range [1.0, 2.0) + + # FIXME: below comment block out of date # NOTE: see FPDivStage0Mod comment. the quotient is assumed # to be in the range 0.499999-recurring to 1.999998. normalisation # will take care of that, *however*, it *might* be necessary to @@ -59,30 +107,16 @@ class FPDivStage2Mod(FPState, Elaboratable): # mantissa to compensate. this is pretty much exactly what's # done in FPMUL, due to 0.5-0.9999 * 0.5-0.9999 also producing # values within the range 0.5 to 1.999998 + # FIXME: above comment block out of date - with m.If(~self.i.out_do_z): - mw = self.o.z.m_width - # TODO: compensate for answer being in range 0.49999 to 1.99998 - pl = len(self.i.quotient_root) + 1 - pt = Signal(pl, reset_less=True) - m.d.comb += pt.eq(Cat(0, self.i.quotient_root)) - p = Signal(pl-1, reset_less=True) # drop top bit - with m.If(self.i.quotient_root[-1]): - m.d.comb += p.eq(pt[1:]) - with m.Else(): - # get 1 bit of extra accuracy if the mantissa top bit is zero - m.d.comb += p.eq(pt) - m.d.comb += self.o.z.e.eq(self.i.z.e-1) - - # TODO: use p here instead of quotient_root, direct. - # XXX what to do about remainder? shift that as well? - # hmm, how about concatenate remainder and quotient... + with m.If(~self.i.out_do_z): # FIXME: does this need to be conditional? m.d.comb += [ - self.o.z.m.eq(p[-mw:]), - self.o.of.m0.eq(p[-mw]), # copy of LSB - self.o.of.guard.eq(p[-mw-1]), - self.o.of.round_bit.eq(p[-mw-2]), - self.o.of.sticky.eq(p[:-mw-2].bool() | self.i.remainder.bool()) + self.o.z.m.eq(result_m[3:]), + self.o.of.m0.eq(result_m[3]), # copy of LSB + self.o.of.guard.eq(result_m[2]), + self.o.of.round_bit.eq(result_m[1]), + self.o.of.sticky.eq(result_m[0] | self.i.remainder.bool()), + self.o.z.e.eq(result_e), ] m.d.comb += self.o.out_do_z.eq(self.i.out_do_z) @@ -106,7 +140,7 @@ class FPDivStage2(FPState): """ self.mod.setup(m, i) - m.d.sync += self.norm_stb.eq(0) # sets to zero when not in div1 state + m.d.sync += self.norm_stb.eq(0) # sets to zero when not in div1 state m.d.sync += self.out_of.eq(self.mod.out_of) m.d.sync += self.out_z.eq(self.mod.out_z) @@ -114,4 +148,3 @@ class FPDivStage2(FPState): def action(self, m): m.next = "normalise_1" - diff --git a/src/ieee754/fpdiv/pipeline.py b/src/ieee754/fpdiv/pipeline.py index 18375ef8..68d07eef 100644 --- a/src/ieee754/fpdiv/pipeline.py +++ b/src/ieee754/fpdiv/pipeline.py @@ -88,7 +88,7 @@ class FPDIVBasePipe(ControlBase): # get number of stages, set up loop. n_stages = pspec.core_config.n_stages max_n_comb_stages = self.pspec.n_comb_stages - print ("n_stages", n_stages) + print("n_stages", n_stages) stage_idx = 0 end = False @@ -98,22 +98,22 @@ class FPDIVBasePipe(ControlBase): # needs to convert input from pipestart ospec if stage_idx == 0: n_comb_stages -= 1 - kls = FPDivStagesSetup # does n_comb_stages-1 calcs as well + kls = FPDivStagesSetup # does n_comb_stages-1 calcs as well # needs to convert output to pipeend ispec elif stage_idx + n_comb_stages >= n_stages: - kls = FPDivStagesFinal # does n_comb_stages-1 calcs as well + kls = FPDivStagesFinal # does n_comb_stages-1 calcs as well end = True n_comb_stages = n_stages - stage_idx # intermediary stage else: - kls = FPDivStagesIntermediate # does n_comb_stages calcs + kls = FPDivStagesIntermediate # does n_comb_stages calcs # create (in each pipe) a StageChain n_comb_stages in length pipechain.append(kls(self.pspec, n_comb_stages, stage_idx)) - stage_idx += n_comb_stages # increment so that each CalcStage - # gets a (correct) unique index + stage_idx += n_comb_stages # increment so that each CalcStage + # gets a (correct) unique index self.pipechain = pipechain @@ -137,6 +137,7 @@ class FPDIVBasePipe(ControlBase): return m + def roundup(x, mod): return x if x % mod == 0 else x + mod - x % mod @@ -160,24 +161,22 @@ class FPDIVMuxInOut(ReservationStations): # get the standard mantissa width, store in the pspec HOWEVER... fmt = FPFormat.standard(width) log2_radix = 3 # tested options so far: 1, 2 and 3. - n_comb_stages = 3 # TODO (depends on how many RS's we want) - - # ...5 extra bits on the mantissa: MSB is zero, MSB-1 is 1 - # then there is guard, round and sticky at the LSB end. - # also: round up to nearest radix - if width == 16: - extra = 5 - elif width == 32: - extra = 6 - elif width == 64: - extra = 5 - fmt.m_width = roundup(fmt.m_width + extra, log2_radix) - print ("width", fmt.m_width) - - cfg = DivPipeCoreConfig(fmt.m_width, fmt.fraction_width, log2_radix) + + # TODO (depends on how many RS's we want) + #n_comb_stages = width // (2 * log2_radix) # 2 compute steps per stage + n_comb_stages = 2 # FIXME: switch back + + fraction_width = fmt.fraction_width + + # extra bits needed: guard + round + fraction_width += 2 + + # rounding width to a multiple of log2_radix is not needed, + # DivPipeCoreCalculateStage just internally reduces log2_radix on + # the last stage + cfg = DivPipeCoreConfig(fmt.width, fraction_width, log2_radix) self.pspec.fpformat = fmt - self.pspec.log2_radix = log2_radix self.pspec.n_comb_stages = n_comb_stages self.pspec.core_config = cfg -- 2.30.2