From f2fe0c00f38b95ab76df6110f9d0d868d7200e96 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton
Date: Mon, 6 Jul 2020 16:34:31 +0100
Subject: [PATCH] add first cut at fu mul pipeline

---
Review notes (free-form commentary placed after the three-dash line, ahead
of the diffstat; it is not part of the commit message or of the diff):

* Line breaks and leading indentation in this copy were reconstructed from
  the hunk line counts and the Python block structure.  Runs of spaces
  inside a line could not be recovered, so context/removed lines may not
  match the target tree byte-for-byte.
* pipeline.py: "self.pipe2 = MulStages3(pspec)" rebinds pipe2, so the
  MulStages2 instance is dropped and self.connect([...]) only chains
  pipe1 and MulStages3.  Presumably this should be self.pipe3, added to
  the connect list (and to whatever elaborate() registers - not visible
  in this hunk).
* pipeline.py imports MulMainStage1 and MulMainStage3 from
  soc.fu.mul.main_stage, but this patch defines them in pre_stage.py and
  post_stage.py respectively.
* pipe_data.py uses Signal without importing it, and keeps neg_result only
  as a local appended to self.data, whereas all three stages access an
  attribute named neg_res (unless a base class provides one - verify).
* pre_stage.py: elaborate() reads "op" but never assigns it
  (post_stage.py obtains it from self.i.ctx.op).
* post_stage.py: the XER.CA block reads add_o, a and b, none of which are
  defined in that elaborate(); cr0 and cry_i are assigned but never used.
* post_stage.py: the comment on ispec() says "output format" although it
  describes the stage input, and "get sign bit from operand a" has no
  matching code in that stage.

 src/soc/fu/mul/main_stage.py | 74 +++++++-------------------------
 src/soc/fu/mul/pipe_data.py  | 27 +++++++++---
 src/soc/fu/mul/pipeline.py   | 20 ++++++---
 src/soc/fu/mul/post_stage.py | 81 ++++++++++++++++++++++++++++++++++++
 src/soc/fu/mul/pre_stage.py  | 53 +++++++++++++++++++++++
 5 files changed, 187 insertions(+), 68 deletions(-)
 create mode 100644 src/soc/fu/mul/post_stage.py
 create mode 100644 src/soc/fu/mul/pre_stage.py

diff --git a/src/soc/fu/mul/main_stage.py b/src/soc/fu/mul/main_stage.py
index ea40da35..97ba81d7 100644
--- a/src/soc/fu/mul/main_stage.py
+++ b/src/soc/fu/mul/main_stage.py
@@ -1,79 +1,37 @@
-# This stage is intended to do most of the work of executing multiply
-# instructions, as well as carry and overflow generation. This module
-# however should not gate the carry or overflow, that's up to the
-# output stage
-from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
+# This stage is intended to do the main work of an actual multiply
+
+from nmigen import Module
 from nmutil.pipemodbase import PipeModBase
-from soc.fu.alu.pipe_data import ALUOutputData
-from soc.fu.mul.pipe_data import MulInputData
+from soc.fu.mul.pipe_data import MulIntermediateData, MulOutputData
 from ieee754.part.partsig import PartitionedSignal
-from soc.decoder.power_enums import InternalOp
-from soc.fu.shift_rot.rotator import Rotator
-
-from soc.decoder.power_fields import DecodeFields
-from soc.decoder.power_fieldsn import SignalBitRange
 
 
-class ShiftRotMainStage(PipeModBase):
+class MulMainStage2(PipeModBase):
     def __init__(self, pspec):
-        super().__init__(pspec, "main")
-        self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
-        self.fields.create_specs()
+        super().__init__(pspec, "mul2")
 
     def ispec(self):
-        return MulInputData(self.pspec)
+        return MulIntermediateData(self.pspec) # pipeline stage input format
 
     def ospec(self):
-        return ALUOutputData(self.pspec)
+        return MulOutputData(self.pspec) # pipeline stage output format
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
 
-        # obtain me and mb fields from instruction.
-        m_fields = self.fields.instrs['M']
-        md_fields = self.fields.instrs['MD']
-        mb = Signal(m_fields['MB'][0:-1].shape())
-        me = Signal(m_fields['ME'][0:-1].shape())
-        mb_extra = Signal(1, reset_less=True)
-        comb += mb.eq(m_fields['MB'][0:-1])
-        comb += me.eq(m_fields['ME'][0:-1])
-        comb += mb_extra.eq(md_fields['mb'][0:-1][0])
-
-        # set up microwatt rotator module
-        m.submodules.rotator = rotator = Rotator()
-        comb += [
-            rotator.me.eq(me),
-            rotator.mb.eq(mb),
-            rotator.mb_extra.eq(mb_extra),
-            rotator.rs.eq(self.i.rs),
-            rotator.ra.eq(self.i.ra),
-            rotator.shift.eq(self.i.rb),
-            rotator.is_32bit.eq(self.i.ctx.op.is_32bit),
-            rotator.arith.eq(self.i.ctx.op.is_signed),
-        ]
+        # convenience variables
+        a, b, o = self.i.a, self.i.b, self.o.o
 
-        # instruction rotate type
-        mode = Signal(3, reset_less=True)
-        with m.Switch(self.i.ctx.op.insn_type):
-            with m.Case(InternalOp.OP_SHL): comb += mode.eq(0b000)
-            with m.Case(InternalOp.OP_SHR): comb += mode.eq(0b001) # R-shift
-            with m.Case(InternalOp.OP_RLC): comb += mode.eq(0b110) # clear LR
-            with m.Case(InternalOp.OP_RLCL): comb += mode.eq(0b010) # clear L
-            with m.Case(InternalOp.OP_RLCR): comb += mode.eq(0b100) # clear R
+        # actual multiply (TODO: split into stages)
+        comb += o.eq(a * b)
 
-        comb += Cat(rotator.right_shift,
-                    rotator.clear_left,
-                    rotator.clear_right).eq(mode)
-
-        # outputs from the microwatt rotator module
-        # XXX TODO: carry32
-        comb += [self.o.o.eq(rotator.result_o),
-                 self.o.xer_ca[0].eq(rotator.carry_out_o)]
-
-        ###### sticky overflow and context, both pass-through #####
+        ###### xer and context, all pass-through #####
 
+        comb += self.o.xer_ca.data.eq(self.i.xer_ca)
+        comb += self.o.neg_res.data.eq(self.i.neg_res)
         comb += self.o.xer_so.data.eq(self.i.xer_so)
         comb += self.o.ctx.eq(self.i.ctx)
 
         return m
+
diff --git a/src/soc/fu/mul/pipe_data.py b/src/soc/fu/mul/pipe_data.py
index 495d503b..429be008 100644
--- a/src/soc/fu/mul/pipe_data.py
+++ b/src/soc/fu/mul/pipe_data.py
@@ -1,10 +1,27 @@
 from soc.fu.alu.alu_input_record import CompALUOpSubset
 from soc.fu.pipe_data import IntegerData, CommonPipeSpec
-from soc.fu.alu.pipe_data import ALUOutputData
-from soc.fu.shift_rot.pipe_data import ShoftRotInputData
+from soc.fu.alu.pipe_data import ALUOutputData, ALUInputData
 
 
-# TODO: replace CompALUOpSubset with CompShiftRotOpSubset
-class ShiftRotPipeSpec(CommonPipeSpec):
-    regspec = (ShiftRotInputData.regspec, ALUOutputData.regspec)
+class MulIntermediateData(ALUInputData):
+    def __init__(self, pspec):
+        super().__init__(pspec)
+
+        neg_result = Signal(reset_less=True)
+        self.data.append(neg_result)
+
+
+class MulOutputData(IntegerData):
+    regspec = [('INT', 'o', '0:128'),
+               ('XER', 'xer_so', '32'), # XER bit 32: SO
+               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+    def __init__(self, pspec):
+        super().__init__(pspec, False)
+
+        neg_result = Signal(reset_less=True)
+        self.data.append(neg_result)
+
+
+class MulPipeSpec(CommonPipeSpec):
+    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
     opsubsetkls = CompALUOpSubset
diff --git a/src/soc/fu/mul/pipeline.py b/src/soc/fu/mul/pipeline.py
index e726d170..d32d7529 100644
--- a/src/soc/fu/mul/pipeline.py
+++ b/src/soc/fu/mul/pipeline.py
@@ -3,18 +3,27 @@ from nmutil.pipemodbase import PipeModBaseChain
 from soc.fu.shift_rot.input_stage import ShiftRotInputStage
 from soc.fu.shift_rot.main_stage import ShiftRotMainStage
 from soc.fu.alu.output_stage import ALUOutputStage
+from soc.fu.mul.main_stage import MulMainStage1, MulMainStage2, MulMainStage3
+
 
 class MulStages1(PipeModBaseChain):
     def get_chain(self):
-        inp = ALUInputStage(self.pspec)
-        main = MulMainStage1(self.pspec)
+        inp = ALUInputStage(self.pspec) # a-invert, carry etc
+        main = MulMainStage1(self.pspec) # detect signed/32-bit
         return [inp, main]
 
+
 class MulStages2(PipeModBaseChain):
     def get_chain(self):
-        main2 = MulMainStage2(self.pspec)
-        out = ALUOutputStage(self.pspec)
-        return [main2, out]
+        main2 = MulMainStage2(self.pspec) # actual multiply
+        return [main2]
+
+
+class MulStages3(PipeModBaseChain):
+    def get_chain(self):
+        main3 = MulMainStage3(self.pspec) # select output bits, invert, set ov
+        out = ALUOutputStage(self.pspec) # do CR, XER and out-invert etc.
+        return [main3, out]
 
 
 class ShiftRotBasePipe(ControlBase):
@@ -23,6 +32,7 @@ class ShiftRotBasePipe(ControlBase):
         self.pspec = pspec
         self.pipe1 = MulStages1(pspec)
         self.pipe2 = MulStages2(pspec)
+        self.pipe2 = MulStages3(pspec)
         self._eqs = self.connect([self.pipe1, self.pipe2])
 
     def elaborate(self, platform):
diff --git a/src/soc/fu/mul/post_stage.py b/src/soc/fu/mul/post_stage.py
new file mode 100644
index 00000000..501b4ed5
--- /dev/null
+++ b/src/soc/fu/mul/post_stage.py
@@ -0,0 +1,81 @@
+# This stage is intended to do most of the work of analysing the multiply result
+
+from nmigen import (Module, Signal, Cat, Repl, Mux, signed)
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.alu.pipe_data import ALUOutputData
+from soc.fu.mul.pipe_data import MulOutputData
+from ieee754.part.partsig import PartitionedSignal
+from soc.decoder.power_enums import InternalOp
+
+
+class MulMainStage3(PipeModBase):
+    def __init__(self, pspec):
+        super().__init__(pspec, "mul3")
+
+    def ispec(self):
+        return MulOutputData(self.pspec) # pipeline stage output format
+
+    def ospec(self):
+        return ALUOutputData(self.pspec) # defines pipeline stage output format
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # convenience variables
+        cry_o, o, cr0 = self.o.xer_ca, self.o.o, self.o.cr0
+        ov_o = self.o.xer_ov
+        o_i, cry_i, op = self.i.o, self.i.xer_ca, self.i.ctx.op
+
+        # check if op is 32-bit, and get sign bit from operand a
+        is_32bit = Signal(reset_less=True)
+        comb += is_32bit.eq(op.is_32bit)
+
+        # check negate: select signed/unsigned
+        o_s = Signal(signed(o.width * 2), reset_less=True)
+        mul_o = Signal(o.width * 2, reset_less=True)
+        comb += o_s.eq(-o_i)
+        comb += mul_o.eq(Mux(self.i.neg_res, o_s, o_i))
+        comb += o.ok.eq(1)
+
+        with m.Switch(op.insn_type):
+            # hi-32 replicated twice
+            with m.Case(InternalOp.OP_MUL_H32):
+                comb += o.data.eq(Repl(mul_o[32:64], 2))
+            # hi-64
+            with m.Case(InternalOp.OP_MUL_H64):
+                comb += o.data.eq(mul_o[64:128])
+            # lo-64 - overflow
+            with m.Default():
+                comb += o.data.eq(mul_o[0:64])
+
+                # compute overflow
+                mul_ov = Signal(reset_less=True)
+                with m.If(is_32bit):
+                    m32 = mul_o[32:64]
+                    comb += mul_ov.eq(m32.bool() & ~m32.all())
+                with m.Else():
+                    m64 = mul_o[64:128]
+                    comb += mul_ov.eq(m64.bool() & ~m64.all())
+
+                # 32-bit (ov[1]) and 64-bit (ov[0]) overflow
+                ov = Signal(2, reset_less=True)
+                comb += ov[0].eq(mul_ov)
+                comb += ov[1].eq(mul_ov)
+                comb += ov_o.data.eq(ov)
+                comb += ov_o.ok.eq(1)
+
+        # https://bugs.libre-soc.org/show_bug.cgi?id=319#c5
+        ca = Signal(2, reset_less=True)
+        comb += ca[0].eq(add_o[-1]) # XER.CA
+        comb += ca[1].eq(add_o[33] ^ (a[32] ^ b[32])) # XER.CA32
+        comb += cry_o.data.eq(ca)
+        comb += cry_o.ok.eq(1)
+
+        ###### sticky overflow and context, both pass-through #####
+
+        comb += self.o.xer_so.data.eq(self.i.xer_so)
+        comb += self.o.ctx.eq(self.i.ctx)
+
+        return m
+
diff --git a/src/soc/fu/mul/pre_stage.py b/src/soc/fu/mul/pre_stage.py
new file mode 100644
index 00000000..ff1e3220
--- /dev/null
+++ b/src/soc/fu/mul/pre_stage.py
@@ -0,0 +1,53 @@
+# This stage is intended to do most of the work of executing multiply
+from nmigen import (Module, Signal, Mux)
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.alu.pipe_data import ALUInputData
+from soc.fu.mul.pipe_data import MulIntermediateData
+from ieee754.part.partsig import PartitionedSignal
+
+
+class MulMainStage1(PipeModBase):
+    def __init__(self, pspec):
+        super().__init__(pspec, "mul1")
+
+    def ispec(self):
+        return ALUInputData(self.pspec) # defines pipeline stage input format
+
+    def ospec(self):
+        return MulIntermediateData(self.pspec) # pipeline stage output format
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # convenience variables
+        a, b = self.i.a, self.i.b
+        a_o, b_o, neg_res_o = self.o.a, self.o.b, self.o.neg_res
+
+        # check if op is 32-bit, and get sign bit from operand a
+        is_32bit = Signal(reset_less=True)
+        sign_a = Signal(reset_less=True)
+        sign_b = Signal(reset_less=True)
+        comb += is_32bit.eq(op.is_32bit)
+
+        # work out if a/b are negative (check 32-bit / signed)
+        comb += sign_a.eq(Mux(op.is_32bit, a[31], a[63]) & op.is_signed)
+        comb += sign_b.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+
+        # work out if result is negative sign
+        comb += neg_res_o.eq(sign_a ^ sign_b)
+
+        # negation of a 64-bit value produces the same lower 32-bit
+        # result as negation of just the lower 32-bits, so we don't
+        # need to do anything special before negating
+        comb += a_o.eq(Mux(sign_a, -a, a))
+        comb += b_o.eq(Mux(sign_b, -b, b))
+
+        ###### XER and context, both pass-through #####
+
+        comb += self.o.xer_ca.data.eq(self.i.xer_ca)
+        comb += self.o.xer_so.data.eq(self.i.xer_so)
+        comb += self.o.ctx.eq(self.i.ctx)
+
+        return m
+
-- 
2.30.2