From: Luke Kenneth Casson Leighton Date: Fri, 22 May 2020 10:03:22 +0000 (+0100) Subject: add cookie-cut mul pipeline template X-Git-Tag: div_pipeline~951 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ac88348dfe4478778d03f899c19d08c5fb9455b5;p=soc.git add cookie-cut mul pipeline template --- diff --git a/src/soc/fu/mul/__init__.py b/src/soc/fu/mul/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/soc/fu/mul/formal/proof_main_stage.py b/src/soc/fu/mul/formal/proof_main_stage.py new file mode 100644 index 00000000..afcf12e7 --- /dev/null +++ b/src/soc/fu/mul/formal/proof_main_stage.py @@ -0,0 +1,110 @@ +# Proof of correctness for partitioned equal signal combiner +# Copyright (C) 2020 Michael Nolan + +from nmigen import (Module, Signal, Elaboratable, Mux, Cat, Repl, + signed) +from nmigen.asserts import Assert, AnyConst, Assume, Cover +from nmigen.test.utils import FHDLTestCase +from nmigen.cli import rtlil + +from soc.fu.shift_rot.main_stage import ShiftRotMainStage +from soc.fu.alu.pipe_data import ALUPipeSpec +from soc.fu.alu.alu_input_record import CompALUOpSubset +from soc.decoder.power_enums import InternalOp +import unittest + + +# This defines a module to drive the device under test and assert +# properties about its outputs +class Driver(Elaboratable): + def __init__(self): + # inputs and outputs + pass + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + rec = CompALUOpSubset() + recwidth = 0 + # Setup random inputs for dut.op + for p in rec.ports(): + width = p.width + recwidth += width + comb += p.eq(AnyConst(width)) + + pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth) + m.submodules.dut = dut = ShiftRotMainStage(pspec) + + # convenience variables + a = dut.i.rs + b = dut.i.rb + ra = dut.i.ra + carry_in = dut.i.xer_ca[0] + carry_in32 = dut.i.xer_ca[1] + so_in = dut.i.xer_so + carry_out = dut.o.xer_ca + o = dut.o.o + + # setup random inputs + comb += [a.eq(AnyConst(64)), + b.eq(AnyConst(64)), + carry_in.eq(AnyConst(1)), + carry_in32.eq(AnyConst(1)), + so_in.eq(AnyConst(1))] + + comb += dut.i.ctx.op.eq(rec) + + # Assert that op gets copied from the input to output + for rec_sig in rec.ports(): + name = rec_sig.name + dut_sig = getattr(dut.o.ctx.op, name) + comb += Assert(dut_sig == rec_sig) + + # signed and signed/32 versions of input a + a_signed = Signal(signed(64)) + a_signed_32 = Signal(signed(32)) + comb += a_signed.eq(a) + comb += a_signed_32.eq(a[0:32]) + + # main assertion of arithmetic operations + with m.Switch(rec.insn_type): + with m.Case(InternalOp.OP_SHL): + comb += Assume(ra == 0) + with m.If(rec.is_32bit): + comb += Assert(o[0:32] == ((a << b[0:6]) & 0xffffffff)) + comb += Assert(o[32:64] == 0) + with m.Else(): + comb += Assert(o == ((a << b[0:7]) & ((1 << 64)-1))) + with m.Case(InternalOp.OP_SHR): + comb += Assume(ra == 0) + with m.If(~rec.is_signed): + with m.If(rec.is_32bit): + comb += Assert(o[0:32] == (a[0:32] >> b[0:6])) + comb += Assert(o[32:64] == 0) + with m.Else(): + comb += Assert(o == (a >> b[0:7])) + with m.Else(): + with m.If(rec.is_32bit): + comb += Assert(o[0:32] == (a_signed_32 >> b[0:6])) + comb += Assert(o[32:64] == Repl(a[31], 32)) + with m.Else(): + comb += Assert(o == (a_signed >> b[0:7])) + + return m + + +class ALUTestCase(FHDLTestCase): + def test_formal(self): + module = Driver() + self.assertFormal(module, mode="bmc", depth=2) + self.assertFormal(module, mode="cover", depth=2) + def test_ilang(self): + dut = Driver() + vl = rtlil.convert(dut, ports=[]) + with open("main_stage.il", "w") as f: + f.write(vl) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/soc/fu/mul/main_stage.py b/src/soc/fu/mul/main_stage.py new file mode 100644 index 00000000..ea40da35 --- /dev/null +++ b/src/soc/fu/mul/main_stage.py @@ -0,0 +1,79 @@ +# This stage is intended to do most of the work of executing multiply +# instructions, as well as carry and overflow generation. This module +# however should not gate the carry or overflow, that's up to the +# output stage +from nmigen import (Module, Signal, Cat, Repl, Mux, Const) +from nmutil.pipemodbase import PipeModBase +from soc.fu.alu.pipe_data import ALUOutputData +from soc.fu.mul.pipe_data import MulInputData +from ieee754.part.partsig import PartitionedSignal +from soc.decoder.power_enums import InternalOp +from soc.fu.shift_rot.rotator import Rotator + +from soc.decoder.power_fields import DecodeFields +from soc.decoder.power_fieldsn import SignalBitRange + + +class ShiftRotMainStage(PipeModBase): + def __init__(self, pspec): + super().__init__(pspec, "main") + self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn]) + self.fields.create_specs() + + def ispec(self): + return MulInputData(self.pspec) + + def ospec(self): + return ALUOutputData(self.pspec) + + def elaborate(self, platform): + m = Module() + comb = m.d.comb + + # obtain me and mb fields from instruction. + m_fields = self.fields.instrs['M'] + md_fields = self.fields.instrs['MD'] + mb = Signal(m_fields['MB'][0:-1].shape()) + me = Signal(m_fields['ME'][0:-1].shape()) + mb_extra = Signal(1, reset_less=True) + comb += mb.eq(m_fields['MB'][0:-1]) + comb += me.eq(m_fields['ME'][0:-1]) + comb += mb_extra.eq(md_fields['mb'][0:-1][0]) + + # set up microwatt rotator module + m.submodules.rotator = rotator = Rotator() + comb += [ + rotator.me.eq(me), + rotator.mb.eq(mb), + rotator.mb_extra.eq(mb_extra), + rotator.rs.eq(self.i.rs), + rotator.ra.eq(self.i.ra), + rotator.shift.eq(self.i.rb), + rotator.is_32bit.eq(self.i.ctx.op.is_32bit), + rotator.arith.eq(self.i.ctx.op.is_signed), + ] + + # instruction rotate type + mode = Signal(3, reset_less=True) + with m.Switch(self.i.ctx.op.insn_type): + with m.Case(InternalOp.OP_SHL): comb += mode.eq(0b000) + with m.Case(InternalOp.OP_SHR): comb += mode.eq(0b001) # R-shift + with m.Case(InternalOp.OP_RLC): comb += mode.eq(0b110) # clear LR + with m.Case(InternalOp.OP_RLCL): comb += mode.eq(0b010) # clear L + with m.Case(InternalOp.OP_RLCR): comb += mode.eq(0b100) # clear R + + comb += Cat(rotator.right_shift, + rotator.clear_left, + rotator.clear_right).eq(mode) + + # outputs from the microwatt rotator module + # XXX TODO: carry32 + comb += [self.o.o.eq(rotator.result_o), + self.o.xer_ca[0].eq(rotator.carry_out_o)] + + ###### sticky overflow and context, both pass-through ##### + + comb += self.o.xer_so.data.eq(self.i.xer_so) + comb += self.o.ctx.eq(self.i.ctx) + + return m diff --git a/src/soc/fu/mul/pipe_data.py b/src/soc/fu/mul/pipe_data.py new file mode 100644 index 00000000..495d503b --- /dev/null +++ b/src/soc/fu/mul/pipe_data.py @@ -0,0 +1,10 @@ +from soc.fu.alu.alu_input_record import CompALUOpSubset +from soc.fu.pipe_data import IntegerData, CommonPipeSpec +from soc.fu.alu.pipe_data import ALUOutputData +from soc.fu.shift_rot.pipe_data import ShoftRotInputData + + +# TODO: replace CompALUOpSubset with CompShiftRotOpSubset +class ShiftRotPipeSpec(CommonPipeSpec): + regspec = (ShiftRotInputData.regspec, ALUOutputData.regspec) + opsubsetkls = CompALUOpSubset diff --git a/src/soc/fu/mul/pipeline.py b/src/soc/fu/mul/pipeline.py new file mode 100644 index 00000000..25745171 --- /dev/null +++ b/src/soc/fu/mul/pipeline.py @@ -0,0 +1,31 @@ +from nmutil.singlepipe import ControlBase +from nmutil.pipemodbase import PipeModBaseChain +from soc.fu.shift_rot.input_stage import ShiftRotInputStage +from soc.fu.shift_rot.main_stage import ShiftRotMainStage +from soc.fu.alu.output_stage import ALUOutputStage + +class MulStages1(PipeModBaseChain): + def get_chain(self): + inp = ALUInputStage(self.pspec) + main = MulMainStage1(self.pspec) + return [inp, main] + +class MulStages2(PipeModBaseChain): + def get_chain(self): + main2 = MulMainStage2(self.pspec) + out = ALUOutputStage(self.pspec) + return [main2, out] + + +class ShiftRotBasePipe(ControlBase): + def __init__(self, pspec): + ControlBase.__init__(self) + self.pipe1 = MulStages1(pspec) + self.pipe2 = MulStages2(pspec) + self._eqs = self.connect([self.pipe1, self.pipe2]) + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + m.submodules.pipe = self.pipe1 + m.d.comb += self._eqs + return m diff --git a/src/soc/fu/mul/test/test_pipe_caller.py b/src/soc/fu/mul/test/test_pipe_caller.py new file mode 100644 index 00000000..713ebd61 --- /dev/null +++ b/src/soc/fu/mul/test/test_pipe_caller.py @@ -0,0 +1,277 @@ +from nmigen import Module, Signal +from nmigen.back.pysim import Simulator, Delay, Settle +from nmigen.test.utils import FHDLTestCase +from nmigen.cli import rtlil +import unittest +from soc.decoder.isa.caller import ISACaller, special_sprs +from soc.decoder.power_decoder import (create_pdecode) +from soc.decoder.power_decoder2 import (PowerDecode2) +from soc.decoder.power_enums import (XER_bits, Function) +from soc.decoder.selectable_int import SelectableInt +from soc.simulator.program import Program +from soc.decoder.isa.all import ISA + + +from soc.fu.shift_rot.pipeline import ShiftRotBasePipe +from soc.fu.alu.alu_input_record import CompALUOpSubset +from soc.fu.shift_rot.pipe_data import ShiftRotPipeSpec +import random + +class TestCase: + def __init__(self, program, regs, sprs, name): + self.program = program + self.regs = regs + self.sprs = sprs + self.name = name + +def get_rec_width(rec): + recwidth = 0 + # Setup random inputs for dut.op + for p in rec.ports(): + width = p.width + recwidth += width + return recwidth + +def set_alu_inputs(alu, dec2, sim): + inputs = [] + # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43 + # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok)) + # and place it into data_i.b + + reg3_ok = yield dec2.e.read_reg3.ok + if reg3_ok: + reg3_sel = yield dec2.e.read_reg3.data + data3 = sim.gpr(reg3_sel).value + else: + data3 = 0 + reg1_ok = yield dec2.e.read_reg1.ok + if reg1_ok: + reg1_sel = yield dec2.e.read_reg1.data + data1 = sim.gpr(reg1_sel).value + else: + data1 = 0 + reg2_ok = yield dec2.e.read_reg2.ok + imm_ok = yield dec2.e.imm_data.ok + if reg2_ok: + reg2_sel = yield dec2.e.read_reg2.data + data2 = sim.gpr(reg2_sel).value + elif imm_ok: + data2 = yield dec2.e.imm_data.imm + else: + data2 = 0 + + yield alu.p.data_i.ra.eq(data1) + yield alu.p.data_i.rb.eq(data2) + yield alu.p.data_i.rs.eq(data3) + + +def set_extra_alu_inputs(alu, dec2, sim): + carry = 1 if sim.spr['XER'][XER_bits['CA']] else 0 + carry32 = 1 if sim.spr['XER'][XER_bits['CA32']] else 0 + yield alu.p.data_i.xer_ca[0].eq(carry) + yield alu.p.data_i.xer_ca[1].eq(carry32) + so = 1 if sim.spr['XER'][XER_bits['SO']] else 0 + yield alu.p.data_i.xer_so.eq(so) + + +# This test bench is a bit different than is usual. Initially when I +# was writing it, I had all of the tests call a function to create a +# device under test and simulator, initialize the dut, run the +# simulation for ~2 cycles, and assert that the dut output what it +# should have. However, this was really slow, since it needed to +# create and tear down the dut and simulator for every test case. + +# Now, instead of doing that, every test case in ShiftRotTestCase puts some +# data into the test_data list below, describing the instructions to +# be tested and the initial state. Once all the tests have been run, +# test_data gets passed to TestRunner which then sets up the DUT and +# simulator once, runs all the data through it, and asserts that the +# results match the pseudocode sim at every cycle. + +# By doing this, I've reduced the time it takes to run the test suite +# massively. Before, it took around 1 minute on my computer, now it +# takes around 3 seconds + +test_data = [] + + +class ShiftRotTestCase(FHDLTestCase): + def __init__(self, name): + super().__init__(name) + self.test_name = name + def run_tst_program(self, prog, initial_regs=[0] * 32, initial_sprs={}): + tc = TestCase(prog, initial_regs, initial_sprs, self.test_name) + test_data.append(tc) + + + def test_shift(self): + insns = ["slw", "sld", "srw", "srd", "sraw", "srad"] + for i in range(20): + choice = random.choice(insns) + lst = [f"{choice} 3, 1, 2"] + initial_regs = [0] * 32 + initial_regs[1] = random.randint(0, (1<<64)-1) + initial_regs[2] = random.randint(0, 63) + print(initial_regs[1], initial_regs[2]) + self.run_tst_program(Program(lst), initial_regs) + + + def test_shift_arith(self): + lst = ["sraw 3, 1, 2"] + initial_regs = [0] * 32 + initial_regs[1] = random.randint(0, (1<<64)-1) + initial_regs[2] = random.randint(0, 63) + print(initial_regs[1], initial_regs[2]) + self.run_tst_program(Program(lst), initial_regs) + + def test_shift_once(self): + lst = ["slw 3, 1, 4", + "slw 3, 1, 2"] + initial_regs = [0] * 32 + initial_regs[1] = 0x80000000 + initial_regs[2] = 0x40 + initial_regs[4] = 0x00 + self.run_tst_program(Program(lst), initial_regs) + + def test_rlwinm(self): + for i in range(10): + mb = random.randint(0,31) + me = random.randint(0,31) + sh = random.randint(0,31) + lst = [f"rlwinm 3, 1, {mb}, {me}, {sh}"] + initial_regs = [0] * 32 + initial_regs[1] = random.randint(0, (1<<64)-1) + self.run_tst_program(Program(lst), initial_regs) + + def test_rlwimi(self): + lst = ["rlwimi 3, 1, 5, 20, 6"] + initial_regs = [0] * 32 + initial_regs[1] = 0xdeadbeef + initial_regs[3] = 0x12345678 + self.run_tst_program(Program(lst), initial_regs) + + def test_rlwnm(self): + lst = ["rlwnm 3, 1, 2, 20, 6"] + initial_regs = [0] * 32 + initial_regs[1] = random.randint(0, (1<<64)-1) + initial_regs[2] = random.randint(0, 63) + self.run_tst_program(Program(lst), initial_regs) + + def test_rldicl(self): + lst = ["rldicl 3, 1, 5, 20"] + initial_regs = [0] * 32 + initial_regs[1] = random.randint(0, (1<<64)-1) + self.run_tst_program(Program(lst), initial_regs) + + def test_rldicr(self): + lst = ["rldicr 3, 1, 5, 20"] + initial_regs = [0] * 32 + initial_regs[1] = random.randint(0, (1<<64)-1) + self.run_tst_program(Program(lst), initial_regs) + + def test_rlc(self): + insns = ["rldic", "rldicl", "rldicr"] + for i in range(20): + choice = random.choice(insns) + sh = random.randint(0, 63) + m = random.randint(0, 63) + lst = [f"{choice} 3, 1, {sh}, {m}"] + initial_regs = [0] * 32 + initial_regs[1] = random.randint(0, (1<<64)-1) + self.run_tst_program(Program(lst), initial_regs) + + def test_ilang(self): + pspec = ShiftRotPipeSpec(id_wid=2) + alu = ShiftRotBasePipe(pspec) + vl = rtlil.convert(alu, ports=alu.ports()) + with open("pipeline.il", "w") as f: + f.write(vl) + + +class TestRunner(FHDLTestCase): + def __init__(self, test_data): + super().__init__("run_all") + self.test_data = test_data + + def run_all(self): + m = Module() + comb = m.d.comb + instruction = Signal(32) + + pdecode = create_pdecode() + + m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode) + + pspec = ShiftRotPipeSpec(id_wid=2) + m.submodules.alu = alu = ShiftRotBasePipe(pspec) + + comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.e) + comb += alu.p.valid_i.eq(1) + comb += alu.n.ready_i.eq(1) + comb += pdecode2.dec.raw_opcode_in.eq(instruction) + sim = Simulator(m) + + sim.add_clock(1e-6) + def process(): + for test in self.test_data: + print(test.name) + program = test.program + self.subTest(test.name) + simulator = ISA(pdecode2, test.regs, test.sprs, 0) + gen = program.generate_instructions() + instructions = list(zip(gen, program.assembly.splitlines())) + + index = simulator.pc.CIA.value//4 + while index < len(instructions): + ins, code = instructions[index] + + print("0x{:X}".format(ins & 0xffffffff)) + print(code) + + # ask the decoder to decode this binary data (endian'd) + yield pdecode2.dec.bigendian.eq(0) # little / big? + yield instruction.eq(ins) # raw binary instr. + yield Settle() + fn_unit = yield pdecode2.e.fn_unit + self.assertEqual(fn_unit, Function.SHIFT_ROT.value) + yield from set_alu_inputs(alu, pdecode2, simulator) + yield from set_extra_alu_inputs(alu, pdecode2, simulator) + yield + opname = code.split(' ')[0] + yield from simulator.call(opname) + index = simulator.pc.CIA.value//4 + + vld = yield alu.n.valid_o + while not vld: + yield + vld = yield alu.n.valid_o + yield + alu_out = yield alu.n.data_o.o + out_reg_valid = yield pdecode2.e.write_reg.ok + if out_reg_valid: + write_reg_idx = yield pdecode2.e.write_reg.data + expected = simulator.gpr(write_reg_idx).value + msg = f"expected {expected:x}, actual: {alu_out:x}" + self.assertEqual(expected, alu_out, msg) + yield from self.check_extra_alu_outputs(alu, pdecode2, + simulator) + + sim.add_sync_process(process) + with sim.write_vcd("simulator.vcd", "simulator.gtkw", + traces=[]): + sim.run() + def check_extra_alu_outputs(self, alu, dec2, sim): + rc = yield dec2.e.rc.data + if rc: + cr_expected = sim.crl[0].get_range().value + cr_actual = yield alu.n.data_o.cr0 + self.assertEqual(cr_expected, cr_actual) + + +if __name__ == "__main__": + unittest.main(exit=False) + suite = unittest.TestSuite() + suite.addTest(TestRunner(test_data)) + + runner = unittest.TextTestRunner() + runner.run(suite)