From: Jacob Lifshay
Date: Wed, 17 Nov 2021 20:49:41 +0000 (-0800)
Subject: start adding bitmanip FU
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=e281a933b5e0b7b0c85040116a404873f4ee0f17;p=soc.git

start adding bitmanip FU
---
Review note: line structure and indentation restored; a few comments in the
added files were corrected, so the blob ids on the "index" lines describe
the originally committed contents.

diff --git a/src/soc/fu/bitmanip/__init__.py b/src/soc/fu/bitmanip/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/soc/fu/bitmanip/input_record.py b/src/soc/fu/bitmanip/input_record.py
new file mode 100644
index 00000000..18af8eaa
--- /dev/null
+++ b/src/soc/fu/bitmanip/input_record.py
@@ -0,0 +1,27 @@
+from soc.fu.base_input_record import CompOpSubsetBase
+from nmigen.hdl.rec import Layout
+
+from openpower.decoder.power_enums import MicrOp, Function, CryIn
+
+
+class CompBitManipOpSubset(CompOpSubsetBase):
+    """CompBitManipOpSubset
+
+    a copy of the relevant subset information from Decode2Execute1Type
+    needed for bitmanip operations. use with eq_from_execute1 to
+    grab subsets.
+    """
+
+    def __init__(self, name=None):
+        layout = (('insn_type', MicrOp),
+                  ('fn_unit', Function),
+                  ('imm_data', Layout((("data", 64), ("ok", 1)))),
+                  ('rc', Layout((("rc", 1), ("ok", 1)))),  # Data
+                  ('oe', Layout((("oe", 1), ("ok", 1)))),  # Data
+                  ('write_cr0', 1),
+                  ('is_32bit', 1),
+                  ('is_signed', 1),
+                  ('insn', 32),
+                  )
+
+        super().__init__(layout, name=name)
diff --git a/src/soc/fu/bitmanip/input_stage.py b/src/soc/fu/bitmanip/input_stage.py
new file mode 100644
index 00000000..e8234c36
--- /dev/null
+++ b/src/soc/fu/bitmanip/input_stage.py
@@ -0,0 +1,26 @@
+# This stage is intended to adjust the input data before sending it to
+# the actual ALU. Things like handling inverting the input, carry_in
+# generation for subtraction, and handling of immediates should happen
+# here
+from soc.fu.common_input_stage import CommonInputStage
+from soc.fu.bitmanip.pipe_data import BitManipInputData
+
+
+class BitManipInputStage(CommonInputStage):
+    def __init__(self, pspec):
+        super().__init__(pspec, "input")
+
+    def ispec(self):
+        return BitManipInputData(self.pspec)
+
+    def ospec(self):
+        return BitManipInputData(self.pspec)
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)  # handles A, carry and sticky overflow
+        comb = m.d.comb
+
+        # operand rc (NOTE(review): no rb forwarding visible here - confirm)
+        comb += self.o.rc.eq(self.i.rc)
+
+        return m
diff --git a/src/soc/fu/bitmanip/main_stage.py b/src/soc/fu/bitmanip/main_stage.py
new file mode 100644
index 00000000..b31df431
--- /dev/null
+++ b/src/soc/fu/bitmanip/main_stage.py
@@ -0,0 +1,59 @@
+# License: LGPLv3+
+# Copyright (C) 2020 Michael Nolan
+# Copyright (C) 2020 Luke Kenneth Casson Leighton
+# Copyright (C) 2021 Jacob Lifshay
+
+# This stage is intended to do most of the work of executing bitmanip
+# instructions, as well as overflow generation. This module however should not
+# gate the overflow, that's up to the output stage
+from nmigen.hdl.dsl import Module
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.bitmanip.pipe_data import (BitManipOutputData,
+                                       BitManipInputData)
+from openpower.decoder.power_enums import MicrOp
+from openpower.decoder.power_fields import DecodeFields
+from openpower.decoder.power_fieldsn import SignalBitRange
+from nmutil.lut import BitwiseLut
+
+
+class BitManipMainStage(PipeModBase):
+    def __init__(self, pspec):
+        super().__init__(pspec, "main")
+        self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
+        self.fields.create_specs()
+
+    def ispec(self):
+        return BitManipInputData(self.pspec)
+
+    def ospec(self):
+        return BitManipOutputData(self.pspec)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        op = self.i.ctx.op
+        o = self.o.o
+
+        bitwise_lut = BitwiseLut(input_count=3, width=64)
+        m.submodules.bitwise_lut = bitwise_lut
+        comb += bitwise_lut.inputs[0].eq(self.i.rb)
+        comb += bitwise_lut.inputs[1].eq(self.i.ra)
+        comb += bitwise_lut.inputs[2].eq(self.i.rc)
+
+        comb += o.ok.eq(1)  # defaults to enabled
+
+        with m.Switch(op.insn_type):
+            with m.Case(MicrOp.OP_TERNLOG):
+                # TODO: this only works for ternaryi, change to get lut value
+                # from register when we implement other variants
+                comb += bitwise_lut.lut.eq(self.fields.FormTLI.TLI)
+                comb += o.data.eq(bitwise_lut.output)
+            with m.Default():
+                comb += o.ok.eq(0)  # otherwise disable
+
+        ###### sticky overflow and context, both pass-through #####
+
+        comb += self.o.xer_so.data.eq(self.i.xer_so)
+        comb += self.o.ctx.eq(self.i.ctx)
+
+        return m
diff --git a/src/soc/fu/bitmanip/output_stage.py b/src/soc/fu/bitmanip/output_stage.py
new file mode 100644
index 00000000..ea2d9de9
--- /dev/null
+++ b/src/soc/fu/bitmanip/output_stage.py
@@ -0,0 +1,15 @@
+# This stage is intended to handle the gating of carry and overflow
+# out, summary overflow generation, and updating the condition
+# register
+from soc.fu.common_output_stage import CommonOutputStage
+from soc.fu.bitmanip.pipe_data import (BitManipOutputData,
+                                       BitManipOutputDataFinal)
+
+
+class BitManipOutputStage(CommonOutputStage):
+
+    def ispec(self):
+        return BitManipOutputData(self.pspec)
+
+    def ospec(self):
+        return BitManipOutputDataFinal(self.pspec)
diff --git a/src/soc/fu/bitmanip/pipe_data.py b/src/soc/fu/bitmanip/pipe_data.py
new file mode 100644
index 00000000..c0420b78
--- /dev/null
+++ b/src/soc/fu/bitmanip/pipe_data.py
@@ -0,0 +1,47 @@
+from soc.fu.bitmanip.input_record import CompBitManipOpSubset
+from soc.fu.pipe_data import FUBaseData, CommonPipeSpec
+from soc.fu.alu.pipe_data import ALUOutputData
+
+
+class BitManipInputData(FUBaseData):
+    regspec = [
+        ('INT', 'ra', '0:63'),  # RA
+        ('INT', 'rb', '0:63'),  # RB
+        ('INT', 'rc', '0:63'),  # RC
+        ('XER', 'xer_so', '32'),  # XER bit 32: SO
+    ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, False)
+
+
+# input to bitmanip final stage (common output)
+class BitManipOutputData(FUBaseData):
+    regspec = [
+        ('INT', 'o', '0:63'),  # RT
+        ('CR', 'cr_a', '0:3'),
+        ('XER', 'xer_so', '32'),  # bit0: so
+    ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, True)
+        # convenience
+        self.cr0 = self.cr_a
+
+
+# output from bitmanip final stage (common output) - note that XER.so
+# is *not* included (the only reason it's in the input is because of CR0)
+class BitManipOutputDataFinal(FUBaseData):
+    regspec = [('INT', 'o', '0:63'),  # RT
+               ('CR', 'cr_a', '0:3'),
+               ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, True)
+        # convenience
+        self.cr0 = self.cr_a
+
+
+class BitManipPipeSpec(CommonPipeSpec):
+    regspec = (BitManipInputData.regspec, BitManipOutputDataFinal.regspec)
+    opsubsetkls = CompBitManipOpSubset
diff --git a/src/soc/fu/bitmanip/pipeline.py b/src/soc/fu/bitmanip/pipeline.py
new file mode 100644
index 00000000..c0804536
--- /dev/null
+++ b/src/soc/fu/bitmanip/pipeline.py
@@ -0,0 +1,34 @@
+from nmutil.singlepipe import ControlBase
+from nmutil.pipemodbase import PipeModBaseChain
+from soc.fu.bitmanip.input_stage import BitManipInputStage
+from soc.fu.bitmanip.main_stage import BitManipMainStage
+from soc.fu.bitmanip.output_stage import BitManipOutputStage
+
+
+class BitManipStages(PipeModBaseChain):
+    def get_chain(self):
+        inp = BitManipInputStage(self.pspec)
+        main = BitManipMainStage(self.pspec)
+        return [inp, main]
+
+
+class BitManipStageEnd(PipeModBaseChain):
+    def get_chain(self):
+        out = BitManipOutputStage(self.pspec)
+        return [out]
+
+
+class BitManipBasePipe(ControlBase):
+    def __init__(self, pspec):
+        ControlBase.__init__(self)
+        self.pspec = pspec
+        self.pipe1 = BitManipStages(pspec)
+        self.pipe2 = BitManipStageEnd(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2])
+
+    def elaborate(self, platform):
+        m = ControlBase.elaborate(self, platform)
+        m.submodules.pipe1 = self.pipe1
+        m.submodules.pipe2 = self.pipe2
+        m.d.comb += self._eqs
+        return m
diff --git a/src/soc/fu/bitmanip/test/__init__.py b/src/soc/fu/bitmanip/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/soc/fu/bitmanip/test/test_pipe_caller.py b/src/soc/fu/bitmanip/test/test_pipe_caller.py
new file mode 100644
index 00000000..a6690209
--- /dev/null
+++ b/src/soc/fu/bitmanip/test/test_pipe_caller.py
@@ -0,0 +1,191 @@
+import random
+from soc.fu.bitmanip.pipe_data import BitManipPipeSpec
+from soc.fu.bitmanip.pipeline import BitManipBasePipe
+from openpower.test.common import TestAccumulatorBase, TestCase, ALUHelpers
+from openpower.endian import bigendian
+from openpower.decoder.isa.all import ISA
+from openpower.simulator.program import Program
+from openpower.decoder.power_enums import (XER_bits, Function, CryIn)
+from openpower.decoder.power_decoder2 import (PowerDecode2)
+from openpower.decoder.power_decoder import (create_pdecode)
+import unittest
+from nmigen.cli import rtlil
+from nmigen import Module, Signal
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+from nmutil.sim_tmp_alternative import Simulator, Settle
+
+from openpower.test.bitmanip.bitmanip_cases import BitManipTestCase
+
+
+def get_cu_inputs(dec2, sim):
+    """naming (res) must conform to BitManipFunctionUnit input regspec
+    """
+    res = {}
+
+    yield from ALUHelpers.get_sim_int_ra(res, sim, dec2)  # RA
+    yield from ALUHelpers.get_sim_int_rb(res, sim, dec2)  # RB
+    yield from ALUHelpers.get_sim_int_rc(res, sim, dec2)  # RC
+    yield from ALUHelpers.get_sim_xer_so(res, sim, dec2)  # XER.so
+
+    print("alu get_cu_inputs", res)
+
+    return res
+
+
+def set_alu_inputs(alu, dec2, sim):
+    inp = yield from get_cu_inputs(dec2, sim)
+    yield from ALUHelpers.set_int_ra(alu, dec2, inp)
+    yield from ALUHelpers.set_int_rb(alu, dec2, inp)
+    yield from ALUHelpers.set_int_rc(alu, dec2, inp)
+    yield from ALUHelpers.set_xer_so(alu, dec2, inp)
+
+
+# This test bench is a bit different than is usual. Initially when I
+# was writing it, I had all of the tests call a function to create a
+# device under test and simulator, initialize the dut, run the
+# simulation for ~2 cycles, and assert that the dut output what it
+# should have. However, this was really slow, since it needed to
+# create and tear down the dut and simulator for every test case.
+
+# Now, instead of doing that, every test case in BitManipTestCase puts some
+# data into the test_data list below, describing the instructions to
+# be tested and the initial state. Once all the tests have been run,
+# test_data gets passed to TestRunner which then sets up the DUT and
+# simulator once, runs all the data through it, and asserts that the
+# results match the pseudocode sim at every cycle.
+
+# By doing this, I've reduced the time it takes to run the test suite
+# massively. Before, it took around 1 minute on my computer, now it
+# takes around 3 seconds
+
+
+class BitManipIlangCase(TestAccumulatorBase):
+
+    def case_ilang(self):
+        pspec = BitManipPipeSpec(id_wid=2)
+        alu = BitManipBasePipe(pspec)
+        vl = rtlil.convert(alu, ports=alu.ports())
+        with open("bitmanip_pipeline.il", "w") as f:
+            f.write(vl)
+
+
+class TestRunner(unittest.TestCase):
+    def __init__(self, test_data):
+        super().__init__("run_all")
+        self.test_data = test_data
+
+    def execute(self, alu, instruction, pdecode2, test):
+        program = test.program
+        simulator = ISA(pdecode2, test.regs, test.sprs, test.cr,
+                        test.mem, test.msr,
+                        bigendian=bigendian)
+        gen = program.generate_instructions()
+        instructions = list(zip(gen, program.assembly.splitlines()))
+
+        index = simulator.pc.CIA.value//4
+        while index < len(instructions):
+            ins, code = instructions[index]
+
+            print("0x{:X}".format(ins & 0xffffffff))
+            print(code)
+
+            # ask the decoder to decode this binary data (endian'd)
+            yield pdecode2.dec.bigendian.eq(bigendian)  # little / big?
+            yield instruction.eq(ins)  # raw binary instr.
+            yield Settle()
+            fn_unit = yield pdecode2.e.do.fn_unit
+            self.assertEqual(fn_unit, Function.BITMANIP.value)
+            yield from set_alu_inputs(alu, pdecode2, simulator)
+
+            # set valid for one cycle, propagate through pipeline...
+            yield alu.p.i_valid.eq(1)
+            yield
+            yield alu.p.i_valid.eq(0)
+
+            opname = code.split(' ')[0]
+            yield from simulator.call(opname)
+            index = simulator.pc.CIA.value//4
+
+            vld = yield alu.n.o_valid
+            while not vld:
+                yield
+                vld = yield alu.n.o_valid
+            yield
+            alu_out = yield alu.n.o_data.o.data
+
+            yield from self.check_alu_outputs(alu, pdecode2,
+                                              simulator, code)
+            yield Settle()
+
+    def run_all(self):
+        m = Module()
+        comb = m.d.comb
+        instruction = Signal(32)
+
+        fn_name = "BITMANIP"
+        opkls = BitManipPipeSpec.opsubsetkls
+
+        m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
+        pdecode = pdecode2.dec
+
+        pspec = BitManipPipeSpec(id_wid=2)
+        m.submodules.alu = alu = BitManipBasePipe(pspec)
+
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
+        comb += pdecode2.dec.raw_opcode_in.eq(instruction)
+        sim = Simulator(m)
+
+        sim.add_clock(1e-6)
+
+        def process():
+            for test in self.test_data:
+                print(test.name)
+                program = test.program
+                with self.subTest(test.name):
+                    yield from self.execute(alu, instruction, pdecode2, test)
+
+        sim.add_sync_process(process)
+        with sim.write_vcd("bitmanip_simulator.vcd"):
+            sim.run()
+
+    def check_alu_outputs(self, alu, dec2, sim, code):
+
+        rc = yield dec2.e.do.rc.rc
+        cridx_ok = yield dec2.e.write_cr.ok
+        cridx = yield dec2.e.write_cr.data
+
+        print("check extra output", repr(code), cridx_ok, cridx)
+        if rc:
+            self.assertEqual(cridx, 0, code)
+
+        sim_o = {}
+        res = {}
+
+        yield from ALUHelpers.get_cr_a(res, alu, dec2)
+        yield from ALUHelpers.get_xer_ca(res, alu, dec2)
+        yield from ALUHelpers.get_int_o(res, alu, dec2)
+
+        print("hw outputs", res)
+
+        yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
+        yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
+        yield from ALUHelpers.get_wr_sim_xer_ca(sim_o, sim, dec2)
+
+        print("sim outputs", sim_o)
+
+        ALUHelpers.check_cr_a(self, res, sim_o, "CR%d %s" % (cridx, code))
+        ALUHelpers.check_xer_ca(self, res, sim_o, code)
+        ALUHelpers.check_int_o(self, res, sim_o, code)
+
+
+if __name__ == "__main__":
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+    suite.addTest(TestRunner(BitManipTestCase().test_data))
+    suite.addTest(TestRunner(BitManipIlangCase().test_data))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)