X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fsimple%2Fissuer.py;h=15bd1760a5ab93f233d8cb7cdff813d7b0833096;hb=HEAD;hp=6aa790191c38a96d01d608a817d09417c50b6bdb;hpb=ed1504b692c40bed445a48fbb58fdde32dea4da6;p=soc.git diff --git a/src/soc/simple/issuer.py b/src/soc/simple/issuer.py index 6aa79019..15bd1760 100644 --- a/src/soc/simple/issuer.py +++ b/src/soc/simple/issuer.py @@ -33,7 +33,7 @@ from openpower.decoder.decode2execute1 import Data from openpower.decoder.power_enums import (MicrOp, SVP64PredInt, SVP64PredCR, SVP64PredMode) from openpower.state import CoreState -from openpower.consts import (CR, SVP64CROffs) +from openpower.consts import (CR, SVP64CROffs, MSR) from soc.experiment.testmem import TestMemory # test only for instructions from soc.regfile.regfiles import StateRegs, FastRegs from soc.simple.core import NonProductionCore @@ -48,7 +48,7 @@ from soc.bus.SPBlock512W64B8W import SPBlock512W64B8W from soc.clock.select import ClockSelect from soc.clock.dummypll import DummyPLL from openpower.sv.svstate import SVSTATERec - +from soc.experiment.icache import ICache from nmutil.util import rising_edge @@ -63,11 +63,10 @@ def get_insn(f_instr_o, pc): # gets state input or reads from state regfile -def state_get(m, core_rst, state_i, name, regfile, regnum): +def state_get(m, res, core_rst, state_i, name, regfile, regnum): comb = m.d.comb sync = m.d.sync - # read the PC - res = Signal(64, reset_less=True, name=name) + # read the {insert state variable here} res_ok_delay = Signal(name="%s_ok_delay" % name) with m.If(~core_rst): sync += res_ok_delay.eq(~state_i.ok) @@ -75,12 +74,11 @@ def state_get(m, core_rst, state_i, name, regfile, regnum): # incoming override (start from pc_i) comb += res.eq(state_i.data) with m.Else(): - # otherwise read StateRegs regfile for PC... + # otherwise read StateRegs regfile for {insert state here}... comb += regfile.ren.eq(1 << regnum) # ... but on a 1-clock delay with m.If(res_ok_delay): comb += res.eq(regfile.o_data) - return res def get_predint(m, mask, name): @@ -158,201 +156,34 @@ def get_predcr(m, mask, name): return idx, invert -# Fetch Finite State Machine. -# WARNING: there are currently DriverConflicts but it's actually working. -# TODO, here: everything that is global in nature, information from the -# main TestIssuerInternal, needs to move to either ispec() or ospec(). -# not only that: TestIssuerInternal.imem can entirely move into here -# because imem is only ever accessed inside the FetchFSM. -class FetchFSM(ControlBase): - def __init__(self, allow_overlap, svp64_en, imem, core_rst, - pdecode2, cur_state, - dbg, core, svstate, nia, is_svp64_mode): - self.allow_overlap = allow_overlap - self.svp64_en = svp64_en - self.imem = imem - self.core_rst = core_rst - self.pdecode2 = pdecode2 - self.cur_state = cur_state - self.dbg = dbg - self.core = core - self.svstate = svstate - self.nia = nia - self.is_svp64_mode = is_svp64_mode - - # set up pipeline ControlBase and allocate i/o specs - # (unusual: normally done by the Pipeline API) - super().__init__(stage=self) - self.p.i_data, self.n.o_data = self.new_specs(None) - self.i, self.o = self.p.i_data, self.n.o_data - - # next 3 functions are Stage API Compliance - def setup(self, m, i): - pass - - def ispec(self): - return FetchInput() - - def ospec(self): - return FetchOutput() - - def elaborate(self, platform): - """fetch FSM - - this FSM performs fetch of raw instruction data, partial-decodes - it 32-bit at a time to detect SVP64 prefixes, and will optionally - read a 2nd 32-bit quantity if that occurs. - """ - m = super().elaborate(platform) - - dbg = self.dbg - core = self.core, - pc = self.i.pc - svstate = self.svstate - nia = self.nia - is_svp64_mode = self.is_svp64_mode - fetch_pc_o_ready = self.p.o_ready - fetch_pc_i_valid = self.p.i_valid - fetch_insn_o_valid = self.n.o_valid - fetch_insn_i_ready = self.n.i_ready - - comb = m.d.comb - sync = m.d.sync - pdecode2 = self.pdecode2 - cur_state = self.cur_state - dec_opcode_o = pdecode2.dec.raw_opcode_in # raw opcode - - msr_read = Signal(reset=1) - - # don't read msr every cycle - staterf = self.core.regs.rf['state'] - state_r_msr = staterf.r_ports['msr'] # MSR rd - - comb += state_r_msr.ren.eq(0) - - with m.FSM(name='fetch_fsm'): - - # waiting (zzz) - with m.State("IDLE"): - with m.If(~dbg.stopping_o): - comb += fetch_pc_o_ready.eq(1) - with m.If(fetch_pc_i_valid): - # instruction allowed to go: start by reading the PC - # capture the PC and also drop it into Insn Memory - # we have joined a pair of combinatorial memory - # lookups together. this is Generally Bad. - comb += self.imem.a_pc_i.eq(pc) - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - sync += cur_state.pc.eq(pc) - sync += cur_state.svstate.eq(svstate) # and svstate - - # initiate read of MSR. arrives one clock later - comb += state_r_msr.ren.eq(1 << StateRegs.MSR) - sync += msr_read.eq(0) - - m.next = "INSN_READ" # move to "wait for bus" phase - - # dummy pause to find out why simulation is not keeping up - with m.State("INSN_READ"): - if self.allow_overlap: - stopping = dbg.stopping_o - else: - stopping = Const(0) - with m.If(stopping): - # stopping: jump back to idle - m.next = "IDLE" - with m.Else(): - # one cycle later, msr/sv read arrives. valid only once. - with m.If(~msr_read): - sync += msr_read.eq(1) # yeah don't read it again - sync += cur_state.msr.eq(state_r_msr.o_data) - with m.If(self.imem.f_busy_o): # zzz... - # busy: stay in wait-read - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - with m.Else(): - # not busy: instruction fetched - insn = get_insn(self.imem.f_instr_o, cur_state.pc) - if self.svp64_en: - svp64 = self.svp64 - # decode the SVP64 prefix, if any - comb += svp64.raw_opcode_in.eq(insn) - comb += svp64.bigendian.eq(self.core_bigendian_i) - # pass the decoded prefix (if any) to PowerDecoder2 - sync += pdecode2.sv_rm.eq(svp64.svp64_rm) - sync += pdecode2.is_svp64_mode.eq(is_svp64_mode) - # remember whether this is a prefixed instruction, - # so the FSM can readily loop when VL==0 - sync += is_svp64_mode.eq(svp64.is_svp64_mode) - # calculate the address of the following instruction - insn_size = Mux(svp64.is_svp64_mode, 8, 4) - sync += nia.eq(cur_state.pc + insn_size) - with m.If(~svp64.is_svp64_mode): - # with no prefix, store the instruction - # and hand it directly to the next FSM - sync += dec_opcode_o.eq(insn) - m.next = "INSN_READY" - with m.Else(): - # fetch the rest of the instruction from memory - comb += self.imem.a_pc_i.eq(cur_state.pc + 4) - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - m.next = "INSN_READ2" - else: - # not SVP64 - 32-bit only - sync += nia.eq(cur_state.pc + 4) - sync += dec_opcode_o.eq(insn) - m.next = "INSN_READY" - - with m.State("INSN_READ2"): - with m.If(self.imem.f_busy_o): # zzz... - # busy: stay in wait-read - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - with m.Else(): - # not busy: instruction fetched - insn = get_insn(self.imem.f_instr_o, cur_state.pc+4) - sync += dec_opcode_o.eq(insn) - m.next = "INSN_READY" - # TODO: probably can start looking at pdecode2.rm_dec - # here or maybe even in INSN_READ state, if svp64_mode - # detected, in order to trigger - and wait for - the - # predicate reading. - if self.svp64_en: - pmode = pdecode2.rm_dec.predmode - """ - if pmode != SVP64PredMode.ALWAYS.value: - fire predicate loading FSM and wait before - moving to INSN_READY - else: - sync += self.srcmask.eq(-1) # set to all 1s - sync += self.dstmask.eq(-1) # set to all 1s - m.next = "INSN_READY" - """ - - with m.State("INSN_READY"): - # hand over the instruction, to be decoded - comb += fetch_insn_o_valid.eq(1) - with m.If(fetch_insn_i_ready): - m.next = "IDLE" +class TestIssuerBase(Elaboratable): + """TestIssuerBase - common base class for Issuers - # whatever was done above, over-ride it if core reset is held - with m.If(self.core_rst): - sync += nia.eq(0) + takes care of power-on reset, peripherals, debug, DEC/TB, + and gets PC/MSR/SVSTATE from the State Regfile etc. + """ - return m + def __init__(self, pspec): + # test if microwatt compatibility is to be enabled + self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and + (pspec.microwatt_compat == True)) + self.alt_reset = Signal(reset_less=True) # not connected yet (microwatt) + # test if fabric compatibility is to be enabled + self.fabric_compat = (hasattr(pspec, "fabric_compat") and + (pspec.fabric_compat == True)) -class TestIssuerInternal(Elaboratable): - """TestIssuer - reads instructions from TestMemory and issues them + if self.microwatt_compat or self.fabric_compat: - efficiency and speed is not the main goal here: functional correctness - and code clarity is. optimisations (which almost 100% interfere with - easy understanding) come later. - """ + if hasattr(pspec, "microwatt_old"): + self.microwatt_old = pspec.microwatt_old + else: + self.microwatt_old = True # PLEASE DO NOT ALTER THIS - def __init__(self, pspec): + if hasattr(pspec, "microwatt_debug"): + self.microwatt_debug = pspec.microwatt_debug + else: + self.microwatt_debug = True # set to False when using an FPGA # test is SVP64 is to be enabled self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True) @@ -365,14 +196,20 @@ class TestIssuerInternal(Elaboratable): self.allow_overlap = (hasattr(pspec, "allow_overlap") and (pspec.allow_overlap == True)) + # and get the core domain + self.core_domain = "coresync" + if (hasattr(pspec, "core_domain") and + isinstance(pspec.core_domain, str)): + self.core_domain = pspec.core_domain + # JTAG interface. add this right at the start because if it's # added it *modifies* the pspec, by adding enable/disable signals # for parts of the rest of the core self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag' - self.dbg_domain = "sync" # sigh "dbgsunc" too problematic - # self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock + #self.dbg_domain = "sync" # sigh "dbgsunc" too problematic + self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock if self.jtag_en: - # XXX MUST keep this up-to-date with litex, and + # XXX MUST keep this up-to-date with fabric, and # soc-cocotb-sim, and err.. all needs sorting out, argh subset = ['uart', 'mtwi', @@ -409,6 +246,8 @@ class TestIssuerInternal(Elaboratable): self.xics_icp = XICS_ICP() self.xics_ics = XICS_ICS() self.int_level_i = self.xics_ics.int_level_i + else: + self.ext_irq = Signal() # add GPIO peripheral? self.gpio = hasattr(pspec, "gpio") and pspec.gpio == True @@ -418,7 +257,7 @@ class TestIssuerInternal(Elaboratable): # main instruction core. suitable for prototyping / demo only self.core = core = NonProductionCore(pspec) - self.core_rst = ResetSignal("coresync") + self.core_rst = ResetSignal(self.core_domain) # instruction decoder. goes into Trap Record #pdecode = create_pdecode() @@ -432,15 +271,24 @@ class TestIssuerInternal(Elaboratable): if self.svp64_en: self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix + self.update_svstate = Signal() # set this if updating svstate + self.new_svstate = new_svstate = SVSTATERec("new_svstate") + # Test Instruction memory + if hasattr(core, "icache"): + # XXX BLECH! use pspec to transfer the I-Cache to ConfigFetchUnit + # truly dreadful. needs a huge reorg. + pspec.icache = core.icache self.imem = ConfigFetchUnit(pspec).fu # DMI interface self.dbg = CoreDebug() + self.dbg_rst_i = Signal(reset_less=True) # instruction go/monitor self.pc_o = Signal(64, reset_less=True) self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me" + self.msr_i = Data(64, "msr_i") # set "ok" to indicate "please change me" self.svstate_i = Data(64, "svstate_i") # ditto self.core_bigendian_i = Signal() # TODO: set based on MSR.LE self.busy_o = Signal(reset_less=True) @@ -448,18 +296,23 @@ class TestIssuerInternal(Elaboratable): # STATE regfile read /write ports for PC, MSR, SVSTATE staterf = self.core.regs.rf['state'] + self.state_r_msr = staterf.r_ports['msr'] # MSR rd self.state_r_pc = staterf.r_ports['cia'] # PC rd - self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd + + self.state_w_msr = staterf.w_ports['d_wr2'] # MSR wr + self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr # DMI interface access intrf = self.core.regs.rf['int'] + fastrf = self.core.regs.rf['fast'] crrf = self.core.regs.rf['cr'] xerrf = self.core.regs.rf['xer'] - self.int_r = intrf.r_ports['dmi'] # INT read - self.cr_r = crrf.r_ports['full_cr_dbg'] # CR read - self.xer_r = xerrf.r_ports['full_xer'] # XER read + self.int_r = intrf.r_ports['dmi'] # INT DMI read + self.cr_r = crrf.r_ports['full_cr_dbg'] # CR DMI read + self.xer_r = xerrf.r_ports['full_xer'] # XER DMI read + self.fast_r = fastrf.r_ports['dmi'] # FAST DMI read if self.svp64_en: # for predication @@ -469,6 +322,8 @@ class TestIssuerInternal(Elaboratable): # hack method of keeping an eye on whether branch/trap set the PC self.state_nia = self.core.regs.rf['state'].w_ports['nia'] self.state_nia.wen.name = 'state_nia_wen' + # and whether SPR pipeline sets DEC or TB (fu/spr/main_stage.py) + self.state_spr = self.core.regs.rf['state'].w_ports['state1'] # pulse to synchronize the simulator at instruction end self.insn_done = Signal() @@ -481,128 +336,779 @@ class TestIssuerInternal(Elaboratable): self.srcmask = Signal(64) self.dstmask = Signal(64) - def fetch_predicate_fsm(self, m, - pred_insn_i_valid, pred_insn_o_ready, - pred_mask_o_valid, pred_mask_i_ready): - """fetch_predicate_fsm - obtains (constructs in the case of CR) - src/dest predicate masks + # sigh, the wishbone addresses are not wishbone-compliant + # in old versions of microwatt, tplaten_3d_game is a new one + if self.microwatt_compat or self.fabric_compat: + self.ibus_adr = Signal(32, name='wishbone_insn_out.adr') + self.dbus_adr = Signal(32, name='wishbone_data_out.adr') + + # add an output of the PC and instruction, and whether it was requested + # this is for verilator debug purposes + if self.microwatt_compat or self.fabric_compat: + self.nia = Signal(64) + self.msr_o = Signal(64) + self.nia_req = Signal(1) + self.insn = Signal(32) + self.ldst_req = Signal(1) + self.ldst_addr = Signal(1) + + # for pausing dec/tb during an SPR pipeline event, this + # ensures that an SPR write (mtspr) to TB or DEC does not + # get overwritten by the DEC/TB FSM + self.pause_dec_tb = Signal() - https://bugs.libre-soc.org/show_bug.cgi?id=617 - the predicates can be read here, by using IntRegs r_ports['pred'] - or CRRegs r_ports['pred']. in the case of CRs it will have to - be done through multiple reads, extracting one relevant at a time. - later, a faster way would be to use the 32-bit-wide CR port but - this is more complex decoding, here. equivalent code used in - ISACaller is "from openpower.decoder.isa.caller import get_predcr" + def setup_peripherals(self, m): + comb, sync = m.d.comb, m.d.sync - note: this ENTIRE FSM is not to be called when svp64 is disabled - """ - comb = m.d.comb - sync = m.d.sync - pdecode2 = self.pdecode2 - rm_dec = pdecode2.rm_dec # SVP64RMModeDecode - predmode = rm_dec.predmode - srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred - cr_pred, int_pred = self.cr_pred, self.int_pred # read regfiles - # get src/dst step, so we can skip already used mask bits - cur_state = self.cur_state - srcstep = cur_state.svstate.srcstep - dststep = cur_state.svstate.dststep - cur_vl = cur_state.svstate.vl + # okaaaay so the debug module must be in coresync clock domain + # but NOT its reset signal. to cope with this, set every single + # submodule explicitly in coresync domain, debug and JTAG + # in their own one but using *external* reset. + csd = DomainRenamer(self.core_domain) + dbd = DomainRenamer(self.dbg_domain) - # decode predicates - sregread, sinvert, sunary, sall1s = get_predint(m, srcpred, 's') - dregread, dinvert, dunary, dall1s = get_predint(m, dstpred, 'd') - sidx, scrinvert = get_predcr(m, srcpred, 's') - didx, dcrinvert = get_predcr(m, dstpred, 'd') + if self.microwatt_compat or self.fabric_compat: + m.submodules.core = core = self.core + else: + m.submodules.core = core = csd(self.core) - # store fetched masks, for either intpred or crpred - # when src/dst step is not zero, the skipped mask bits need to be - # shifted-out, before actually storing them in src/dest mask - new_srcmask = Signal(64, reset_less=True) - new_dstmask = Signal(64, reset_less=True) + # this _so_ needs sorting out. ICache is added down inside + # LoadStore1 and is already a submodule of LoadStore1 + if not isinstance(self.imem, ICache): + m.submodules.imem = imem = csd(self.imem) - with m.FSM(name="fetch_predicate"): + # set up JTAG Debug Module (in correct domain) + m.submodules.dbg = dbg = dbd(self.dbg) + if self.jtag_en: + m.submodules.jtag = jtag = dbd(self.jtag) + # TODO: UART2GDB mux, here, from external pin + # see https://bugs.libre-soc.org/show_bug.cgi?id=499 + sync += dbg.dmi.connect_to(jtag.dmi) - with m.State("FETCH_PRED_IDLE"): - comb += pred_insn_o_ready.eq(1) - with m.If(pred_insn_i_valid): - with m.If(predmode == SVP64PredMode.INT): - # skip fetching destination mask register, when zero - with m.If(dall1s): - sync += new_dstmask.eq(-1) - # directly go to fetch source mask register - # guaranteed not to be zero (otherwise predmode - # would be SVP64PredMode.ALWAYS, not INT) - comb += int_pred.addr.eq(sregread) - comb += int_pred.ren.eq(1) - m.next = "INT_SRC_READ" - # fetch destination predicate register - with m.Else(): - comb += int_pred.addr.eq(dregread) - comb += int_pred.ren.eq(1) - m.next = "INT_DST_READ" - with m.Elif(predmode == SVP64PredMode.CR): - # go fetch masks from the CR register file - sync += new_srcmask.eq(0) - sync += new_dstmask.eq(0) - m.next = "CR_READ" - with m.Else(): - sync += self.srcmask.eq(-1) - sync += self.dstmask.eq(-1) - m.next = "FETCH_PRED_DONE" + # fixup the clocks in microwatt-compat mode (but leave resets alone + # so that microwatt soc.vhdl can pull a reset on the core or DMI + # can do it, just like in TestIssuer) + if self.microwatt_compat or self.fabric_compat: + intclk = ClockSignal(self.core_domain) + dbgclk = ClockSignal(self.dbg_domain) + if self.core_domain != 'sync': + comb += intclk.eq(ClockSignal()) + if self.dbg_domain != 'sync': + comb += dbgclk.eq(ClockSignal()) + + # if using old version of microwatt + # drop the first 3 bits of the incoming wishbone addresses + if self.microwatt_compat or self.fabric_compat: + ibus = self.imem.ibus + dbus = self.core.l0.cmpi.wb_bus() + if self.microwatt_old: + comb += self.ibus_adr.eq(Cat(Const(0, 3), ibus.adr)) + comb += self.dbus_adr.eq(Cat(Const(0, 3), dbus.adr)) + else: + comb += self.ibus_adr.eq(ibus.adr) + comb += self.dbus_adr.eq(dbus.adr) + if self.microwatt_debug: + # microwatt verilator debug purposes + pi = self.core.l0.cmpi.pi.pi + comb += self.ldst_req.eq(pi.addr_ok_o) + comb += self.ldst_addr.eq(pi.addr) - with m.State("INT_DST_READ"): - # store destination mask - inv = Repl(dinvert, 64) - with m.If(dunary): - # set selected mask bit for 1<1 loop - with m.If(~dbg.core_stop_o & ~core_rst): - comb += exec_pc_i_ready.eq(1) - # see https://bugs.libre-soc.org/show_bug.cgi?id=636 - # the exception info needs to be blatted into - # pdecode.ldst_exc, and the instruction "re-run". - # when ldst_exc.happened is set, the PowerDecoder2 - # reacts very differently: it re-writes the instruction - # with a "trap" (calls PowerDecoder2.trap()) which - # will *overwrite* whatever was requested and jump the - # PC to the exception address, as well as alter MSR. - # nothing else needs to be done other than to note - # the change of PC and MSR (and, later, SVSTATE) - with m.If(exc_happened): - sync += pdecode2.ldst_exc.eq(core.fus.get_exc("ldst0")) - - with m.If(exec_pc_o_valid): - - # was this the last loop iteration? - is_last = Signal() - cur_vl = cur_state.svstate.vl - comb += is_last.eq(next_srcstep == cur_vl) - - # return directly to Decode if Execute generated an - # exception. - with m.If(pdecode2.ldst_exc.happened): - m.next = "DECODE_SV" - - # if either PC or SVSTATE were changed by the previous - # instruction, go directly back to Fetch, without - # updating either PC or SVSTATE - with m.Elif(pc_changed | sv_changed): - m.next = "ISSUE_START" + # when using "single-step" mode, checking dbg.stopping_o + # prevents progress. allow execute to proceed once started + stopping = Const(0) + #if self.allow_overlap: + # stopping = dbg.stopping_o + with m.If(stopping): + # stopping: jump back to idle + m.next = "ISSUE_START" + if flush_needed: + # request the icache to stop asserting "failed" + comb += core.icache.flush_in.eq(1) + # stop instruction fault + sync += pdecode2.instr_fault.eq(0) + with m.Else(): + comb += exec_insn_i_valid.eq(1) # trigger execute + with m.If(exec_insn_o_ready): # execute acknowledged us + m.next = "EXECUTE_WAIT" - # also return to Fetch, when no output was a vector - # (regardless of SRCSTEP and VL), or when the last - # instruction was really the last one of the VL loop - with m.Elif((~pdecode2.loop_continue) | is_last): - # before going back to fetch, update the PC state - # register with the NIA. - # ok here we are not reading the branch unit. - # TODO: this just blithely overwrites whatever - # pipeline updated the PC - comb += self.state_w_pc.wen.eq(1 << StateRegs.PC) - comb += self.state_w_pc.i_data.eq(nia) - # reset SRCSTEP before returning to Fetch - if self.svp64_en: - with m.If(pdecode2.loop_continue): - comb += new_svstate.srcstep.eq(0) - comb += new_svstate.dststep.eq(0) - comb += update_svstate.eq(1) - else: + with m.State("EXECUTE_WAIT"): + comb += exec_pc_i_ready.eq(1) + # see https://bugs.libre-soc.org/show_bug.cgi?id=636 + # the exception info needs to be blatted into + # pdecode.ldst_exc, and the instruction "re-run". + # when ldst_exc.happened is set, the PowerDecoder2 + # reacts very differently: it re-writes the instruction + # with a "trap" (calls PowerDecoder2.trap()) which + # will *overwrite* whatever was requested and jump the + # PC to the exception address, as well as alter MSR. + # nothing else needs to be done other than to note + # the change of PC and MSR (and, later, SVSTATE) + with m.If(exc_happened): + mmu = core.fus.get_exc("mmu0") + ldst = core.fus.get_exc("ldst0") + if mmu is not None: + with m.If(fetch_failed): + # instruction fetch: exception is from MMU + # reset instr_fault (highest priority) + sync += pdecode2.ldst_exc.eq(mmu) + sync += pdecode2.instr_fault.eq(0) + if flush_needed: + # request icache to stop asserting "failed" + comb += core.icache.flush_in.eq(1) + with m.If(~fetch_failed): + # otherwise assume it was a LDST exception + sync += pdecode2.ldst_exc.eq(ldst) + + with m.If(exec_pc_o_valid): + + # was this the last loop iteration? + is_last = Signal() + cur_vl = cur_state.svstate.vl + comb += is_last.eq(next_srcstep == cur_vl) + + with m.If(pdecode2.instr_fault): + # reset instruction fault, try again + sync += pdecode2.instr_fault.eq(0) + m.next = "ISSUE_START" + + # return directly to Decode if Execute generated an + # exception. + with m.Elif(pdecode2.ldst_exc.happened): + m.next = "DECODE_SV" + + # if MSR, PC or SVSTATE were changed by the previous + # instruction, go directly back to Fetch, without + # updating either MSR PC or SVSTATE + with m.Elif(self.msr_changed | self.pc_changed | + self.sv_changed): + m.next = "ISSUE_START" + + # also return to Fetch, when no output was a vector + # (regardless of SRCSTEP and VL), or when the last + # instruction was really the last one of the VL loop + with m.Elif((~pdecode2.loop_continue) | is_last): + # before going back to fetch, update the PC state + # register with the NIA. + # ok here we are not reading the branch unit. + # TODO: this just blithely overwrites whatever + # pipeline updated the PC + comb += self.state_w_pc.wen.eq(1 << StateRegs.PC) + comb += self.state_w_pc.i_data.eq(nia) + # reset SRCSTEP before returning to Fetch + if self.svp64_en: + with m.If(pdecode2.loop_continue): comb += new_svstate.srcstep.eq(0) comb += new_svstate.dststep.eq(0) - comb += update_svstate.eq(1) - m.next = "ISSUE_START" + comb += self.update_svstate.eq(1) + else: + comb += new_svstate.srcstep.eq(0) + comb += new_svstate.dststep.eq(0) + comb += self.update_svstate.eq(1) + m.next = "ISSUE_START" - # returning to Execute? then, first update SRCSTEP - with m.Else(): - comb += new_svstate.srcstep.eq(next_srcstep) - comb += new_svstate.dststep.eq(next_dststep) - comb += update_svstate.eq(1) - # return to mask skip loop - m.next = "PRED_SKIP" + # returning to Execute? then, first update SRCSTEP + with m.Else(): + comb += new_svstate.srcstep.eq(next_srcstep) + comb += new_svstate.dststep.eq(next_dststep) + comb += self.update_svstate.eq(1) + # return to mask skip loop + m.next = "PRED_SKIP" - with m.Else(): - comb += dbg.core_stopped_i.eq(1) - # while stopped, allow updating the PC and SVSTATE - with m.If(self.pc_i.ok): - comb += self.state_w_pc.wen.eq(1 << StateRegs.PC) - comb += self.state_w_pc.i_data.eq(self.pc_i.data) - sync += pc_changed.eq(1) - with m.If(self.svstate_i.ok): - comb += new_svstate.eq(self.svstate_i.data) - comb += update_svstate.eq(1) - sync += sv_changed.eq(1) # check if svstate needs updating: if so, write it to State Regfile - with m.If(update_svstate): - comb += self.state_w_sv.wen.eq(1 << StateRegs.SVSTATE) - comb += self.state_w_sv.i_data.eq(new_svstate) - sync += cur_state.svstate.eq(new_svstate) # for next clock + with m.If(self.update_svstate): + sync += cur_state.svstate.eq(self.new_svstate) # for next clock - def execute_fsm(self, m, core, pc_changed, sv_changed, + def execute_fsm(self, m, core, exec_insn_i_valid, exec_insn_o_ready, exec_pc_o_valid, exec_pc_i_ready): """execute FSM @@ -964,12 +1503,19 @@ class TestIssuerInternal(Elaboratable): comb = m.d.comb sync = m.d.sync + dbg = self.dbg pdecode2 = self.pdecode2 + cur_state = self.cur_state # temporaries core_busy_o = core.n.o_data.busy_o # core is busy core_ivalid_i = core.p.i_valid # instruction is valid + if hasattr(core, "icache"): + fetch_failed = core.icache.i_out.fetch_failed + else: + fetch_failed = Const(0, 1) + with m.FSM(name="exec_fsm"): # waiting for instruction bus (stays there until not busy) @@ -977,18 +1523,34 @@ class TestIssuerInternal(Elaboratable): comb += exec_insn_o_ready.eq(1) with m.If(exec_insn_i_valid): comb += core_ivalid_i.eq(1) # instruction is valid/issued - sync += sv_changed.eq(0) - sync += pc_changed.eq(0) + sync += self.sv_changed.eq(0) + sync += self.pc_changed.eq(0) + sync += self.msr_changed.eq(0) with m.If(core.p.o_ready): # only move if accepted m.next = "INSN_ACTIVE" # move to "wait completion" # instruction started: must wait till it finishes with m.State("INSN_ACTIVE"): - # note changes to PC and SVSTATE + # note changes to MSR, PC and SVSTATE with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)): - sync += sv_changed.eq(1) + sync += self.sv_changed.eq(1) + with m.If(self.state_nia.wen & (1 << StateRegs.MSR)): + sync += self.msr_changed.eq(1) with m.If(self.state_nia.wen & (1 << StateRegs.PC)): - sync += pc_changed.eq(1) + sync += self.pc_changed.eq(1) + # and note changes to DEC/TB, to be passed to DEC/TB FSM + with m.If(self.state_spr.wen & (1 << StateRegs.TB)): + comb += self.pause_dec_tb.eq(1) + # but also zero-out the cur_state DEC so that, on + # the next instruction, if it is "enable interrupt" + # the delay between the DEC/TB FSM reading and updating + # cur_state.dec doesn't trigger a spurious interrupt. + # the DEC/TB FSM will read the regfile and update to + # the correct value, so having cur_state.dec set to zero + # for a while is no big deal. + with m.If(self.state_spr.wen & (1 << StateRegs.DEC)): + comb += self.pause_dec_tb.eq(1) + sync += cur_state.dec.eq(0) # only needs top bit clear with m.If(~core_busy_o): # instruction done! comb += exec_pc_o_valid.eq(1) with m.If(exec_pc_i_ready): @@ -1002,104 +1564,17 @@ class TestIssuerInternal(Elaboratable): # if we erroneously indicate "done" here, it is as if # there were *TWO* instructions: # 1) the failed LDST 2) a TRAP. - with m.If(~pdecode2.ldst_exc.happened): + with m.If(~pdecode2.ldst_exc.happened & + ~pdecode2.instr_fault): comb += self.insn_done.eq(1) m.next = "INSN_START" # back to fetch - - def setup_peripherals(self, m): - comb, sync = m.d.comb, m.d.sync - - # okaaaay so the debug module must be in coresync clock domain - # but NOT its reset signal. to cope with this, set every single - # submodule explicitly in coresync domain, debug and JTAG - # in their own one but using *external* reset. - csd = DomainRenamer("coresync") - dbd = DomainRenamer(self.dbg_domain) - - m.submodules.core = core = csd(self.core) - m.submodules.imem = imem = csd(self.imem) - m.submodules.dbg = dbg = dbd(self.dbg) - if self.jtag_en: - m.submodules.jtag = jtag = dbd(self.jtag) - # TODO: UART2GDB mux, here, from external pin - # see https://bugs.libre-soc.org/show_bug.cgi?id=499 - sync += dbg.dmi.connect_to(jtag.dmi) - - cur_state = self.cur_state - - # 4x 4k SRAM blocks. these simply "exist", they get routed in litex - if self.sram4x4k: - for i, sram in enumerate(self.sram4k): - m.submodules["sram4k_%d" % i] = csd(sram) - comb += sram.enable.eq(self.wb_sram_en) - - # XICS interrupt handler - if self.xics: - m.submodules.xics_icp = icp = csd(self.xics_icp) - m.submodules.xics_ics = ics = csd(self.xics_ics) - comb += icp.ics_i.eq(ics.icp_o) # connect ICS to ICP - sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core - - # GPIO test peripheral - if self.gpio: - m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio) - - # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl) - # XXX causes litex ECP5 test to get wrong idea about input and output - # (but works with verilator sim *sigh*) - # if self.gpio and self.xics: - # comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0]) - - # instruction decoder - pdecode = create_pdecode() - m.submodules.dec2 = pdecode2 = csd(self.pdecode2) - if self.svp64_en: - m.submodules.svp64 = svp64 = csd(self.svp64) - - # convenience - dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer - intrf = self.core.regs.rf['int'] - - # clock delay power-on reset - cd_por = ClockDomain(reset_less=True) - cd_sync = ClockDomain() - core_sync = ClockDomain("coresync") - m.domains += cd_por, cd_sync, core_sync - if self.dbg_domain != "sync": - dbg_sync = ClockDomain(self.dbg_domain) - m.domains += dbg_sync - - ti_rst = Signal(reset_less=True) - delay = Signal(range(4), reset=3) - with m.If(delay != 0): - m.d.por += delay.eq(delay - 1) - comb += cd_por.clk.eq(ClockSignal()) - - # power-on reset delay - core_rst = ResetSignal("coresync") - comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal()) - comb += core_rst.eq(ti_rst) - - # debug clock is same as coresync, but reset is *main external* - if self.dbg_domain != "sync": - dbg_rst = ResetSignal(self.dbg_domain) - comb += dbg_rst.eq(ResetSignal()) - - # busy/halted signals from core - core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o # core is busy - comb += self.busy_o.eq(core_busy_o) - comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i) - - # temporary hack: says "go" immediately for both address gen and ST - l0 = core.l0 - ldst = core.fus.fus['ldst0'] - st_go_edge = rising_edge(m, ldst.st.rel_o) - # link addr-go direct to rel - m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) - m.d.comb += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel + # terminate returns directly to INSN_START + with m.If(dbg.terminate_i): + # comb += self.insn_done.eq(1) - no because it's not + m.next = "INSN_START" # back to fetch def elaborate(self, platform): - m = Module() + m = super().elaborate(platform) # convenience comb, sync = m.d.comb, m.d.sync cur_state = self.cur_state @@ -1109,43 +1584,17 @@ class TestIssuerInternal(Elaboratable): # set up peripherals and core core_rst = self.core_rst - self.setup_peripherals(m) - - # reset current state if core reset requested - with m.If(core_rst): - m.d.sync += self.cur_state.eq(0) - - # PC and instruction from I-Memory - comb += self.pc_o.eq(cur_state.pc) - pc_changed = Signal() # note write to PC - sv_changed = Signal() # note write to SVSTATE # indicate to outside world if any FU is still executing comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing - # read state either from incoming override or from regfile - # TODO: really should be doing MSR in the same way - pc = state_get(m, core_rst, self.pc_i, - "pc", # read PC - self.state_r_pc, StateRegs.PC) - svstate = state_get(m, core_rst, self.svstate_i, - "svstate", # read SVSTATE - self.state_r_sv, StateRegs.SVSTATE) - - # don't write pc every cycle - comb += self.state_w_pc.wen.eq(0) - comb += self.state_w_pc.i_data.eq(0) - # address of the next instruction, in the absence of a branch # depends on the instruction size nia = Signal(64) # connect up debug signals - # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o) - comb += dbg.terminate_i.eq(core.o.core_terminate_o) - comb += dbg.state.pc.eq(pc) - comb += dbg.state.svstate.eq(svstate) - comb += dbg.state.msr.eq(cur_state.msr) + with m.If(core.o.core_terminate_o): + comb += dbg.terminate_i.eq(1) # pass the prefix mode from Fetch to Issue, so the latter can loop # on VL==0 @@ -1188,20 +1637,11 @@ class TestIssuerInternal(Elaboratable): # Issue is where the VL for-loop # lives. the ready/valid # signalling is used to communicate between the four. - # set up Fetch FSM - fetch = FetchFSM(self.allow_overlap, self.svp64_en, - self.imem, core_rst, pdecode2, cur_state, - dbg, core, svstate, nia, is_svp64_mode) - m.submodules.fetch = fetch - # connect up in/out data to existing Signals - comb += fetch.p.i_data.pc.eq(pc) - # and the ready/valid signalling - comb += fetch_pc_o_ready.eq(fetch.p.o_ready) - comb += fetch.p.i_valid.eq(fetch_pc_i_valid) - comb += fetch_insn_o_valid.eq(fetch.n.o_valid) - comb += fetch.n.i_ready.eq(fetch_insn_i_ready) - - self.issue_fsm(m, core, pc_changed, sv_changed, nia, + self.fetch_fsm(m, dbg, core, core_rst, nia, is_svp64_mode, + fetch_pc_o_ready, fetch_pc_i_valid, + fetch_insn_o_valid, fetch_insn_i_ready) + + self.issue_fsm(m, core, nia, dbg, core_rst, is_svp64_mode, fetch_pc_o_ready, fetch_pc_i_valid, fetch_insn_o_valid, fetch_insn_i_ready, @@ -1215,168 +1655,25 @@ class TestIssuerInternal(Elaboratable): pred_insn_i_valid, pred_insn_o_ready, pred_mask_o_valid, pred_mask_i_ready) - self.execute_fsm(m, core, pc_changed, sv_changed, + self.execute_fsm(m, core, exec_insn_i_valid, exec_insn_o_ready, exec_pc_o_valid, exec_pc_i_ready) - # this bit doesn't have to be in the FSM: connect up to read - # regfiles on demand from DMI - self.do_dmi(m, dbg) - - # DEC and TB inc/dec FSM. copy of DEC is put into CoreState, - # (which uses that in PowerDecoder2 to raise 0x900 exception) - self.tb_dec_fsm(m, cur_state.dec) - - return m - - def do_dmi(self, m, dbg): - """deals with DMI debug requests - - currently only provides read requests for the INT regfile, CR and XER - it will later also deal with *writing* to these regfiles. - """ - comb = m.d.comb - sync = m.d.sync - dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer - intrf = self.core.regs.rf['int'] - - with m.If(d_reg.req): # request for regfile access being made - # TODO: error-check this - # XXX should this be combinatorial? sync better? - if intrf.unary: - comb += self.int_r.ren.eq(1 << d_reg.addr) - else: - comb += self.int_r.addr.eq(d_reg.addr) - comb += self.int_r.ren.eq(1) - d_reg_delay = Signal() - sync += d_reg_delay.eq(d_reg.req) - with m.If(d_reg_delay): - # data arrives one clock later - comb += d_reg.data.eq(self.int_r.o_data) - comb += d_reg.ack.eq(1) - - # sigh same thing for CR debug - with m.If(d_cr.req): # request for regfile access being made - comb += self.cr_r.ren.eq(0b11111111) # enable all - d_cr_delay = Signal() - sync += d_cr_delay.eq(d_cr.req) - with m.If(d_cr_delay): - # data arrives one clock later - comb += d_cr.data.eq(self.cr_r.o_data) - comb += d_cr.ack.eq(1) - - # aaand XER... - with m.If(d_xer.req): # request for regfile access being made - comb += self.xer_r.ren.eq(0b111111) # enable all - d_xer_delay = Signal() - sync += d_xer_delay.eq(d_xer.req) - with m.If(d_xer_delay): - # data arrives one clock later - comb += d_xer.data.eq(self.xer_r.o_data) - comb += d_xer.ack.eq(1) - - def tb_dec_fsm(self, m, spr_dec): - """tb_dec_fsm - - this is a FSM for updating either dec or tb. it runs alternately - DEC, TB, DEC, TB. note that SPR pipeline could have written a new - value to DEC, however the regfile has "passthrough" on it so this - *should* be ok. - - see v3.0B p1097-1099 for Timeer Resource and p1065 and p1076 - """ - - comb, sync = m.d.comb, m.d.sync - fast_rf = self.core.regs.rf['fast'] - fast_r_dectb = fast_rf.r_ports['issue'] # DEC/TB - fast_w_dectb = fast_rf.w_ports['issue'] # DEC/TB - - with m.FSM() as fsm: - - # initiates read of current DEC - with m.State("DEC_READ"): - comb += fast_r_dectb.addr.eq(FastRegs.DEC) - comb += fast_r_dectb.ren.eq(1) - m.next = "DEC_WRITE" - - # waits for DEC read to arrive (1 cycle), updates with new value - with m.State("DEC_WRITE"): - new_dec = Signal(64) - # TODO: MSR.LPCR 32-bit decrement mode - comb += new_dec.eq(fast_r_dectb.o_data - 1) - comb += fast_w_dectb.addr.eq(FastRegs.DEC) - comb += fast_w_dectb.wen.eq(1) - comb += fast_w_dectb.i_data.eq(new_dec) - sync += spr_dec.eq(new_dec) # copy into cur_state for decoder - m.next = "TB_READ" - - # initiates read of current TB - with m.State("TB_READ"): - comb += fast_r_dectb.addr.eq(FastRegs.TB) - comb += fast_r_dectb.ren.eq(1) - m.next = "TB_WRITE" - - # waits for read TB to arrive, initiates write of current TB - with m.State("TB_WRITE"): - new_tb = Signal(64) - comb += new_tb.eq(fast_r_dectb.o_data + 1) - comb += fast_w_dectb.addr.eq(FastRegs.TB) - comb += fast_w_dectb.wen.eq(1) - comb += fast_w_dectb.i_data.eq(new_tb) - m.next = "DEC_READ" + # whatever was done above, over-ride it if core reset is held. + # set NIA to pc_at_reset + with m.If(core_rst): + sync += nia.eq(self.core.pc_at_reset) return m - def __iter__(self): - yield from self.pc_i.ports() - yield self.pc_o - yield self.memerr_o - yield from self.core.ports() - yield from self.imem.ports() - yield self.core_bigendian_i - yield self.busy_o - - def ports(self): - return list(self) - - def external_ports(self): - ports = self.pc_i.ports() - ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o, - ] - - if self.jtag_en: - ports += list(self.jtag.external_ports()) - else: - # don't add DMI if JTAG is enabled - ports += list(self.dbg.dmi.ports()) - - ports += list(self.imem.ibus.fields.values()) - ports += list(self.core.l0.cmpi.wb_bus().fields.values()) - - if self.sram4x4k: - for sram in self.sram4k: - ports += list(sram.bus.fields.values()) - - if self.xics: - ports += list(self.xics_icp.bus.fields.values()) - ports += list(self.xics_ics.bus.fields.values()) - ports.append(self.int_level_i) - - if self.gpio: - ports += list(self.simple_gpio.bus.fields.values()) - ports.append(self.gpio_o) - - return ports - - def ports(self): - return list(self) - class TestIssuer(Elaboratable): def __init__(self, pspec): self.ti = TestIssuerInternal(pspec) self.pll = DummyPLL(instance=True) + self.dbg_rst_i = Signal(reset_less=True) + # PLL direct clock or not self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll if self.pll_en: @@ -1423,23 +1720,24 @@ class TestIssuer(Elaboratable): # internal clock is set to selector clock-out. has the side-effect of # running TestIssuer at this speed (see DomainRenamer("intclk") above) # debug clock runs at coresync internal clock - cd_coresync = ClockDomain("coresync") - #m.domains += cd_coresync if self.ti.dbg_domain != 'sync': cd_dbgsync = ClockDomain("dbgsync") - #m.domains += cd_dbgsync - intclk = ClockSignal("coresync") + intclk = ClockSignal(self.ti.core_domain) dbgclk = ClockSignal(self.ti.dbg_domain) # XXX BYPASS PLL XXX # XXX BYPASS PLL XXX # XXX BYPASS PLL XXX if self.pll_en: comb += intclk.eq(self.ref_clk) + assert self.ti.core_domain != 'sync', \ + "cannot set core_domain to sync and use pll at the same time" else: - comb += intclk.eq(ClockSignal()) + if self.ti.core_domain != 'sync': + comb += intclk.eq(ClockSignal()) if self.ti.dbg_domain != 'sync': dbgclk = ClockSignal(self.ti.dbg_domain) comb += dbgclk.eq(intclk) + comb += self.ti.dbg_rst_i.eq(self.dbg_rst_i) return m @@ -1470,7 +1768,7 @@ if __name__ == '__main__': } pspec = TestMemPspec(ldst_ifacetype='bare_wb', imem_ifacetype='bare_wb', - addr_wid=48, + addr_wid=64, mask_wid=8, reg_wid=64, units=units)