X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fsimple%2Fissuer.py;h=156fce3cdf79ddc786804e6342a49533b12f3397;hb=8c0a56c349b0d5650026e0ce9031272104cdc39a;hp=5763e37a2492f401d11bd1fe46a26833879229b6;hpb=49f620605cd0e78b8ec8ab67488c7e60ad673a23;p=soc.git diff --git a/src/soc/simple/issuer.py b/src/soc/simple/issuer.py index 5763e37a..156fce3c 100644 --- a/src/soc/simple/issuer.py +++ b/src/soc/simple/issuer.py @@ -33,7 +33,7 @@ from openpower.decoder.decode2execute1 import Data from openpower.decoder.power_enums import (MicrOp, SVP64PredInt, SVP64PredCR, SVP64PredMode) from openpower.state import CoreState -from openpower.consts import (CR, SVP64CROffs) +from openpower.consts import (CR, SVP64CROffs, MSR) from soc.experiment.testmem import TestMemory # test only for instructions from soc.regfile.regfiles import StateRegs, FastRegs from soc.simple.core import NonProductionCore @@ -63,11 +63,10 @@ def get_insn(f_instr_o, pc): # gets state input or reads from state regfile -def state_get(m, core_rst, state_i, name, regfile, regnum): +def state_get(m, res, core_rst, state_i, name, regfile, regnum): comb = m.d.comb sync = m.d.sync - # read the PC - res = Signal(64, reset_less=True, name=name) + # read the {insert state variable here} res_ok_delay = Signal(name="%s_ok_delay" % name) with m.If(~core_rst): sync += res_ok_delay.eq(~state_i.ok) @@ -75,12 +74,11 @@ def state_get(m, core_rst, state_i, name, regfile, regnum): # incoming override (start from pc_i) comb += res.eq(state_i.data) with m.Else(): - # otherwise read StateRegs regfile for PC... + # otherwise read StateRegs regfile for {insert state here}... comb += regfile.ren.eq(1 << regnum) # ... but on a 1-clock delay with m.If(res_ok_delay): comb += res.eq(regfile.o_data) - return res def get_predint(m, mask, name): @@ -158,208 +156,11 @@ def get_predcr(m, mask, name): return idx, invert -# Fetch Finite State Machine. -# WARNING: there are currently DriverConflicts but it's actually working. -# TODO, here: everything that is global in nature, information from the -# main TestIssuerInternal, needs to move to either ispec() or ospec(). -# not only that: TestIssuerInternal.imem can entirely move into here -# because imem is only ever accessed inside the FetchFSM. -class FetchFSM(ControlBase): - def __init__(self, allow_overlap, svp64_en, imem, core_rst, - pdecode2, cur_state, - dbg, core, svstate, nia, is_svp64_mode): - self.allow_overlap = allow_overlap - self.svp64_en = svp64_en - self.imem = imem - self.core_rst = core_rst - self.pdecode2 = pdecode2 - self.cur_state = cur_state - self.dbg = dbg - self.core = core - self.svstate = svstate - self.nia = nia - self.is_svp64_mode = is_svp64_mode - - # set up pipeline ControlBase and allocate i/o specs - # (unusual: normally done by the Pipeline API) - super().__init__(stage=self) - self.p.i_data, self.n.o_data = self.new_specs(None) - self.i, self.o = self.p.i_data, self.n.o_data - - # next 3 functions are Stage API Compliance - def setup(self, m, i): - pass - - def ispec(self): - return FetchInput() - - def ospec(self): - return FetchOutput() - - def elaborate(self, platform): - """fetch FSM - - this FSM performs fetch of raw instruction data, partial-decodes - it 32-bit at a time to detect SVP64 prefixes, and will optionally - read a 2nd 32-bit quantity if that occurs. 
- """ - m = super().elaborate(platform) - - dbg = self.dbg - core = self.core - pc = self.i.pc - svstate = self.svstate - nia = self.nia - is_svp64_mode = self.is_svp64_mode - fetch_pc_o_ready = self.p.o_ready - fetch_pc_i_valid = self.p.i_valid - fetch_insn_o_valid = self.n.o_valid - fetch_insn_i_ready = self.n.i_ready - - comb = m.d.comb - sync = m.d.sync - pdecode2 = self.pdecode2 - cur_state = self.cur_state - dec_opcode_o = pdecode2.dec.raw_opcode_in # raw opcode - - msr_read = Signal(reset=1) - - # also note instruction fetch failed - if hasattr(core, "icache"): - fetch_failed = core.icache.i_out.fetch_failed - flush_needed = True - else: - fetch_failed = Const(0, 1) - flush_needed = False - - # don't read msr every cycle - staterf = self.core.regs.rf['state'] - state_r_msr = staterf.r_ports['msr'] # MSR rd - - comb += state_r_msr.ren.eq(0) - - with m.FSM(name='fetch_fsm'): - - # waiting (zzz) - with m.State("IDLE"): - with m.If(~dbg.stopping_o & ~fetch_failed): - comb += fetch_pc_o_ready.eq(1) - with m.If(fetch_pc_i_valid & ~fetch_failed): - # instruction allowed to go: start by reading the PC - # capture the PC and also drop it into Insn Memory - # we have joined a pair of combinatorial memory - # lookups together. this is Generally Bad. - comb += self.imem.a_pc_i.eq(pc) - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - sync += cur_state.pc.eq(pc) - sync += cur_state.svstate.eq(svstate) # and svstate - - # initiate read of MSR. arrives one clock later - comb += state_r_msr.ren.eq(1 << StateRegs.MSR) - sync += msr_read.eq(0) - - m.next = "INSN_READ" # move to "wait for bus" phase - - # dummy pause to find out why simulation is not keeping up - with m.State("INSN_READ"): - if self.allow_overlap: - stopping = dbg.stopping_o - else: - stopping = Const(0) - with m.If(stopping): - # stopping: jump back to idle - m.next = "IDLE" - with m.Else(): - # one cycle later, msr/sv read arrives. valid only once. - with m.If(~msr_read): - sync += msr_read.eq(1) # yeah don't read it again - sync += cur_state.msr.eq(state_r_msr.o_data) - with m.If(self.imem.f_busy_o & ~fetch_failed): # zzz... 
- # busy but not fetch failed: stay in wait-read - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - with m.Else(): - # not busy (or fetch failed!): instruction fetched - # when fetch failed, the instruction gets ignored - # by the decoder - insn = get_insn(self.imem.f_instr_o, cur_state.pc) - if self.svp64_en: - svp64 = self.svp64 - # decode the SVP64 prefix, if any - comb += svp64.raw_opcode_in.eq(insn) - comb += svp64.bigendian.eq(self.core_bigendian_i) - # pass the decoded prefix (if any) to PowerDecoder2 - sync += pdecode2.sv_rm.eq(svp64.svp64_rm) - sync += pdecode2.is_svp64_mode.eq(is_svp64_mode) - # remember whether this is a prefixed instruction, - # so the FSM can readily loop when VL==0 - sync += is_svp64_mode.eq(svp64.is_svp64_mode) - # calculate the address of the following instruction - insn_size = Mux(svp64.is_svp64_mode, 8, 4) - sync += nia.eq(cur_state.pc + insn_size) - with m.If(~svp64.is_svp64_mode): - # with no prefix, store the instruction - # and hand it directly to the next FSM - sync += dec_opcode_o.eq(insn) - m.next = "INSN_READY" - with m.Else(): - # fetch the rest of the instruction from memory - comb += self.imem.a_pc_i.eq(cur_state.pc + 4) - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - m.next = "INSN_READ2" - else: - # not SVP64 - 32-bit only - sync += nia.eq(cur_state.pc + 4) - sync += dec_opcode_o.eq(insn) - m.next = "INSN_READY" - - with m.State("INSN_READ2"): - with m.If(self.imem.f_busy_o): # zzz... - # busy: stay in wait-read - comb += self.imem.a_i_valid.eq(1) - comb += self.imem.f_i_valid.eq(1) - with m.Else(): - # not busy: instruction fetched - insn = get_insn(self.imem.f_instr_o, cur_state.pc+4) - sync += dec_opcode_o.eq(insn) - m.next = "INSN_READY" - # TODO: probably can start looking at pdecode2.rm_dec - # here or maybe even in INSN_READ state, if svp64_mode - # detected, in order to trigger - and wait for - the - # predicate reading. - if self.svp64_en: - pmode = pdecode2.rm_dec.predmode - """ - if pmode != SVP64PredMode.ALWAYS.value: - fire predicate loading FSM and wait before - moving to INSN_READY - else: - sync += self.srcmask.eq(-1) # set to all 1s - sync += self.dstmask.eq(-1) # set to all 1s - m.next = "INSN_READY" - """ - - with m.State("INSN_READY"): - # hand over the instruction, to be decoded - comb += fetch_insn_o_valid.eq(1) - with m.If(fetch_insn_i_ready): - m.next = "IDLE" - - # whatever was done above, over-ride it if core reset is held - with m.If(self.core_rst): - sync += nia.eq(0) - - return m - +class TestIssuerBase(Elaboratable): + """TestIssuerBase - common base class for Issuers -class TestIssuerInternal(Elaboratable): - """TestIssuer - reads instructions from TestMemory and issues them - - efficiency and speed is not the main goal here: functional correctness - and code clarity is. optimisations (which almost 100% interfere with - easy understanding) come later. + takes care of power-on reset, peripherals, debug, DEC/TB, + and gets PC/MSR/SVSTATE from the State Regfile etc. """ def __init__(self, pspec): @@ -375,12 +176,18 @@ class TestIssuerInternal(Elaboratable): self.allow_overlap = (hasattr(pspec, "allow_overlap") and (pspec.allow_overlap == True)) + # and get the core domain + self.core_domain = "coresync" + if (hasattr(pspec, "core_domain") and + isinstance(pspec.core_domain, str)): + self.core_domain = pspec.core_domain + # JTAG interface. 
add this right at the start because if it's # added it *modifies* the pspec, by adding enable/disable signals # for parts of the rest of the core self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag' - self.dbg_domain = "sync" # sigh "dbgsunc" too problematic - # self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock + #self.dbg_domain = "sync" # sigh "dbgsunc" too problematic + self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock if self.jtag_en: # XXX MUST keep this up-to-date with litex, and # soc-cocotb-sim, and err.. all needs sorting out, argh @@ -428,7 +235,7 @@ class TestIssuerInternal(Elaboratable): # main instruction core. suitable for prototyping / demo only self.core = core = NonProductionCore(pspec) - self.core_rst = ResetSignal("coresync") + self.core_rst = ResetSignal(self.core_domain) # instruction decoder. goes into Trap Record #pdecode = create_pdecode() @@ -442,6 +249,9 @@ class TestIssuerInternal(Elaboratable): if self.svp64_en: self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix + self.update_svstate = Signal() # set this if updating svstate + self.new_svstate = new_svstate = SVSTATERec("new_svstate") + # Test Instruction memory if hasattr(core, "icache"): # XXX BLECH! use pspec to transfer the I-Cache to ConfigFetchUnit @@ -451,10 +261,12 @@ class TestIssuerInternal(Elaboratable): # DMI interface self.dbg = CoreDebug() + self.dbg_rst_i = Signal(reset_less=True) # instruction go/monitor self.pc_o = Signal(64, reset_less=True) self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me" + self.msr_i = Data(64, "msr_i") # set "ok" to indicate "please change me" self.svstate_i = Data(64, "svstate_i") # ditto self.core_bigendian_i = Signal() # TODO: set based on MSR.LE self.busy_o = Signal(reset_less=True) @@ -462,9 +274,12 @@ class TestIssuerInternal(Elaboratable): # STATE regfile read /write ports for PC, MSR, SVSTATE staterf = self.core.regs.rf['state'] + self.state_r_msr = staterf.r_ports['msr'] # MSR rd self.state_r_pc = staterf.r_ports['cia'] # PC rd - self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd + + self.state_w_msr = staterf.w_ports['msr'] # MSR wr + self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr # DMI interface access @@ -495,48 +310,578 @@ class TestIssuerInternal(Elaboratable): self.srcmask = Signal(64) self.dstmask = Signal(64) - def fetch_predicate_fsm(self, m, - pred_insn_i_valid, pred_insn_o_ready, - pred_mask_o_valid, pred_mask_i_ready): - """fetch_predicate_fsm - obtains (constructs in the case of CR) - src/dest predicate masks + def setup_peripherals(self, m): + comb, sync = m.d.comb, m.d.sync - https://bugs.libre-soc.org/show_bug.cgi?id=617 - the predicates can be read here, by using IntRegs r_ports['pred'] - or CRRegs r_ports['pred']. in the case of CRs it will have to - be done through multiple reads, extracting one relevant at a time. - later, a faster way would be to use the 32-bit-wide CR port but - this is more complex decoding, here. equivalent code used in - ISACaller is "from openpower.decoder.isa.caller import get_predcr" + # okaaaay so the debug module must be in coresync clock domain + # but NOT its reset signal. to cope with this, set every single + # submodule explicitly in coresync domain, debug and JTAG + # in their own one but using *external* reset. 
+ csd = DomainRenamer(self.core_domain) + dbd = DomainRenamer(self.dbg_domain) + + m.submodules.core = core = csd(self.core) + # this _so_ needs sorting out. ICache is added down inside + # LoadStore1 and is already a submodule of LoadStore1 + if not isinstance(self.imem, ICache): + m.submodules.imem = imem = csd(self.imem) + m.submodules.dbg = dbg = dbd(self.dbg) + if self.jtag_en: + m.submodules.jtag = jtag = dbd(self.jtag) + # TODO: UART2GDB mux, here, from external pin + # see https://bugs.libre-soc.org/show_bug.cgi?id=499 + sync += dbg.dmi.connect_to(jtag.dmi) - note: this ENTIRE FSM is not to be called when svp64 is disabled - """ - comb = m.d.comb - sync = m.d.sync - pdecode2 = self.pdecode2 - rm_dec = pdecode2.rm_dec # SVP64RMModeDecode - predmode = rm_dec.predmode - srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred - cr_pred, int_pred = self.cr_pred, self.int_pred # read regfiles - # get src/dst step, so we can skip already used mask bits cur_state = self.cur_state - srcstep = cur_state.svstate.srcstep - dststep = cur_state.svstate.dststep - cur_vl = cur_state.svstate.vl - # decode predicates - sregread, sinvert, sunary, sall1s = get_predint(m, srcpred, 's') - dregread, dinvert, dunary, dall1s = get_predint(m, dstpred, 'd') - sidx, scrinvert = get_predcr(m, srcpred, 's') - didx, dcrinvert = get_predcr(m, dstpred, 'd') + # 4x 4k SRAM blocks. these simply "exist", they get routed in litex + if self.sram4x4k: + for i, sram in enumerate(self.sram4k): + m.submodules["sram4k_%d" % i] = csd(sram) + comb += sram.enable.eq(self.wb_sram_en) - # store fetched masks, for either intpred or crpred - # when src/dst step is not zero, the skipped mask bits need to be - # shifted-out, before actually storing them in src/dest mask - new_srcmask = Signal(64, reset_less=True) - new_dstmask = Signal(64, reset_less=True) + # XICS interrupt handler + if self.xics: + m.submodules.xics_icp = icp = csd(self.xics_icp) + m.submodules.xics_ics = ics = csd(self.xics_ics) + comb += icp.ics_i.eq(ics.icp_o) # connect ICS to ICP + sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core - with m.FSM(name="fetch_predicate"): + # GPIO test peripheral + if self.gpio: + m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio) + + # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl) + # XXX causes litex ECP5 test to get wrong idea about input and output + # (but works with verilator sim *sigh*) + # if self.gpio and self.xics: + # comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0]) + + # instruction decoder + pdecode = create_pdecode() + m.submodules.dec2 = pdecode2 = csd(self.pdecode2) + if self.svp64_en: + m.submodules.svp64 = svp64 = csd(self.svp64) + + # convenience + dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer + intrf = self.core.regs.rf['int'] + + # clock delay power-on reset + cd_por = ClockDomain(reset_less=True) + cd_sync = ClockDomain() + m.domains += cd_por, cd_sync + core_sync = ClockDomain(self.core_domain) + if self.core_domain != "sync": + m.domains += core_sync + if self.dbg_domain != "sync": + dbg_sync = ClockDomain(self.dbg_domain) + m.domains += dbg_sync + + ti_rst = Signal(reset_less=True) + delay = Signal(range(4), reset=3) + with m.If(delay != 0): + m.d.por += delay.eq(delay - 1) + comb += cd_por.clk.eq(ClockSignal()) + + # power-on reset delay + core_rst = ResetSignal(self.core_domain) + if self.core_domain != "sync": + comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal()) + comb += core_rst.eq(ti_rst) + else: + with m.If(delay != 0 | 
dbg.core_rst_o): + comb += core_rst.eq(1) + + # connect external reset signal to DMI Reset + if self.dbg_domain != "sync": + dbg_rst = ResetSignal(self.dbg_domain) + comb += dbg_rst.eq(self.dbg_rst_i) + + # busy/halted signals from core + core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o # core is busy + comb += self.busy_o.eq(core_busy_o) + comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i) + + # temporary hack: says "go" immediately for both address gen and ST + l0 = core.l0 + ldst = core.fus.fus['ldst0'] + st_go_edge = rising_edge(m, ldst.st.rel_o) + # link addr-go direct to rel + m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) + m.d.comb += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel + + def do_dmi(self, m, dbg): + """deals with DMI debug requests + + currently only provides read requests for the INT regfile, CR and XER + it will later also deal with *writing* to these regfiles. + """ + comb = m.d.comb + sync = m.d.sync + dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer + intrf = self.core.regs.rf['int'] + + with m.If(d_reg.req): # request for regfile access being made + # TODO: error-check this + # XXX should this be combinatorial? sync better? + if intrf.unary: + comb += self.int_r.ren.eq(1 << d_reg.addr) + else: + comb += self.int_r.addr.eq(d_reg.addr) + comb += self.int_r.ren.eq(1) + d_reg_delay = Signal() + sync += d_reg_delay.eq(d_reg.req) + with m.If(d_reg_delay): + # data arrives one clock later + comb += d_reg.data.eq(self.int_r.o_data) + comb += d_reg.ack.eq(1) + + # sigh same thing for CR debug + with m.If(d_cr.req): # request for regfile access being made + comb += self.cr_r.ren.eq(0b11111111) # enable all + d_cr_delay = Signal() + sync += d_cr_delay.eq(d_cr.req) + with m.If(d_cr_delay): + # data arrives one clock later + comb += d_cr.data.eq(self.cr_r.o_data) + comb += d_cr.ack.eq(1) + + # aaand XER... + with m.If(d_xer.req): # request for regfile access being made + comb += self.xer_r.ren.eq(0b111111) # enable all + d_xer_delay = Signal() + sync += d_xer_delay.eq(d_xer.req) + with m.If(d_xer_delay): + # data arrives one clock later + comb += d_xer.data.eq(self.xer_r.o_data) + comb += d_xer.ack.eq(1) + + def tb_dec_fsm(self, m, spr_dec): + """tb_dec_fsm + + this is a FSM for updating either dec or tb. it runs alternately + DEC, TB, DEC, TB. note that SPR pipeline could have written a new + value to DEC, however the regfile has "passthrough" on it so this + *should* be ok. 
+ + see v3.0B p1097-1099 for Timeer Resource and p1065 and p1076 + """ + + comb, sync = m.d.comb, m.d.sync + fast_rf = self.core.regs.rf['fast'] + fast_r_dectb = fast_rf.r_ports['issue'] # DEC/TB + fast_w_dectb = fast_rf.w_ports['issue'] # DEC/TB + + with m.FSM() as fsm: + + # initiates read of current DEC + with m.State("DEC_READ"): + comb += fast_r_dectb.addr.eq(FastRegs.DEC) + comb += fast_r_dectb.ren.eq(1) + m.next = "DEC_WRITE" + + # waits for DEC read to arrive (1 cycle), updates with new value + with m.State("DEC_WRITE"): + new_dec = Signal(64) + # TODO: MSR.LPCR 32-bit decrement mode + comb += new_dec.eq(fast_r_dectb.o_data - 1) + comb += fast_w_dectb.addr.eq(FastRegs.DEC) + comb += fast_w_dectb.wen.eq(1) + comb += fast_w_dectb.i_data.eq(new_dec) + sync += spr_dec.eq(new_dec) # copy into cur_state for decoder + m.next = "TB_READ" + + # initiates read of current TB + with m.State("TB_READ"): + comb += fast_r_dectb.addr.eq(FastRegs.TB) + comb += fast_r_dectb.ren.eq(1) + m.next = "TB_WRITE" + + # waits for read TB to arrive, initiates write of current TB + with m.State("TB_WRITE"): + new_tb = Signal(64) + comb += new_tb.eq(fast_r_dectb.o_data + 1) + comb += fast_w_dectb.addr.eq(FastRegs.TB) + comb += fast_w_dectb.wen.eq(1) + comb += fast_w_dectb.i_data.eq(new_tb) + m.next = "DEC_READ" + + return m + + def elaborate(self, platform): + m = Module() + # convenience + comb, sync = m.d.comb, m.d.sync + cur_state = self.cur_state + pdecode2 = self.pdecode2 + dbg = self.dbg + + # set up peripherals and core + core_rst = self.core_rst + self.setup_peripherals(m) + + # reset current state if core reset requested + with m.If(core_rst): + m.d.sync += self.cur_state.eq(0) + + # check halted condition: requested PC to execute matches DMI stop addr + # and immediately stop. address of 0xffff_ffff_ffff_ffff can never + # match + halted = Signal() + comb += halted.eq(dbg.stop_addr_o == dbg.state.pc) + with m.If(halted): + comb += dbg.core_stopped_i.eq(1) + comb += dbg.terminate_i.eq(1) + + # PC and instruction from I-Memory + comb += self.pc_o.eq(cur_state.pc) + self.pc_changed = Signal() # note write to PC + self.msr_changed = Signal() # note write to MSR + self.sv_changed = Signal() # note write to SVSTATE + + # read state either from incoming override or from regfile + state = CoreState("get") # current state (MSR/PC/SVSTATE) + state_get(m, state.msr, core_rst, self.msr_i, + "msr", # read MSR + self.state_r_msr, StateRegs.MSR) + state_get(m, state.pc, core_rst, self.pc_i, + "pc", # read PC + self.state_r_pc, StateRegs.PC) + state_get(m, state.svstate, core_rst, self.svstate_i, + "svstate", # read SVSTATE + self.state_r_sv, StateRegs.SVSTATE) + + # don't write pc every cycle + comb += self.state_w_pc.wen.eq(0) + comb += self.state_w_pc.i_data.eq(0) + + # connect up debug state. note "combinatorially same" below, + # this is a bit naff, passing state over in the dbg class, but + # because it is combinatorial it achieves the desired goal + comb += dbg.state.eq(state) + + # this bit doesn't have to be in the FSM: connect up to read + # regfiles on demand from DMI + self.do_dmi(m, dbg) + + # DEC and TB inc/dec FSM. copy of DEC is put into CoreState, + # (which uses that in PowerDecoder2 to raise 0x900 exception) + self.tb_dec_fsm(m, cur_state.dec) + + # while stopped, allow updating the MSR, PC and SVSTATE. 
+ # these are mainly for debugging purposes (including DMI/JTAG) + with m.If(dbg.core_stopped_i): + with m.If(self.pc_i.ok): + comb += self.state_w_pc.wen.eq(1 << StateRegs.PC) + comb += self.state_w_pc.i_data.eq(self.pc_i.data) + sync += self.pc_changed.eq(1) + with m.If(self.msr_i.ok): + comb += self.state_w_msr.wen.eq(1 << StateRegs.MSR) + comb += self.state_w_msr.i_data.eq(self.msr_i.data) + sync += self.msr_changed.eq(1) + with m.If(self.svstate_i.ok | self.update_svstate): + with m.If(self.svstate_i.ok): # over-ride from external source + comb += self.new_svstate.eq(self.svstate_i.data) + comb += self.state_w_sv.wen.eq(1 << StateRegs.SVSTATE) + comb += self.state_w_sv.i_data.eq(self.new_svstate) + sync += self.sv_changed.eq(1) + + return m + + def __iter__(self): + yield from self.pc_i.ports() + yield from self.msr_i.ports() + yield self.pc_o + yield self.memerr_o + yield from self.core.ports() + yield from self.imem.ports() + yield self.core_bigendian_i + yield self.busy_o + + def ports(self): + return list(self) + + def external_ports(self): + ports = self.pc_i.ports() + ports = self.msr_i.ports() + ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o, + ] + + if self.jtag_en: + ports += list(self.jtag.external_ports()) + else: + # don't add DMI if JTAG is enabled + ports += list(self.dbg.dmi.ports()) + + ports += list(self.imem.ibus.fields.values()) + ports += list(self.core.l0.cmpi.wb_bus().fields.values()) + + if self.sram4x4k: + for sram in self.sram4k: + ports += list(sram.bus.fields.values()) + + if self.xics: + ports += list(self.xics_icp.bus.fields.values()) + ports += list(self.xics_ics.bus.fields.values()) + ports.append(self.int_level_i) + + if self.gpio: + ports += list(self.simple_gpio.bus.fields.values()) + ports.append(self.gpio_o) + + return ports + + def ports(self): + return list(self) + + + +# Fetch Finite State Machine. +# WARNING: there are currently DriverConflicts but it's actually working. +# TODO, here: everything that is global in nature, information from the +# main TestIssuerInternal, needs to move to either ispec() or ospec(). +# not only that: TestIssuerInternal.imem can entirely move into here +# because imem is only ever accessed inside the FetchFSM. +class FetchFSM(ControlBase): + def __init__(self, allow_overlap, svp64_en, imem, core_rst, + pdecode2, cur_state, + dbg, core, svstate, nia, is_svp64_mode): + self.allow_overlap = allow_overlap + self.svp64_en = svp64_en + self.imem = imem + self.core_rst = core_rst + self.pdecode2 = pdecode2 + self.cur_state = cur_state + self.dbg = dbg + self.core = core + self.svstate = svstate + self.nia = nia + self.is_svp64_mode = is_svp64_mode + + # set up pipeline ControlBase and allocate i/o specs + # (unusual: normally done by the Pipeline API) + super().__init__(stage=self) + self.p.i_data, self.n.o_data = self.new_specs(None) + self.i, self.o = self.p.i_data, self.n.o_data + + # next 3 functions are Stage API Compliance + def setup(self, m, i): + pass + + def ispec(self): + return FetchInput() + + def ospec(self): + return FetchOutput() + + def elaborate(self, platform): + """fetch FSM + + this FSM performs fetch of raw instruction data, partial-decodes + it 32-bit at a time to detect SVP64 prefixes, and will optionally + read a 2nd 32-bit quantity if that occurs. 
+ """ + m = super().elaborate(platform) + + dbg = self.dbg + core = self.core + pc = self.i.pc + msr = self.i.msr + svstate = self.svstate + nia = self.nia + is_svp64_mode = self.is_svp64_mode + fetch_pc_o_ready = self.p.o_ready + fetch_pc_i_valid = self.p.i_valid + fetch_insn_o_valid = self.n.o_valid + fetch_insn_i_ready = self.n.i_ready + + comb = m.d.comb + sync = m.d.sync + pdecode2 = self.pdecode2 + cur_state = self.cur_state + dec_opcode_o = pdecode2.dec.raw_opcode_in # raw opcode + + # also note instruction fetch failed + if hasattr(core, "icache"): + fetch_failed = core.icache.i_out.fetch_failed + flush_needed = True + else: + fetch_failed = Const(0, 1) + flush_needed = False + + # set priv / virt mode on I-Cache, sigh + if isinstance(self.imem, ICache): + comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR]) + comb += self.imem.i_in.virt_mode.eq(msr[MSR.DR]) + + with m.FSM(name='fetch_fsm'): + + # waiting (zzz) + with m.State("IDLE"): + with m.If(~dbg.stopping_o & ~fetch_failed & ~dbg.core_stop_o): + comb += fetch_pc_o_ready.eq(1) + with m.If(fetch_pc_i_valid & ~pdecode2.instr_fault + & ~dbg.core_stop_o): + # instruction allowed to go: start by reading the PC + # capture the PC and also drop it into Insn Memory + # we have joined a pair of combinatorial memory + # lookups together. this is Generally Bad. + comb += self.imem.a_pc_i.eq(pc) + comb += self.imem.a_i_valid.eq(1) + comb += self.imem.f_i_valid.eq(1) + # transfer state to output + sync += cur_state.pc.eq(pc) + sync += cur_state.svstate.eq(svstate) # and svstate + sync += cur_state.msr.eq(msr) # and msr + + m.next = "INSN_READ" # move to "wait for bus" phase + + # dummy pause to find out why simulation is not keeping up + with m.State("INSN_READ"): + if self.allow_overlap: + stopping = dbg.stopping_o + else: + stopping = Const(0) + with m.If(stopping): + # stopping: jump back to idle + m.next = "IDLE" + with m.Else(): + with m.If(self.imem.f_busy_o & + ~pdecode2.instr_fault): # zzz... 
+ # busy but not fetch failed: stay in wait-read + comb += self.imem.a_i_valid.eq(1) + comb += self.imem.f_i_valid.eq(1) + with m.Else(): + # not busy (or fetch failed!): instruction fetched + # when fetch failed, the instruction gets ignored + # by the decoder + if hasattr(core, "icache"): + # blech, icache returns actual instruction + insn = self.imem.f_instr_o + else: + # but these return raw memory + insn = get_insn(self.imem.f_instr_o, cur_state.pc) + if self.svp64_en: + svp64 = self.svp64 + # decode the SVP64 prefix, if any + comb += svp64.raw_opcode_in.eq(insn) + comb += svp64.bigendian.eq(self.core_bigendian_i) + # pass the decoded prefix (if any) to PowerDecoder2 + sync += pdecode2.sv_rm.eq(svp64.svp64_rm) + sync += pdecode2.is_svp64_mode.eq(is_svp64_mode) + # remember whether this is a prefixed instruction, + # so the FSM can readily loop when VL==0 + sync += is_svp64_mode.eq(svp64.is_svp64_mode) + # calculate the address of the following instruction + insn_size = Mux(svp64.is_svp64_mode, 8, 4) + sync += nia.eq(cur_state.pc + insn_size) + with m.If(~svp64.is_svp64_mode): + # with no prefix, store the instruction + # and hand it directly to the next FSM + sync += dec_opcode_o.eq(insn) + m.next = "INSN_READY" + with m.Else(): + # fetch the rest of the instruction from memory + comb += self.imem.a_pc_i.eq(cur_state.pc + 4) + comb += self.imem.a_i_valid.eq(1) + comb += self.imem.f_i_valid.eq(1) + m.next = "INSN_READ2" + else: + # not SVP64 - 32-bit only + sync += nia.eq(cur_state.pc + 4) + sync += dec_opcode_o.eq(insn) + m.next = "INSN_READY" + + with m.State("INSN_READ2"): + with m.If(self.imem.f_busy_o): # zzz... + # busy: stay in wait-read + comb += self.imem.a_i_valid.eq(1) + comb += self.imem.f_i_valid.eq(1) + with m.Else(): + # not busy: instruction fetched + insn = get_insn(self.imem.f_instr_o, cur_state.pc+4) + sync += dec_opcode_o.eq(insn) + m.next = "INSN_READY" + # TODO: probably can start looking at pdecode2.rm_dec + # here or maybe even in INSN_READ state, if svp64_mode + # detected, in order to trigger - and wait for - the + # predicate reading. + if self.svp64_en: + pmode = pdecode2.rm_dec.predmode + """ + if pmode != SVP64PredMode.ALWAYS.value: + fire predicate loading FSM and wait before + moving to INSN_READY + else: + sync += self.srcmask.eq(-1) # set to all 1s + sync += self.dstmask.eq(-1) # set to all 1s + m.next = "INSN_READY" + """ + + with m.State("INSN_READY"): + # hand over the instruction, to be decoded + comb += fetch_insn_o_valid.eq(1) + with m.If(fetch_insn_i_ready): + m.next = "IDLE" + + # whatever was done above, over-ride it if core reset is held + with m.If(self.core_rst): + sync += nia.eq(0) + + return m + + +class TestIssuerInternal(TestIssuerBase): + """TestIssuer - reads instructions from TestMemory and issues them + + efficiency and speed is not the main goal here: functional correctness + and code clarity is. optimisations (which almost 100% interfere with + easy understanding) come later. + """ + + def fetch_predicate_fsm(self, m, + pred_insn_i_valid, pred_insn_o_ready, + pred_mask_o_valid, pred_mask_i_ready): + """fetch_predicate_fsm - obtains (constructs in the case of CR) + src/dest predicate masks + + https://bugs.libre-soc.org/show_bug.cgi?id=617 + the predicates can be read here, by using IntRegs r_ports['pred'] + or CRRegs r_ports['pred']. in the case of CRs it will have to + be done through multiple reads, extracting one relevant at a time. 
+ later, a faster way would be to use the 32-bit-wide CR port but + this is more complex decoding, here. equivalent code used in + ISACaller is "from openpower.decoder.isa.caller import get_predcr" + + note: this ENTIRE FSM is not to be called when svp64 is disabled + """ + comb = m.d.comb + sync = m.d.sync + pdecode2 = self.pdecode2 + rm_dec = pdecode2.rm_dec # SVP64RMModeDecode + predmode = rm_dec.predmode + srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred + cr_pred, int_pred = self.cr_pred, self.int_pred # read regfiles + # get src/dst step, so we can skip already used mask bits + cur_state = self.cur_state + srcstep = cur_state.svstate.srcstep + dststep = cur_state.svstate.dststep + cur_vl = cur_state.svstate.vl + + # decode predicates + sregread, sinvert, sunary, sall1s = get_predint(m, srcpred, 's') + dregread, dinvert, dunary, dall1s = get_predint(m, dstpred, 'd') + sidx, scrinvert = get_predcr(m, srcpred, 's') + didx, dcrinvert = get_predcr(m, dstpred, 'd') + + # store fetched masks, for either intpred or crpred + # when src/dst step is not zero, the skipped mask bits need to be + # shifted-out, before actually storing them in src/dest mask + new_srcmask = Signal(64, reset_less=True) + new_dstmask = Signal(64, reset_less=True) + + with m.FSM(name="fetch_predicate"): with m.State("FETCH_PRED_IDLE"): comb += pred_insn_o_ready.eq(1) @@ -664,7 +1009,7 @@ class TestIssuerInternal(Elaboratable): with m.If(pred_mask_i_ready): m.next = "FETCH_PRED_IDLE" - def issue_fsm(self, m, core, pc_changed, sv_changed, nia, + def issue_fsm(self, m, core, nia, dbg, core_rst, is_svp64_mode, fetch_pc_o_ready, fetch_pc_i_valid, fetch_insn_o_valid, fetch_insn_i_ready, @@ -687,13 +1032,12 @@ class TestIssuerInternal(Elaboratable): sync = m.d.sync pdecode2 = self.pdecode2 cur_state = self.cur_state + new_svstate = self.new_svstate # temporaries dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode # for updating svstate (things like srcstep etc.) 
- update_svstate = Signal() # set this (below) if updating - new_svstate = SVSTATERec("new_svstate") comb += new_svstate.eq(cur_state.svstate) # precalculate srcstep+1 and dststep+1 @@ -737,15 +1081,11 @@ class TestIssuerInternal(Elaboratable): with m.Else(): # tell core it's stopped, and acknowledge debug handshake comb += dbg.core_stopped_i.eq(1) - # while stopped, allow updating the PC and SVSTATE - with m.If(self.pc_i.ok): - comb += self.state_w_pc.wen.eq(1 << StateRegs.PC) - comb += self.state_w_pc.i_data.eq(self.pc_i.data) - sync += pc_changed.eq(1) + # while stopped, allow updating SVSTATE with m.If(self.svstate_i.ok): comb += new_svstate.eq(self.svstate_i.data) - comb += update_svstate.eq(1) - sync += sv_changed.eq(1) + comb += self.update_svstate.eq(1) + sync += self.sv_changed.eq(1) # wait for an instruction to arrive from Fetch with m.State("INSN_WAIT"): @@ -842,7 +1182,7 @@ class TestIssuerInternal(Elaboratable): comb += self.state_w_pc.i_data.eq(nia) comb += new_svstate.srcstep.eq(0) comb += new_svstate.dststep.eq(0) - comb += update_svstate.eq(1) + comb += self.update_svstate.eq(1) # synchronize with the simulator comb += self.insn_done.eq(1) # go back to Issue @@ -851,7 +1191,7 @@ class TestIssuerInternal(Elaboratable): # update new src/dst step comb += new_svstate.srcstep.eq(skip_srcstep) comb += new_svstate.dststep.eq(skip_dststep) - comb += update_svstate.eq(1) + comb += self.update_svstate.eq(1) # proceed to Decode m.next = "DECODE_SV" @@ -929,15 +1269,21 @@ class TestIssuerInternal(Elaboratable): cur_vl = cur_state.svstate.vl comb += is_last.eq(next_srcstep == cur_vl) + with m.If(pdecode2.instr_fault): + # reset instruction fault, try again + sync += pdecode2.instr_fault.eq(0) + m.next = "ISSUE_START" + # return directly to Decode if Execute generated an # exception. - with m.If(pdecode2.ldst_exc.happened): + with m.Elif(pdecode2.ldst_exc.happened): m.next = "DECODE_SV" - # if either PC or SVSTATE were changed by the previous + # if MSR, PC or SVSTATE were changed by the previous # instruction, go directly back to Fetch, without - # updating either PC or SVSTATE - with m.Elif(pc_changed | sv_changed): + # updating either MSR PC or SVSTATE + with m.Elif(self.msr_changed | self.pc_changed | + self.sv_changed): m.next = "ISSUE_START" # also return to Fetch, when no output was a vector @@ -956,18 +1302,18 @@ class TestIssuerInternal(Elaboratable): with m.If(pdecode2.loop_continue): comb += new_svstate.srcstep.eq(0) comb += new_svstate.dststep.eq(0) - comb += update_svstate.eq(1) + comb += self.update_svstate.eq(1) else: comb += new_svstate.srcstep.eq(0) comb += new_svstate.dststep.eq(0) - comb += update_svstate.eq(1) + comb += self.update_svstate.eq(1) m.next = "ISSUE_START" # returning to Execute? 
then, first update SRCSTEP with m.Else(): comb += new_svstate.srcstep.eq(next_srcstep) comb += new_svstate.dststep.eq(next_dststep) - comb += update_svstate.eq(1) + comb += self.update_svstate.eq(1) # return to mask skip loop m.next = "PRED_SKIP" @@ -978,185 +1324,79 @@ class TestIssuerInternal(Elaboratable): comb += core.icache.flush_in.eq(1) # stop instruction fault sync += pdecode2.instr_fault.eq(0) - if flush_needed: - # request the icache to stop asserting "failed" - comb += core.icache.flush_in.eq(1) - # stop instruction fault - sync += pdecode2.instr_fault.eq(0) - # while stopped, allow updating the PC and SVSTATE - with m.If(self.pc_i.ok): - comb += self.state_w_pc.wen.eq(1 << StateRegs.PC) - comb += self.state_w_pc.i_data.eq(self.pc_i.data) - sync += pc_changed.eq(1) - with m.If(self.svstate_i.ok): - comb += new_svstate.eq(self.svstate_i.data) - comb += update_svstate.eq(1) - sync += sv_changed.eq(1) - - # check if svstate needs updating: if so, write it to State Regfile - with m.If(update_svstate): - comb += self.state_w_sv.wen.eq(1 << StateRegs.SVSTATE) - comb += self.state_w_sv.i_data.eq(new_svstate) - sync += cur_state.svstate.eq(new_svstate) # for next clock - - def execute_fsm(self, m, core, pc_changed, sv_changed, - exec_insn_i_valid, exec_insn_o_ready, - exec_pc_o_valid, exec_pc_i_ready): - """execute FSM - - execute FSM. this interacts with the "issue" FSM - through exec_insn_ready/valid (incoming) and exec_pc_ready/valid - (outgoing). SVP64 RM prefixes have already been set up by the - "issue" phase, so execute is fairly straightforward. - """ - - comb = m.d.comb - sync = m.d.sync - pdecode2 = self.pdecode2 - - # temporaries - core_busy_o = core.n.o_data.busy_o # core is busy - core_ivalid_i = core.p.i_valid # instruction is valid - - if hasattr(core, "icache"): - fetch_failed = core.icache.i_out.fetch_failed - else: - fetch_failed = Const(0, 1) - - with m.FSM(name="exec_fsm"): - - # waiting for instruction bus (stays there until not busy) - with m.State("INSN_START"): - comb += exec_insn_o_ready.eq(1) - with m.If(exec_insn_i_valid): - comb += core_ivalid_i.eq(1) # instruction is valid/issued - sync += sv_changed.eq(0) - sync += pc_changed.eq(0) - with m.If(core.p.o_ready): # only move if accepted - m.next = "INSN_ACTIVE" # move to "wait completion" - - # instruction started: must wait till it finishes - with m.State("INSN_ACTIVE"): - # note changes to PC and SVSTATE - with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)): - sync += sv_changed.eq(1) - with m.If(self.state_nia.wen & (1 << StateRegs.PC)): - sync += pc_changed.eq(1) - with m.If(~core_busy_o): # instruction done! - comb += exec_pc_o_valid.eq(1) - with m.If(exec_pc_i_ready): - # when finished, indicate "done". - # however, if there was an exception, the instruction - # is *not* yet done. this is an implementation - # detail: we choose to implement exceptions by - # taking the exception information from the LDST - # unit, putting that *back* into the PowerDecoder2, - # and *re-running the entire instruction*. - # if we erroneously indicate "done" here, it is as if - # there were *TWO* instructions: - # 1) the failed LDST 2) a TRAP. - with m.If(~pdecode2.ldst_exc.happened & - ~fetch_failed): - comb += self.insn_done.eq(1) - m.next = "INSN_START" # back to fetch - - def setup_peripherals(self, m): - comb, sync = m.d.comb, m.d.sync - - # okaaaay so the debug module must be in coresync clock domain - # but NOT its reset signal. 
to cope with this, set every single - # submodule explicitly in coresync domain, debug and JTAG - # in their own one but using *external* reset. - csd = DomainRenamer("coresync") - dbd = DomainRenamer(self.dbg_domain) - - m.submodules.core = core = csd(self.core) - # this _so_ needs sorting out. ICache is added down inside - # LoadStore1 and is already a submodule of LoadStore1 - if not isinstance(self.imem, ICache): - m.submodules.imem = imem = csd(self.imem) - m.submodules.dbg = dbg = dbd(self.dbg) - if self.jtag_en: - m.submodules.jtag = jtag = dbd(self.jtag) - # TODO: UART2GDB mux, here, from external pin - # see https://bugs.libre-soc.org/show_bug.cgi?id=499 - sync += dbg.dmi.connect_to(jtag.dmi) - - cur_state = self.cur_state - - # 4x 4k SRAM blocks. these simply "exist", they get routed in litex - if self.sram4x4k: - for i, sram in enumerate(self.sram4k): - m.submodules["sram4k_%d" % i] = csd(sram) - comb += sram.enable.eq(self.wb_sram_en) - - # XICS interrupt handler - if self.xics: - m.submodules.xics_icp = icp = csd(self.xics_icp) - m.submodules.xics_ics = ics = csd(self.xics_ics) - comb += icp.ics_i.eq(ics.icp_o) # connect ICS to ICP - sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core - - # GPIO test peripheral - if self.gpio: - m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio) - # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl) - # XXX causes litex ECP5 test to get wrong idea about input and output - # (but works with verilator sim *sigh*) - # if self.gpio and self.xics: - # comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0]) + # check if svstate needs updating: if so, write it to State Regfile + with m.If(self.update_svstate): + sync += cur_state.svstate.eq(self.new_svstate) # for next clock - # instruction decoder - pdecode = create_pdecode() - m.submodules.dec2 = pdecode2 = csd(self.pdecode2) - if self.svp64_en: - m.submodules.svp64 = svp64 = csd(self.svp64) + def execute_fsm(self, m, core, + exec_insn_i_valid, exec_insn_o_ready, + exec_pc_o_valid, exec_pc_i_ready): + """execute FSM - # convenience - dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer - intrf = self.core.regs.rf['int'] + execute FSM. this interacts with the "issue" FSM + through exec_insn_ready/valid (incoming) and exec_pc_ready/valid + (outgoing). SVP64 RM prefixes have already been set up by the + "issue" phase, so execute is fairly straightforward. 
+ """ - # clock delay power-on reset - cd_por = ClockDomain(reset_less=True) - cd_sync = ClockDomain() - core_sync = ClockDomain("coresync") - m.domains += cd_por, cd_sync, core_sync - if self.dbg_domain != "sync": - dbg_sync = ClockDomain(self.dbg_domain) - m.domains += dbg_sync + comb = m.d.comb + sync = m.d.sync + pdecode2 = self.pdecode2 - ti_rst = Signal(reset_less=True) - delay = Signal(range(4), reset=3) - with m.If(delay != 0): - m.d.por += delay.eq(delay - 1) - comb += cd_por.clk.eq(ClockSignal()) + # temporaries + core_busy_o = core.n.o_data.busy_o # core is busy + core_ivalid_i = core.p.i_valid # instruction is valid - # power-on reset delay - core_rst = ResetSignal("coresync") - comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal()) - comb += core_rst.eq(ti_rst) + if hasattr(core, "icache"): + fetch_failed = core.icache.i_out.fetch_failed + else: + fetch_failed = Const(0, 1) - # debug clock is same as coresync, but reset is *main external* - if self.dbg_domain != "sync": - dbg_rst = ResetSignal(self.dbg_domain) - comb += dbg_rst.eq(ResetSignal()) + with m.FSM(name="exec_fsm"): - # busy/halted signals from core - core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o # core is busy - comb += self.busy_o.eq(core_busy_o) - comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i) + # waiting for instruction bus (stays there until not busy) + with m.State("INSN_START"): + comb += exec_insn_o_ready.eq(1) + with m.If(exec_insn_i_valid): + comb += core_ivalid_i.eq(1) # instruction is valid/issued + sync += self.sv_changed.eq(0) + sync += self.pc_changed.eq(0) + sync += self.msr_changed.eq(0) + with m.If(core.p.o_ready): # only move if accepted + m.next = "INSN_ACTIVE" # move to "wait completion" - # temporary hack: says "go" immediately for both address gen and ST - l0 = core.l0 - ldst = core.fus.fus['ldst0'] - st_go_edge = rising_edge(m, ldst.st.rel_o) - # link addr-go direct to rel - m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) - m.d.comb += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel + # instruction started: must wait till it finishes + with m.State("INSN_ACTIVE"): + # note changes to MSR, PC and SVSTATE + # XXX oops, really must monitor *all* State Regfile write + # ports looking for changes! + with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)): + sync += self.sv_changed.eq(1) + with m.If(self.state_nia.wen & (1 << StateRegs.MSR)): + sync += self.msr_changed.eq(1) + with m.If(self.state_nia.wen & (1 << StateRegs.PC)): + sync += self.pc_changed.eq(1) + with m.If(~core_busy_o): # instruction done! + comb += exec_pc_o_valid.eq(1) + with m.If(exec_pc_i_ready): + # when finished, indicate "done". + # however, if there was an exception, the instruction + # is *not* yet done. this is an implementation + # detail: we choose to implement exceptions by + # taking the exception information from the LDST + # unit, putting that *back* into the PowerDecoder2, + # and *re-running the entire instruction*. + # if we erroneously indicate "done" here, it is as if + # there were *TWO* instructions: + # 1) the failed LDST 2) a TRAP. 
+ with m.If(~pdecode2.ldst_exc.happened & + ~pdecode2.instr_fault): + comb += self.insn_done.eq(1) + m.next = "INSN_START" # back to fetch def elaborate(self, platform): - m = Module() + m = super().elaborate(platform) # convenience comb, sync = m.d.comb, m.d.sync cur_state = self.cur_state @@ -1166,43 +1406,17 @@ class TestIssuerInternal(Elaboratable): # set up peripherals and core core_rst = self.core_rst - self.setup_peripherals(m) - - # reset current state if core reset requested - with m.If(core_rst): - m.d.sync += self.cur_state.eq(0) - - # PC and instruction from I-Memory - comb += self.pc_o.eq(cur_state.pc) - pc_changed = Signal() # note write to PC - sv_changed = Signal() # note write to SVSTATE # indicate to outside world if any FU is still executing comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing - # read state either from incoming override or from regfile - # TODO: really should be doing MSR in the same way - pc = state_get(m, core_rst, self.pc_i, - "pc", # read PC - self.state_r_pc, StateRegs.PC) - svstate = state_get(m, core_rst, self.svstate_i, - "svstate", # read SVSTATE - self.state_r_sv, StateRegs.SVSTATE) - - # don't write pc every cycle - comb += self.state_w_pc.wen.eq(0) - comb += self.state_w_pc.i_data.eq(0) - # address of the next instruction, in the absence of a branch # depends on the instruction size nia = Signal(64) # connect up debug signals - # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o) - comb += dbg.terminate_i.eq(core.o.core_terminate_o) - comb += dbg.state.pc.eq(pc) - comb += dbg.state.svstate.eq(svstate) - comb += dbg.state.msr.eq(cur_state.msr) + with m.If(core.o.core_terminate_o): + comb += dbg.terminate_i.eq(1) # pass the prefix mode from Fetch to Issue, so the latter can loop # on VL==0 @@ -1248,17 +1462,20 @@ class TestIssuerInternal(Elaboratable): # set up Fetch FSM fetch = FetchFSM(self.allow_overlap, self.svp64_en, self.imem, core_rst, pdecode2, cur_state, - dbg, core, svstate, nia, is_svp64_mode) + dbg, core, + dbg.state.svstate, # combinatorially same + nia, is_svp64_mode) m.submodules.fetch = fetch # connect up in/out data to existing Signals - comb += fetch.p.i_data.pc.eq(pc) + comb += fetch.p.i_data.pc.eq(dbg.state.pc) # combinatorially same + comb += fetch.p.i_data.msr.eq(dbg.state.msr) # combinatorially same # and the ready/valid signalling comb += fetch_pc_o_ready.eq(fetch.p.o_ready) comb += fetch.p.i_valid.eq(fetch_pc_i_valid) comb += fetch_insn_o_valid.eq(fetch.n.o_valid) comb += fetch.n.i_ready.eq(fetch_insn_i_ready) - self.issue_fsm(m, core, pc_changed, sv_changed, nia, + self.issue_fsm(m, core, nia, dbg, core_rst, is_svp64_mode, fetch_pc_o_ready, fetch_pc_i_valid, fetch_insn_o_valid, fetch_insn_i_ready, @@ -1272,168 +1489,23 @@ class TestIssuerInternal(Elaboratable): pred_insn_i_valid, pred_insn_o_ready, pred_mask_o_valid, pred_mask_i_ready) - self.execute_fsm(m, core, pc_changed, sv_changed, + self.execute_fsm(m, core, exec_insn_i_valid, exec_insn_o_ready, exec_pc_o_valid, exec_pc_i_ready) - # this bit doesn't have to be in the FSM: connect up to read - # regfiles on demand from DMI - self.do_dmi(m, dbg) - - # DEC and TB inc/dec FSM. copy of DEC is put into CoreState, - # (which uses that in PowerDecoder2 to raise 0x900 exception) - self.tb_dec_fsm(m, cur_state.dec) - - return m - - def do_dmi(self, m, dbg): - """deals with DMI debug requests - - currently only provides read requests for the INT regfile, CR and XER - it will later also deal with *writing* to these regfiles. 
- """ - comb = m.d.comb - sync = m.d.sync - dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer - intrf = self.core.regs.rf['int'] - - with m.If(d_reg.req): # request for regfile access being made - # TODO: error-check this - # XXX should this be combinatorial? sync better? - if intrf.unary: - comb += self.int_r.ren.eq(1 << d_reg.addr) - else: - comb += self.int_r.addr.eq(d_reg.addr) - comb += self.int_r.ren.eq(1) - d_reg_delay = Signal() - sync += d_reg_delay.eq(d_reg.req) - with m.If(d_reg_delay): - # data arrives one clock later - comb += d_reg.data.eq(self.int_r.o_data) - comb += d_reg.ack.eq(1) - - # sigh same thing for CR debug - with m.If(d_cr.req): # request for regfile access being made - comb += self.cr_r.ren.eq(0b11111111) # enable all - d_cr_delay = Signal() - sync += d_cr_delay.eq(d_cr.req) - with m.If(d_cr_delay): - # data arrives one clock later - comb += d_cr.data.eq(self.cr_r.o_data) - comb += d_cr.ack.eq(1) - - # aaand XER... - with m.If(d_xer.req): # request for regfile access being made - comb += self.xer_r.ren.eq(0b111111) # enable all - d_xer_delay = Signal() - sync += d_xer_delay.eq(d_xer.req) - with m.If(d_xer_delay): - # data arrives one clock later - comb += d_xer.data.eq(self.xer_r.o_data) - comb += d_xer.ack.eq(1) - - def tb_dec_fsm(self, m, spr_dec): - """tb_dec_fsm - - this is a FSM for updating either dec or tb. it runs alternately - DEC, TB, DEC, TB. note that SPR pipeline could have written a new - value to DEC, however the regfile has "passthrough" on it so this - *should* be ok. - - see v3.0B p1097-1099 for Timeer Resource and p1065 and p1076 - """ - - comb, sync = m.d.comb, m.d.sync - fast_rf = self.core.regs.rf['fast'] - fast_r_dectb = fast_rf.r_ports['issue'] # DEC/TB - fast_w_dectb = fast_rf.w_ports['issue'] # DEC/TB - - with m.FSM() as fsm: - - # initiates read of current DEC - with m.State("DEC_READ"): - comb += fast_r_dectb.addr.eq(FastRegs.DEC) - comb += fast_r_dectb.ren.eq(1) - m.next = "DEC_WRITE" - - # waits for DEC read to arrive (1 cycle), updates with new value - with m.State("DEC_WRITE"): - new_dec = Signal(64) - # TODO: MSR.LPCR 32-bit decrement mode - comb += new_dec.eq(fast_r_dectb.o_data - 1) - comb += fast_w_dectb.addr.eq(FastRegs.DEC) - comb += fast_w_dectb.wen.eq(1) - comb += fast_w_dectb.i_data.eq(new_dec) - sync += spr_dec.eq(new_dec) # copy into cur_state for decoder - m.next = "TB_READ" - - # initiates read of current TB - with m.State("TB_READ"): - comb += fast_r_dectb.addr.eq(FastRegs.TB) - comb += fast_r_dectb.ren.eq(1) - m.next = "TB_WRITE" - - # waits for read TB to arrive, initiates write of current TB - with m.State("TB_WRITE"): - new_tb = Signal(64) - comb += new_tb.eq(fast_r_dectb.o_data + 1) - comb += fast_w_dectb.addr.eq(FastRegs.TB) - comb += fast_w_dectb.wen.eq(1) - comb += fast_w_dectb.i_data.eq(new_tb) - m.next = "DEC_READ" - return m - def __iter__(self): - yield from self.pc_i.ports() - yield self.pc_o - yield self.memerr_o - yield from self.core.ports() - yield from self.imem.ports() - yield self.core_bigendian_i - yield self.busy_o - - def ports(self): - return list(self) - - def external_ports(self): - ports = self.pc_i.ports() - ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o, - ] - - if self.jtag_en: - ports += list(self.jtag.external_ports()) - else: - # don't add DMI if JTAG is enabled - ports += list(self.dbg.dmi.ports()) - - ports += list(self.imem.ibus.fields.values()) - ports += list(self.core.l0.cmpi.wb_bus().fields.values()) - - if self.sram4x4k: - for sram in 
self.sram4k: - ports += list(sram.bus.fields.values()) - - if self.xics: - ports += list(self.xics_icp.bus.fields.values()) - ports += list(self.xics_ics.bus.fields.values()) - ports.append(self.int_level_i) - - if self.gpio: - ports += list(self.simple_gpio.bus.fields.values()) - ports.append(self.gpio_o) - - return ports - - def ports(self): - return list(self) - class TestIssuer(Elaboratable): def __init__(self, pspec): self.ti = TestIssuerInternal(pspec) + # XXX TODO: make this a command-line selectable option from pspec + #from soc.simple.inorder import TestIssuerInternalInOrder + #self.ti = TestIssuerInternalInOrder(pspec) self.pll = DummyPLL(instance=True) + self.dbg_rst_i = Signal(reset_less=True) + # PLL direct clock or not self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll if self.pll_en: @@ -1480,23 +1552,24 @@ class TestIssuer(Elaboratable): # internal clock is set to selector clock-out. has the side-effect of # running TestIssuer at this speed (see DomainRenamer("intclk") above) # debug clock runs at coresync internal clock - cd_coresync = ClockDomain("coresync") - #m.domains += cd_coresync if self.ti.dbg_domain != 'sync': cd_dbgsync = ClockDomain("dbgsync") - #m.domains += cd_dbgsync - intclk = ClockSignal("coresync") + intclk = ClockSignal(self.ti.core_domain) dbgclk = ClockSignal(self.ti.dbg_domain) # XXX BYPASS PLL XXX # XXX BYPASS PLL XXX # XXX BYPASS PLL XXX if self.pll_en: comb += intclk.eq(self.ref_clk) + assert self.ti.core_domain != 'sync', \ + "cannot set core_domain to sync and use pll at the same time" else: - comb += intclk.eq(ClockSignal()) + if self.ti.core_domain != 'sync': + comb += intclk.eq(ClockSignal()) if self.ti.dbg_domain != 'sync': dbgclk = ClockSignal(self.ti.dbg_domain) comb += dbgclk.eq(intclk) + comb += self.ti.dbg_rst_i.eq(self.dbg_rst_i) return m
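
For reference, the DEC/TB update loop that the tb_dec_fsm() docstring above describes (alternately decrementing DEC and incrementing TB through the 'fast' regfile, with a one-cycle gap between issuing the read and writing back the updated value) can be modelled in isolation roughly as follows. This is a simplified nmigen sketch only: DecTbModel and its plain Signals stand in for the FastRegs.DEC / FastRegs.TB read/write ports and are not part of issuer.py.

    from nmigen import Elaboratable, Module, Signal

    class DecTbModel(Elaboratable):
        def __init__(self):
            self.dec = Signal(64)      # stands in for FastRegs.DEC
            self.tb = Signal(64)       # stands in for FastRegs.TB
            self.spr_dec = Signal(64)  # copy handed to the decoder (cur_state.dec)

        def elaborate(self, platform):
            m = Module()
            sync = m.d.sync
            with m.FSM():
                # issue the "read" of DEC; data is modelled as arriving next cycle
                with m.State("DEC_READ"):
                    m.next = "DEC_WRITE"
                # decrement the value that just arrived and write it back
                with m.State("DEC_WRITE"):
                    sync += self.dec.eq(self.dec - 1)
                    sync += self.spr_dec.eq(self.dec - 1)
                    m.next = "TB_READ"
                # issue the "read" of TB
                with m.State("TB_READ"):
                    m.next = "TB_WRITE"
                # increment the value that just arrived and write it back
                with m.State("TB_WRITE"):
                    sync += self.tb.eq(self.tb + 1)
                    m.next = "DEC_READ"
            return m

The real FSM differs in that the storage is the 'fast' regfile (read via fast_r_dectb, written via fast_w_dectb) rather than local registers, but the four-state DEC_READ / DEC_WRITE / TB_READ / TB_WRITE cadence and the one-cycle read latency are the same.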