X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fsimple%2Fcore.py;h=9a4abacc3135e647ae4be3d9a8b7882e7ce68fe4;hb=2fc4acde28074b5c3bf8607fdf7d923e017df260;hp=93bf2966d609886a70fe55591e4776f6cfe9602c;hpb=a32e13f196a62b9b5132bf932264466c7dbd48ed;p=soc.git diff --git a/src/soc/simple/core.py b/src/soc/simple/core.py index 93bf2966..9a4abacc 100644 --- a/src/soc/simple/core.py +++ b/src/soc/simple/core.py @@ -17,6 +17,8 @@ the brain-dead part of this module is that even though there is no conflict of access, regfile read/write hazards are *not* analysed, and consequently it is safer to wait for the Function Unit to complete before allowing a new instruction to proceed. +(update: actually this is being added now: +https://bugs.libre-soc.org/show_bug.cgi?id=737) """ from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux, @@ -24,8 +26,7 @@ from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux, from nmigen.cli import rtlil from openpower.decoder.power_decoder2 import PowerDecodeSubset -from openpower.decoder.power_regspec_map import regspec_decode_read -from openpower.decoder.power_regspec_map import regspec_decode_write +from openpower.decoder.power_regspec_map import regspec_decode from openpower.sv.svp64 import SVP64Rec from nmutil.picker import PriorityPicker @@ -46,8 +47,7 @@ import operator from nmutil.util import rising_edge FUSpec = namedtuple("FUSpec", ["funame", "fu", "idx"]) -ByRegSpec = namedtuple("ByRegSpec", ["rdflag", "wrport", "read", - "write", "wid", "specs"]) +ByRegSpec = namedtuple("ByRegSpec", ["okflag", "regport", "wid", "specs"]) # helper function for reducing a list of signals down to a parallel # ORed single signal. @@ -71,6 +71,47 @@ def sort_fuspecs(fuspecs): return res # enumerate(res) +# a hazard bitvector "remap" function which returns an AST expression +# that remaps read/write hazard regfile port numbers to either a full +# bitvector or a reduced subset one. SPR for example is reduced to a +# single bit. +# CRITICALLY-IMPORTANT NOTE: these bitvectors *have* to match up per +# regfile! therefore the remapping is per regfile, *NOT* per regfile +# port and certainly not based on whether it is a read port or write port. +# note that any reductions here will result in degraded performance due +# to conflicts, but at least it keeps the hazard matrix sizes down to "sane" +def bitvector_remap(regfile, rfile, port): + # 8-bits (at the moment, no SVP64), CR is unary: no remap + if regfile == 'CR': + return port + # 3 bits, unary alrady: return the port + if regfile == 'XER': + return port + # 3 bits, unary: return the port + if regfile == 'XER': + return port + # 5 bits, unary: return the port + if regfile == 'STATE': + return port + # 9 bits (9 entries), might be unary already + if regfile == 'FAST': + if rfile.unary: # FAST might be unary already + return port + else: + return 1 << port + # 10 bits (!!) - reduce to one + if regfile == 'SPR': + if rfile.unary: # FAST might be unary already + return port + else: + return 1 << port + if regfile == 'INT': + if rfile.unary: # INT, check if unary/binary + return port + else: + return 1 << port + + # derive from ControlBase rather than have a separate Stage instance, # this is simpler to do class NonProductionCore(ControlBase): @@ -91,7 +132,7 @@ class NonProductionCore(ControlBase): (pspec.allow_overlap == True)) # test core type - self.make_hazard_vecs = True + self.make_hazard_vecs = self.allow_overlap self.core_type = "fsm" if hasattr(pspec, "core_type"): self.core_type = pspec.core_type @@ -108,14 +149,34 @@ class NonProductionCore(ControlBase): # link LoadStore1 into MMU mmu = self.fus.get_fu('mmu0') + ldst0 = self.fus.get_fu('ldst0') print ("core pspec", pspec.ldst_ifacetype) print ("core mmu", mmu) if mmu is not None: - print ("core lsmem.lsi", l0.cmpi.lsmem.lsi) - mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi) + lsi = l0.cmpi.lsmem.lsi # a LoadStore1 Interface object + print ("core lsmem.lsi", lsi) + mmu.alu.set_ldst_interface(lsi) + # urr store I-Cache in core so it is easier to get at + self.icache = lsi.icache + + # alternative reset values for STATE regs. these probably shouldn't + # be set, here, instead have them done by Issuer. which they are. + # as well. because core.state overrides them. sigh. + self.msr_at_reset = 0x0 + self.pc_at_reset = 0x0 + if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int): + self.msr_at_reset = pspec.msr_reset + if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int): + self.pc_at_reset = pspec.pc_reset + state_resets = [self.pc_at_reset, # PC at reset + self.msr_at_reset, # MSR at reset + 0x0, # SVSTATE at reset + 0x0, # DEC at reset + 0x0] # TB at reset # register files (yes plural) - self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs) + self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs, + state_resets=state_resets) # set up input and output: unusual requirement to set data directly # (due to the way that the core is set up in a different domain, @@ -135,6 +196,9 @@ class NonProductionCore(ControlBase): self.decoders = {} self.des = {} + # eep, these should be *per FU* i.e. for FunctionUnitBaseMulti + # they should be shared (put into the ALU *once*). + for funame, fu in self.fus.fus.items(): f_name = fu.fnunit.name fnunit = fu.fnunit.value @@ -143,17 +207,29 @@ class NonProductionCore(ControlBase): # TRAP decoder is the *main* decoder self.trapunit = funame continue + assert funame not in self.decoders self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name, final=True, state=self.ireg.state, svp64_en=self.svp64_en, regreduce_en=self.regreduce_en) self.des[funame] = self.decoders[funame].do + print ("create decoder subset", funame, opkls, self.des[funame]) + + # create per-Function Unit write-after-write hazard signals + # yes, really, this should have been added in ReservationStations + # but hey. + for funame, fu in self.fus.fus.items(): + fu._waw_hazard = Signal(name="waw_%s" % funame) # share the SPR decoder with the MMU if it exists if "mmu0" in self.decoders: self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"] + # allow pausing of the DEC/TB FSM back in Issuer, by spotting + # if there is an MTSPR instruction + self.pause_dec_tb = Signal() + # next 3 functions are Stage API Compliance def setup(self, m, i): pass @@ -181,6 +257,13 @@ class NonProductionCore(ControlBase): regs = self.regs fus = self.fus.fus + # amalgamate write-hazards into a single top-level Signal + self.waw_hazard = Signal() + whaz = [] + for funame, fu in self.fus.fus.items(): + whaz.append(fu._waw_hazard) + comb += self.waw_hazard.eq(Cat(*whaz).bool()) + # connect decoders self.connect_satellite_decoders(m) @@ -211,23 +294,25 @@ class NonProductionCore(ControlBase): # connect each satellite decoder and give it the instruction. # as subset decoders this massively reduces wire fanout given # the large number of ALUs - setattr(m.submodules, "dec_%s" % v.fn_name, v) + m.submodules["dec_%s" % k] = v comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i) comb += v.dec.bigendian.eq(self.ireg.bigendian_i) # sigh due to SVP64 RA_OR_ZERO detection connect these too comb += v.sv_a_nz.eq(self.ireg.sv_a_nz) - if self.svp64_en: - comb += v.pred_sm.eq(self.ireg.sv_pred_sm) - comb += v.pred_dm.eq(self.ireg.sv_pred_dm) - if k != self.trapunit: - comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM - comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode) - # only the LDST PowerDecodeSubset *actually* needs to - # know to use the alternative decoder. this is all - # a terrible hack - if k.lower().startswith("ldst"): - comb += v.use_svp64_ldst_dec.eq( - self.ireg.use_svp64_ldst_dec) + if not self.svp64_en: + continue + comb += v.pred_sm.eq(self.ireg.sv_pred_sm) + comb += v.pred_dm.eq(self.ireg.sv_pred_dm) + if k == self.trapunit: + continue + comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM + comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode) + # only the LDST PowerDecodeSubset *actually* needs to + # know to use the alternative decoder. this is all + # a terrible hack + if not k.lower().startswith("ldst"): + continue + comb += v.use_svp64_ldst_dec.eq( self.ireg.use_svp64_ldst_dec) def connect_instruction(self, m): """connect_instruction @@ -314,7 +399,7 @@ class NonProductionCore(ControlBase): # rdmask, which is for registers needs to come from the *main* decoder for funame, fu in fus.items(): - rdmask = get_rdflags(self.ireg.e, fu) + rdmask = get_rdflags(m, self.ireg.e, fu) comb += fu.rdmaskn.eq(~rdmask) # sigh - need a NOP counter @@ -354,13 +439,31 @@ class NonProductionCore(ControlBase): # issue, busy, read flags and mask to FU with m.If(enable): # operand comes from the *local* decoder + # do not actually issue, though, if there + # is a waw hazard. decoder has to still + # be asserted in order to detect that, tho comb += fu.oper_i.eq_from(do) - comb += fu.issue_i.eq(1) # issue when valid + if funame == 'mmu0': + # URRR this is truly dreadful. + # OP_FETCH_FAILED is a "fake" op. + # no instruction creates it. OP_TRAP + # uses the *main* decoder: this is + # a *Satellite* decoder that reacts + # on *insn_in*... not fake ops. gaah. + main_op = self.ireg.e.do + with m.If(main_op.insn_type == + MicrOp.OP_FETCH_FAILED): + comb += fu.oper_i.insn_type.eq( + MicrOp.OP_FETCH_FAILED) + comb += fu.oper_i.fn_unit.eq( + Function.MMU) + # issue when valid (and no write-hazard) + comb += fu.issue_i.eq(~self.waw_hazard) # instruction ok, indicate ready comb += self.p.o_ready.eq(1) if self.allow_overlap: - with m.If(~fu_found): + with m.If(~fu_found | self.waw_hazard): # latch copy of instruction sync += ilatch.eq(self.i) comb += self.p.o_ready.eq(1) # accept @@ -382,16 +485,23 @@ class NonProductionCore(ControlBase): # run this FunctionUnit if enabled route op, # issue, busy, read flags and mask to FU with m.If(enable): - # operand comes from the *local* decoder + # operand comes from the *local* decoder, + # which is asserted even if not issued, + # so that WaW-detection can check for hazards. + # only if the waw hazard is clear does the + # instruction actually get issued comb += fu.oper_i.eq_from(do) - comb += fu.issue_i.eq(1) # issue when valid - comb += self.p.o_ready.eq(1) - comb += busy_o.eq(0) - m.next = "READY" + # issue when valid + comb += fu.issue_i.eq(~self.waw_hazard) + with m.If(~self.waw_hazard): + comb += self.p.o_ready.eq(1) + comb += busy_o.eq(0) + m.next = "READY" print ("core: overlap allowed", self.allow_overlap) - busys = map(lambda fu: fu.busy_o, fus.values()) - comb += any_busy_o.eq(Cat(*busys).bool()) + # true when any FU is busy (including the cycle where it is perhaps + # to be issued - because that's what fu_busy is) + comb += any_busy_o.eq(fu_busy.bool()) if not self.allow_overlap: # for simple non-overlap, if any instruction is busy, set # busy output for core. @@ -401,6 +511,26 @@ class NonProductionCore(ControlBase): # and resolved with m.If(self.issue_conflict): comb += busy_o.eq(1) + # make sure that LDST, SPR, MMU, Branch and Trap all say "busy" + # and do not allow overlap. these are all the ones that + # are non-forward-progressing: exceptions etc. that otherwise + # change CoreState for some reason (MSR, PC, SVSTATE) + for funame, fu in fus.items(): + if (funame.lower().startswith('ldst') or + funame.lower().startswith('branch') or + funame.lower().startswith('mmu') or + funame.lower().startswith('spr') or + funame.lower().startswith('trap')): + with m.If(fu.busy_o): + comb += busy_o.eq(1) + # for SPR pipeline pause dec/tb FSM to avoid race condition + # TODO: really this should be much more sophisticated, + # spot MTSPR, spot that DEC/TB is what is to be updated. + # a job for PowerDecoder2, there + if funame.lower().startswith('spr'): + with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR + ): + comb += self.pause_dec_tb.eq(1) # return both the function unit "enable" dict as well as the "busy". # the "busy-or-issued" can be passed in to the Read/Write port @@ -424,7 +554,7 @@ class NonProductionCore(ControlBase): # for checking if the read port has an outstanding write if self.make_hazard_vecs: wv = regs.wv[regfile.lower()] - wvchk = wv.r_ports["issue"] # write-vec bit-level hazard check + wvchk = wv.q_int # write-vec bit-level hazard check # if a hazard is detected on this read port, simply blithely block # every FU from reading on it. this is complete overkill but very @@ -440,38 +570,53 @@ class NonProductionCore(ControlBase): ppoffs = [] for i, fspec in enumerate(fspecs): # get the regfile specs for this regfile port - (rf, wf, read, write, wid, fuspec) = fspec - print ("fpsec", i, fspec, len(fuspec)) + print ("fpsec", i, fspec, len(fspec.specs)) + name = "%s_%s_%d" % (regfile, regname, i) ppoffs.append(pplen) # record offset for picker - pplen += len(fuspec) - name = "rdflag_%s_%s_%d" % (regfile, regname, i) - rdflag = Signal(name=name, reset_less=True) - comb += rdflag.eq(rf) + pplen += len(fspec.specs) + rdflag = Signal(name="rdflag_"+name, reset_less=True) + comb += rdflag.eq(fspec.okflag) rdflags.append(rdflag) print ("pplen", pplen) # create a priority picker to manage this port rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen) - setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick) + m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick rens = [] addrs = [] wvens = [] for i, fspec in enumerate(fspecs): - (rf, wf, _read, _write, wid, fuspec) = fspec + (rf, _read, wid, fuspecs) = \ + (fspec.okflag, fspec.regport, fspec.wid, fspec.specs) # connect up the FU req/go signals, and the reg-read to the FU # and create a Read Broadcast Bus - for pi, (funame, fu, idx) in enumerate(fuspec): + for pi, fuspec in enumerate(fspec.specs): + (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx) pi += ppoffs[i] name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi) fu_active = fu_selected[funame] fu_issued = fu_bitdict[funame] # get (or set up) a latched copy of read register number + # and (sigh) also the read-ok flag + # TODO: use nmutil latchregister + rhname = "%s_%s_%d" % (regfile, regname, i) + rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname), + reset_less=True) + if rhname not in fu.rf_latches: + rfl = Signal(name="rdflag_latch_%s_%s" % (funame, rhname)) + fu.rf_latches[rhname] = rfl + with m.If(fu.issue_i): + sync += rfl.eq(rdflags[i]) + else: + rfl = fu.rf_latches[rhname] + + # now the register port rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi) - read = Signal.like(_read, name="read_"+name) + read = Signal.like(_read, name="read_"+rname) if rname not in fu.rd_latches: rdl = Signal.like(_read, name="rdlatch_"+rname) fu.rd_latches[rname] = rdl @@ -479,12 +624,16 @@ class NonProductionCore(ControlBase): sync += rdl.eq(_read) else: rdl = fu.rd_latches[rname] - # latch to make the read immediately available on issue cycle - # after the read cycle, use the latched copy + + # make the read immediately available on issue cycle + # after the read cycle, otherwies use the latched copy. + # this captures the regport and okflag on issue with m.If(fu.issue_i): comb += read.eq(_read) + comb += rdflag.eq(rdflags[i]) with m.Else(): comb += read.eq(rdl) + comb += rdflag.eq(rfl) # connect request-read to picker input, and output to go-rd addr_en = Signal.like(read, name="addr_en_"+name) @@ -494,9 +643,9 @@ class NonProductionCore(ControlBase): rhazard = Signal(name="rhaz_"+name) # exclude any currently-enabled read-request (mask out active) - comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] & - ~delay_pick & ~rhazard) # entirely block anything hazarded from being picked + comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag & + ~delay_pick & ~rhazard) comb += rdpick.i[pi].eq(pick) comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick @@ -527,10 +676,10 @@ class NonProductionCore(ControlBase): continue # read the write-hazard bitvector (wv) for any bit that is - wvchk_en = Signal(len(wvchk.ren), name="wv_chk_addr_en_"+name) + wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name) issue_active = Signal(name="rd_iactive_"+name) # XXX combinatorial loop here - comb += issue_active.eq(fu_active & rf) + comb += issue_active.eq(fu_active & rdflag) with m.If(issue_active): if rfile.unary: comb += wvchk_en.eq(read) @@ -538,11 +687,14 @@ class NonProductionCore(ControlBase): comb += wvchk_en.eq(1< clear bit wvseten.append(wv_issue_en) # set data same as enable - wvsets.append(wv_issue_en) # because enable needs a 1 + + # read the write-hazard bitvector (wv) for any bit that is + fu_requested = fu_bitdict[funame] + wvchk_en = Signal(len(wvchk), name="waw_chk_addr_en_"+name) + issue_active = Signal(name="waw_iactive_"+name) + whazard = Signal(name="whaz_"+name) + if wf is None: + # XXX EEK! STATE regfile (branch) does not have an + # write-active indicator in regspec_decode_write() + print ("XXX FIXME waw_iactive", issue_active, + fu_requested, wf) + else: + # check bits from the incoming instruction. note (back + # in connect_instruction) that the decoder is held for + # us to be able to do this, here... *without* issue being + # held HI. we MUST NOT gate this with fu.issue_i or + # with fu_bitdict "enable": it would create a loop + comb += issue_active.eq(wf) + with m.If(issue_active): + if rfile.unary: + comb += wvchk_en.eq(write) + else: + comb += wvchk_en.eq(1<