"""simple core

not in any way intended for production use.  connects up FunctionUnits to
Register Files in a brain-dead fashion that only permits one and only one
Function Unit to be operational.

the principle here is to take the Function Units, analyse their regspecs,
and turn their requirements for access to register file read/write ports
into groupings by Register File and Register File Port name.

under each grouping - by regfile/port - a list of Function Units that
need to connect to that port is created.  as these are a contended
resource a "Broadcast Bus" per read/write port is then also created,
with access to it managed by a PriorityPicker.

the brain-dead part of this module is that even though there is no
conflict of access, regfile read/write hazards are *not* analysed,
and consequently it is safer to wait for the Function Unit to complete
before allowing a new instruction to proceed.
(update: actually this is being added now:
https://bugs.libre-soc.org/show_bug.cgi?id=737)
"""

from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
                    Const)
from nmigen.cli import rtlil

from openpower.decoder.power_decoder2 import PowerDecodeSubset
from openpower.decoder.power_regspec_map import regspec_decode
from openpower.sv.svp64 import SVP64Rec

from nmutil.picker import PriorityPicker
from nmutil.util import treereduce
from nmutil.singlepipe import ControlBase

from soc.fu.compunits.compunits import AllFunctionUnits, LDSTFunctionUnit
from soc.regfile.regfiles import RegFiles
from openpower.decoder.power_decoder2 import get_rdflags
from soc.experiment.l0_cache import TstL0CacheBuffer  # test only
from soc.config.test.test_loadstore import TestMemPspec
from openpower.decoder.power_enums import MicrOp, Function
from soc.simple.core_data import CoreInput, CoreOutput

from collections import defaultdict, namedtuple
import operator

from nmutil.util import rising_edge

FUSpec = namedtuple("FUSpec", ["funame", "fu", "idx"])
ByRegSpec = namedtuple("ByRegSpec", ["okflag", "regport", "wid", "specs"])


# helper function for reducing a list of signals down to a parallel
# ORed single signal.
def ortreereduce(tree, attr="o_data"):
    """OR-reduce one attribute of every element of *tree*.

    Hands *tree* to ``treereduce`` with ``operator.or_`` as the combining
    operator and ``getattr(x, attr)`` as the per-element accessor.

    tree: the collection of objects to reduce.
    attr: name of the attribute read from each element (default "o_data").
    """
    return treereduce(tree, operator.or_, lambda x: getattr(x, attr))


def ortreereduce_sig(tree):
    """OR-reduce the elements of *tree* themselves (identity accessor)."""
    return treereduce(tree, operator.or_, lambda x: x)


# helper function to place full regs declarations first
def sort_fuspecs(fuspecs):
    """Return the (regname, fspec) pairs of *fuspecs*, "full*" names first.

    fuspecs: a dict mapping regname (str) to fspec.

    The result is a list.  Entries whose regname starts with "full" come
    first, followed by all the others; within each group the dict's own
    iteration order is kept.
    """
    full_first = [(regname, fspec) for (regname, fspec) in fuspecs.items()
                  if regname.startswith("full")]
    the_rest = [(regname, fspec) for (regname, fspec) in fuspecs.items()
                if not regname.startswith("full")]
    return full_first + the_rest


# a hazard bitvector "remap" function which returns an AST expression
# that remaps read/write hazard regfile port numbers to either a full
# bitvector or a reduced subset one.  SPR for example is reduced to a
# single bit.
# CRITICALLY-IMPORTANT NOTE: these bitvectors *have* to match up per
# regfile!  therefore the remapping is per regfile, *NOT* per regfile
# port and certainly not based on whether it is a read port or write port.
# note that any reductions here will result in degraded performance due
# to conflicts, but at least it keeps the hazard matrix sizes down to "sane"
def bitvector_remap(regfile, rfile, port):
    """Remap a regfile port number into hazard-bitvector form.

    regfile: upper-case register file name.  'CR', 'XER', 'STATE', 'FAST',
             'SPR' and 'INT' are the names recognised here.
    rfile:   the register file object.  Only its ``unary`` attribute is
             read, and only for FAST, SPR and INT.
    port:    the port number / expression to remap.

    Returns *port* unchanged for CR, XER and STATE.  For FAST, SPR and INT
    returns *port* unchanged when ``rfile.unary`` is true, otherwise
    ``1 << port``.  Returns None for any other regfile name.
    """
    # original author's size notes: CR 8 bits (at the moment, no SVP64),
    # XER 3 bits, STATE 5 bits.  all are treated as unary already, so
    # the port is passed straight through with no remap.
    if regfile in ('CR', 'XER', 'STATE'):
        return port

    # FAST (9 entries), SPR and INT may be unary or binary addressed:
    # unary passes straight through, binary is converted to a one-hot bit.
    # NOTE: SPR was annotated "10 bits (!!) - reduce to one", but no such
    # reduction is performed here: it is handled exactly like FAST and INT.
    if regfile in ('FAST', 'SPR', 'INT'):
        if rfile.unary:
            return port
        return 1 << port

    # NOTE(review): an unrecognised regfile name has always fallen through
    # and produced None.  kept as-is (but made explicit) because the
    # callers are not visible from here - confirm whether this should
    # raise instead.
    return None


# derive from ControlBase rather than have a separate Stage instance,
# this is simpler to do
class NonProductionCore(ControlBase):
    def __init__(self, pspec):
        self.pspec = pspec

        # test is SVP64 is to be enabled
        self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)

        # test to see if regfile ports should be reduced
        self.regreduce_en = (hasattr(pspec, "regreduce") and
                             (pspec.regreduce == True))

        # test to see if overlapping of instructions is allowed
        # (not normally enabled for TestIssuer FSM but useful for checking
        # the bitvector hazard detection, before doing In-Order)
        self.allow_overlap = (hasattr(pspec, "allow_overlap") and
                              (pspec.allow_overlap == True))

        # test core type
        self.make_hazard_vecs = self.allow_overlap
        self.core_type = "fsm"
        if hasattr(pspec, "core_type"):
            self.core_type = pspec.core_type

        super().__init__(stage=self)

        # single LD/ST funnel for memory access
        self.l0 = l0 = TstL0CacheBuffer(pspec, n_units=1)
        pi = l0.l0.dports[0]

        # function units (only one each)
        # only include mmu if enabled in pspec
        self.fus = AllFunctionUnits(pspec, pilist=[pi])

        # link LoadStore1 into MMU
        mmu = self.fus.get_fu('mmu0')
        ldst0 = self.fus.get_fu('ldst0')
        print ("core pspec", pspec.ldst_ifacetype)
        print ("core mmu", mmu)
        if mmu is not None:
            lsi = l0.cmpi.lsmem.lsi  # a LoadStore1 Interface object
            print ("core lsmem.lsi", lsi)
            mmu.alu.set_ldst_interface(lsi)
            # urr store I-Cache in core so it is easier to get at
            self.icache = lsi.icache

        # alternative reset values for STATE regs. these probably shouldn't
        # be set, here, instead have them done by Issuer. which they are.
        # as well. because core.state overrides them. sigh.
self.msr_at_reset = 0x0 self.pc_at_reset = 0x0 if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int): self.msr_at_reset = pspec.msr_reset if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int): self.pc_at_reset = pspec.pc_reset state_resets = [self.pc_at_reset, # PC at reset self.msr_at_reset, # MSR at reset 0x0, # SVSTATE at reset 0x0, # DEC at reset 0x0] # TB at reset # register files (yes plural) self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs, state_resets=state_resets) # set up input and output: unusual requirement to set data directly # (due to the way that the core is set up in a different domain, # see TestIssuer.setup_peripherals self.p.i_data, self.n.o_data = self.new_specs(None) self.i, self.o = self.p.i_data, self.n.o_data # actual internal input data used (captured) self.ireg = self.ispec() # create per-FU instruction decoders (subsetted). these "satellite" # decoders reduce wire fan-out from the one (main) PowerDecoder2 # (used directly by the trap unit) to the *twelve* (or more) # Function Units. we can either have 32 wires (the instruction) # to each, or we can have well over a 200 wire fan-out (to 12 # ALUs). it's an easy choice to make. self.decoders = {} self.des = {} # eep, these should be *per FU* i.e. for FunctionUnitBaseMulti # they should be shared (put into the ALU *once*). 
for funame, fu in self.fus.fus.items(): f_name = fu.fnunit.name fnunit = fu.fnunit.value opkls = fu.opsubsetkls if f_name == 'TRAP': # TRAP decoder is the *main* decoder self.trapunit = funame continue assert funame not in self.decoders self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name, final=True, state=self.ireg.state, svp64_en=self.svp64_en, regreduce_en=self.regreduce_en) self.des[funame] = self.decoders[funame].do print ("create decoder subset", funame, opkls, self.des[funame]) # create per-Function Unit write-after-write hazard signals # yes, really, this should have been added in ReservationStations # but hey. for funame, fu in self.fus.fus.items(): fu._waw_hazard = Signal(name="waw_%s" % funame) # share the SPR decoder with the MMU if it exists if "mmu0" in self.decoders: self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"] # allow pausing of the DEC/TB FSM back in Issuer, by spotting # if there is an MTSPR instruction self.pause_dec_tb = Signal() # next 3 functions are Stage API Compliance def setup(self, m, i): pass def ispec(self): return CoreInput(self.pspec, self.svp64_en, self.regreduce_en) def ospec(self): return CoreOutput() # elaborate function to create HDL def elaborate(self, platform): m = super().elaborate(platform) # for testing purposes, to cut down on build time in coriolis2 if hasattr(self.pspec, "nocore") and self.pspec.nocore == True: x = Signal() # dummy signal m.d.sync += x.eq(~x) return m comb = m.d.comb m.submodules.fus = self.fus m.submodules.l0 = l0 = self.l0 self.regs.elaborate_into(m, platform) regs = self.regs fus = self.fus.fus # amalgamate write-hazards into a single top-level Signal self.waw_hazard = Signal() whaz = [] for funame, fu in self.fus.fus.items(): whaz.append(fu._waw_hazard) comb += self.waw_hazard.eq(Cat(*whaz).bool()) # connect decoders self.connect_satellite_decoders(m) # ssh, cheat: trap uses the main decoder because of the rewriting self.des[self.trapunit] = self.ireg.e.do # connect up 
Function Units, then read/write ports, and hazard conflict self.issue_conflict = Signal() fu_bitdict, fu_selected = self.connect_instruction(m) raw_hazard = self.connect_rdports(m, fu_bitdict, fu_selected) self.connect_wrports(m, fu_bitdict, fu_selected) if self.allow_overlap: comb += self.issue_conflict.eq(raw_hazard) # note if an exception happened. in a pipelined or OoO design # this needs to be accompanied by "shadowing" (or stalling) el = [] for exc in self.fus.excs.values(): el.append(exc.happened) if len(el) > 0: # at least one exception comb += self.o.exc_happened.eq(Cat(*el).bool()) return m def connect_satellite_decoders(self, m): comb = m.d.comb for k, v in self.decoders.items(): # connect each satellite decoder and give it the instruction. # as subset decoders this massively reduces wire fanout given # the large number of ALUs m.submodules["dec_%s" % k] = v comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i) comb += v.dec.bigendian.eq(self.ireg.bigendian_i) # sigh due to SVP64 RA_OR_ZERO detection connect these too comb += v.sv_a_nz.eq(self.ireg.sv_a_nz) if not self.svp64_en: continue comb += v.pred_sm.eq(self.ireg.sv_pred_sm) comb += v.pred_dm.eq(self.ireg.sv_pred_dm) if k == self.trapunit: continue comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode) # only the LDST PowerDecodeSubset *actually* needs to # know to use the alternative decoder. this is all # a terrible hack if not k.lower().startswith("ldst"): continue comb += v.use_svp64_ldst_dec.eq( self.ireg.use_svp64_ldst_dec) def connect_instruction(self, m): """connect_instruction uses decoded (from PowerOp) function unit information from CSV files to ascertain which Function Unit should deal with the current instruction. some (such as OP_ATTN, OP_NOP) are dealt with here, including ignoring it and halting the processor. OP_NOP is a bit annoying because the issuer expects busy flag still to be raised then lowered. 
(this requires a fake counter to be set). """ comb, sync = m.d.comb, m.d.sync fus = self.fus.fus # indicate if core is busy busy_o = self.o.busy_o any_busy_o = self.o.any_busy_o # connect up temporary copy of incoming instruction. the FSM will # either blat the incoming instruction (if valid) into self.ireg # or if the instruction could not be delivered, keep dropping the # latched copy into ireg ilatch = self.ispec() self.instr_active = Signal() # enable/busy-signals for each FU, get one bit for each FU (by name) fu_enable = Signal(len(fus), reset_less=True) fu_busy = Signal(len(fus), reset_less=True) fu_bitdict = {} fu_selected = {} for i, funame in enumerate(fus.keys()): fu_bitdict[funame] = fu_enable[i] fu_selected[funame] = fu_busy[i] # identify function units and create a list by fnunit so that # PriorityPickers can be created for selecting one of them that # isn't busy at the time the incoming instruction needs passing on by_fnunit = defaultdict(list) for fname, member in Function.__members__.items(): for funame, fu in fus.items(): fnunit = fu.fnunit.value if member.value & fnunit: # this FU handles this type of op by_fnunit[fname].append((funame, fu)) # add by Function # ok now just print out the list of FUs by Function, because we can for fname, fu_list in by_fnunit.items(): print ("FUs by type", fname, fu_list) # now create a PriorityPicker per FU-type such that only one # non-busy FU will be picked issue_pps = {} fu_found = Signal() # take a note if no Function Unit was available for fname, fu_list in by_fnunit.items(): i_pp = PriorityPicker(len(fu_list)) m.submodules['i_pp_%s' % fname] = i_pp i_l = [] for i, (funame, fu) in enumerate(fu_list): # match the decoded instruction (e.do.fn_unit) against the # "capability" of this FU, gate that by whether that FU is # busy, and drop that into the PriorityPicker. # this will give us an output of the first available *non-busy* # Function Unit (Reservation Statio) capable of handling this # instruction. 
fnunit = fu.fnunit.value en_req = Signal(name="issue_en_%s" % funame, reset_less=True) fnmatch = (self.ireg.e.do.fn_unit & fnunit).bool() comb += en_req.eq(fnmatch & ~fu.busy_o & self.instr_active) i_l.append(en_req) # store in list for doing the Cat-trick # picker output, gated by enable: store in fu_bitdict po = Signal(name="o_issue_pick_"+funame) # picker output comb += po.eq(i_pp.o[i] & i_pp.en_o) comb += fu_bitdict[funame].eq(po) comb += fu_selected[funame].eq(fu.busy_o | po) # if we don't do this, then when there are no FUs available, # the "p.o_ready" signal will go back "ok we accepted this # instruction" which of course isn't true. with m.If(i_pp.en_o): comb += fu_found.eq(1) # for each input, Cat them together and drop them into the picker comb += i_pp.i.eq(Cat(*i_l)) # rdmask, which is for registers needs to come from the *main* decoder for funame, fu in fus.items(): rdmask = get_rdflags(m, self.ireg.e, fu) comb += fu.rdmaskn.eq(~rdmask) # sigh - need a NOP counter counter = Signal(2) with m.If(counter != 0): sync += counter.eq(counter - 1) comb += busy_o.eq(1) # default to reading from incoming instruction: may be overridden # by copy from latch when "waiting" comb += self.ireg.eq(self.i) # always say "ready" except if overridden comb += self.p.o_ready.eq(1) with m.FSM(): with m.State("READY"): with m.If(self.p.i_valid): # run only when valid with m.Switch(self.ireg.e.do.insn_type): # check for ATTN: halt if true with m.Case(MicrOp.OP_ATTN): m.d.sync += self.o.core_terminate_o.eq(1) # fake NOP - this isn't really used (Issuer detects NOP) with m.Case(MicrOp.OP_NOP): sync += counter.eq(2) comb += busy_o.eq(1) with m.Default(): comb += self.instr_active.eq(1) comb += self.p.o_ready.eq(0) # connect instructions. 
only one enabled at a time for funame, fu in fus.items(): do = self.des[funame] enable = fu_bitdict[funame] # run this FunctionUnit if enabled route op, # issue, busy, read flags and mask to FU with m.If(enable): # operand comes from the *local* decoder # do not actually issue, though, if there # is a waw hazard. decoder has to still # be asserted in order to detect that, tho comb += fu.oper_i.eq_from(do) if funame == 'mmu0': # URRR this is truly dreadful. # OP_FETCH_FAILED is a "fake" op. # no instruction creates it. OP_TRAP # uses the *main* decoder: this is # a *Satellite* decoder that reacts # on *insn_in*... not fake ops. gaah. main_op = self.ireg.e.do with m.If(main_op.insn_type == MicrOp.OP_FETCH_FAILED): comb += fu.oper_i.insn_type.eq( MicrOp.OP_FETCH_FAILED) comb += fu.oper_i.fn_unit.eq( Function.MMU) # issue when valid (and no write-hazard) comb += fu.issue_i.eq(~self.waw_hazard) # instruction ok, indicate ready comb += self.p.o_ready.eq(1) if self.allow_overlap: with m.If(~fu_found | self.waw_hazard): # latch copy of instruction sync += ilatch.eq(self.i) comb += self.p.o_ready.eq(1) # accept comb += busy_o.eq(1) m.next = "WAITING" with m.State("WAITING"): comb += self.instr_active.eq(1) comb += self.p.o_ready.eq(0) comb += busy_o.eq(1) # using copy of instruction, keep waiting until an FU is free comb += self.ireg.eq(ilatch) with m.If(fu_found): # wait for conflict to clear # connect instructions. only one enabled at a time for funame, fu in fus.items(): do = self.des[funame] enable = fu_bitdict[funame] # run this FunctionUnit if enabled route op, # issue, busy, read flags and mask to FU with m.If(enable): # operand comes from the *local* decoder, # which is asserted even if not issued, # so that WaW-detection can check for hazards. 
# only if the waw hazard is clear does the # instruction actually get issued comb += fu.oper_i.eq_from(do) # issue when valid comb += fu.issue_i.eq(~self.waw_hazard) with m.If(~self.waw_hazard): comb += self.p.o_ready.eq(1) comb += busy_o.eq(0) m.next = "READY" print ("core: overlap allowed", self.allow_overlap) # true when any FU is busy (including the cycle where it is perhaps # to be issued - because that's what fu_busy is) comb += any_busy_o.eq(fu_busy.bool()) if not self.allow_overlap: # for simple non-overlap, if any instruction is busy, set # busy output for core. comb += busy_o.eq(any_busy_o) else: # sigh deal with a fun situation that needs to be investigated # and resolved with m.If(self.issue_conflict): comb += busy_o.eq(1) # make sure that LDST, SPR, MMU, Branch and Trap all say "busy" # and do not allow overlap. these are all the ones that # are non-forward-progressing: exceptions etc. that otherwise # change CoreState for some reason (MSR, PC, SVSTATE) for funame, fu in fus.items(): if (funame.lower().startswith('ldst') or funame.lower().startswith('branch') or funame.lower().startswith('mmu') or funame.lower().startswith('spr') or funame.lower().startswith('trap')): with m.If(fu.busy_o): comb += busy_o.eq(1) # for SPR pipeline pause dec/tb FSM to avoid race condition # TODO: really this should be much more sophisticated, # spot MTSPR, spot that DEC/TB is what is to be updated. # a job for PowerDecoder2, there if funame.lower().startswith('spr'): with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR ): comb += self.pause_dec_tb.eq(1) # return both the function unit "enable" dict as well as the "busy". 
# the "busy-or-issued" can be passed in to the Read/Write port # connecters to give them permission to request access to regfiles return fu_bitdict, fu_selected def connect_rdport(self, m, fu_bitdict, fu_selected, rdpickers, regfile, regname, fspec): comb, sync = m.d.comb, m.d.sync fus = self.fus.fus regs = self.regs rpidx = regname # select the required read port. these are pre-defined sizes rfile = regs.rf[regfile.lower()] rport = rfile.r_ports[rpidx] print("read regfile", rpidx, regfile, regs.rf.keys(), rfile, rfile.unary) # for checking if the read port has an outstanding write if self.make_hazard_vecs: wv = regs.wv[regfile.lower()] wvchk = wv.q_int # write-vec bit-level hazard check # if a hazard is detected on this read port, simply blithely block # every FU from reading on it. this is complete overkill but very # simple for now. hazard_detected = Signal(name="raw_%s_%s" % (regfile, rpidx)) fspecs = fspec if not isinstance(fspecs, list): fspecs = [fspecs] rdflags = [] pplen = 0 ppoffs = [] for i, fspec in enumerate(fspecs): # get the regfile specs for this regfile port print ("fpsec", i, fspec, len(fspec.specs)) name = "%s_%s_%d" % (regfile, regname, i) ppoffs.append(pplen) # record offset for picker pplen += len(fspec.specs) rdflag = Signal(name="rdflag_"+name, reset_less=True) comb += rdflag.eq(fspec.okflag) rdflags.append(rdflag) print ("pplen", pplen) # create a priority picker to manage this port rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen) m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick rens = [] addrs = [] wvens = [] for i, fspec in enumerate(fspecs): (rf, _read, wid, fuspecs) = \ (fspec.okflag, fspec.regport, fspec.wid, fspec.specs) # connect up the FU req/go signals, and the reg-read to the FU # and create a Read Broadcast Bus for pi, fuspec in enumerate(fspec.specs): (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx) pi += ppoffs[i] name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi) fu_active = fu_selected[funame] 
fu_issued = fu_bitdict[funame] # get (or set up) a latched copy of read register number # and (sigh) also the read-ok flag # TODO: use nmutil latchregister rhname = "%s_%s_%d" % (regfile, regname, i) rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname), reset_less=True) if rhname not in fu.rf_latches: rfl = Signal(name="rdflag_latch_%s_%s" % (funame, rhname)) fu.rf_latches[rhname] = rfl with m.If(fu.issue_i): sync += rfl.eq(rdflags[i]) else: rfl = fu.rf_latches[rhname] # now the register port rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi) read = Signal.like(_read, name="read_"+rname) if rname not in fu.rd_latches: rdl = Signal.like(_read, name="rdlatch_"+rname) fu.rd_latches[rname] = rdl with m.If(fu.issue_i): sync += rdl.eq(_read) else: rdl = fu.rd_latches[rname] # make the read immediately available on issue cycle # after the read cycle, otherwies use the latched copy. # this captures the regport and okflag on issue with m.If(fu.issue_i): comb += read.eq(_read) comb += rdflag.eq(rdflags[i]) with m.Else(): comb += read.eq(rdl) comb += rdflag.eq(rfl) # connect request-read to picker input, and output to go-rd addr_en = Signal.like(read, name="addr_en_"+name) pick = Signal(name="pick_"+name) # picker input rp = Signal(name="rp_"+name) # picker output delay_pick = Signal(name="dp_"+name) # read-enable "underway" rhazard = Signal(name="rhaz_"+name) # exclude any currently-enabled read-request (mask out active) # entirely block anything hazarded from being picked comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag & ~delay_pick & ~rhazard) comb += rdpick.i[pi].eq(pick) comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick # if picked, select read-port "reg select" number to port comb += rp.eq(rdpick.o[pi] & rdpick.en_o) sync += delay_pick.eq(rp) # delayed "pick" comb += addr_en.eq(Mux(rp, read, 0)) # the read-enable happens combinatorially (see mux-bus below) # but it results in the data coming out on a one-cycle delay. 
if rfile.unary: rens.append(addr_en) else: addrs.append(addr_en) rens.append(rp) # use the *delayed* pick signal to put requested data onto bus with m.If(delay_pick): # connect regfile port to input, creating fan-out Bus src = fu.src_i[idx] print("reg connect widths", regfile, regname, pi, funame, src.shape(), rport.o_data.shape()) # all FUs connect to same port comb += src.eq(rport.o_data) if not self.make_hazard_vecs: continue # read the write-hazard bitvector (wv) for any bit that is wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name) issue_active = Signal(name="rd_iactive_"+name) # XXX combinatorial loop here comb += issue_active.eq(fu_active & rdflag) with m.If(issue_active): if rfile.unary: comb += wvchk_en.eq(read) else: comb += wvchk_en.eq(1< clear bit wvseten.append(wv_issue_en) # set data same as enable # read the write-hazard bitvector (wv) for any bit that is fu_requested = fu_bitdict[funame] wvchk_en = Signal(len(wvchk), name="waw_chk_addr_en_"+name) issue_active = Signal(name="waw_iactive_"+name) whazard = Signal(name="whaz_"+name) if wf is None: # XXX EEK! STATE regfile (branch) does not have an # write-active indicator in regspec_decode_write() print ("XXX FIXME waw_iactive", issue_active, fu_requested, wf) else: # check bits from the incoming instruction. note (back # in connect_instruction) that the decoder is held for # us to be able to do this, here... *without* issue being # held HI. we MUST NOT gate this with fu.issue_i or # with fu_bitdict "enable": it would create a loop comb += issue_active.eq(wf) with m.If(issue_active): if rfile.unary: comb += wvchk_en.eq(write) else: comb += wvchk_en.eq(1<