from nmigen import Elaboratable, Module, Signal, ResetSignal, Cat, Mux
from nmigen.cli import rtlil
-from soc.decoder.power_decoder2 import PowerDecodeSubset
-from soc.decoder.power_regspec_map import regspec_decode_read
-from soc.decoder.power_regspec_map import regspec_decode_write
+from openpower.decoder.power_decoder2 import PowerDecodeSubset
+from openpower.decoder.power_regspec_map import regspec_decode_read
+from openpower.decoder.power_regspec_map import regspec_decode_write
+from openpower.sv.svp64 import SVP64Rec
from nmutil.picker import PriorityPicker
from nmutil.util import treereduce
+from nmutil.singlepipe import ControlBase
from soc.fu.compunits.compunits import AllFunctionUnits
from soc.regfile.regfiles import RegFiles
-from soc.decoder.decode2execute1 import Decode2ToExecute1Type
-from soc.decoder.decode2execute1 import IssuerDecode2ToOperand
-from soc.decoder.power_decoder2 import get_rdflags
-from soc.decoder.decode2execute1 import Data
+from openpower.decoder.decode2execute1 import Decode2ToExecute1Type
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from openpower.decoder.power_decoder2 import get_rdflags
+from openpower.decoder.decode2execute1 import Data
from soc.experiment.l0_cache import TstL0CacheBuffer # test only
from soc.config.test.test_loadstore import TestMemPspec
-from soc.decoder.power_enums import MicrOp
+from openpower.decoder.power_enums import MicrOp
from soc.config.state import CoreState
import operator
# helper function for reducing a list of signals down to a parallel
# ORed single signal.
-def ortreereduce(tree, attr="data_o"):
+def ortreereduce(tree, attr="o_data"):
return treereduce(tree, operator.or_, lambda x: getattr(x, attr))
return res # enumerate(res)
-class NonProductionCore(Elaboratable):
- def __init__(self, pspec):
- self.pspec = pspec
+class CoreInput:
+ """CoreInput: this is the input specification for Signals coming into core.
- # single LD/ST funnel for memory access
- self.l0 = TstL0CacheBuffer(pspec, n_units=1)
- pi = self.l0.l0.dports[0]
+ * state. this contains PC, MSR, and SVSTATE. this is crucial information.
+ (TODO: bigendian_i should really be read from the relevant MSR bit)
- if False:
- # MMU / DCache
- self.mmu = MMU()
- self.dcache = DCache()
+ * the previously-decoded instruction goes into the Decode2ToExecute1Type
+ data structure. no need for Core to re-decode that. however note
+ that *satellite* decoders *are* part of Core.
- # function units (only one each)
- self.fus = AllFunctionUnits(pspec, pilist=[pi])
+ * the raw instruction. this is used by satellite decoders internal to
+ Core, to provide Function-Unit-specific information. really, they
+ should be part of the actual ALU itself (in order to reduce wires),
+ but hey.
- # register files (yes plural)
- self.regs = RegFiles()
+ * other stuff is related to SVP64. the 24-bit SV REMAP field containing
+ Vector context, etc.
+ """
+ def __init__(self, pspec, svp64_en, regreduce_en):
+ self.pspec = pspec
+ self.svp64_en = svp64_en
+ self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand,
+ regreduce_en=regreduce_en)
- # instruction decoder - needs a Trap-capable Record (captures EINT etc.)
- self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand)
+ # SVP64 RA_OR_ZERO needs to know if the relevant EXTRA2/3 field is zero
+ self.sv_a_nz = Signal()
+ # state and raw instruction (and SVP64 ReMap fields)
self.state = CoreState("core")
self.raw_insn_i = Signal(32) # raw instruction
- self.bigendian_i = Signal() # bigendian
-
- # issue/valid/busy signalling
- self.ivalid_i = Signal(reset_less=True) # instruction is valid
- self.issue_i = Signal(reset_less=True)
- self.busy_o = Signal(name="corebusy_o", reset_less=True)
-
+ self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
+ if svp64_en:
+ self.sv_rm = SVP64Rec(name="core_svp64_rm") # SVP64 RM field
+ self.is_svp64_mode = Signal() # set if SVP64 mode is enabled
+ self.use_svp64_ldst_dec = Signal() # use alternative LDST decoder
+ self.sv_pred_sm = Signal() # TODO: SIMD width
+ self.sv_pred_dm = Signal() # TODO: SIMD width
+
+ def eq(self, i):
+ self.e.eq(i.e)
+ self.sv_a_nz.eq(i.sv_a_nz)
+ self.state.eq(i.state)
+ self.raw_insn_i.eq(i.raw_insn_i)
+ self.bigendian_i.eq(i.bigendian_i)
+ if not self.svp64_en:
+ return
+ self.sv_rm.eq(i.sv_rm)
+ self.is_svp64_mode.eq(i.is_svp64_mode)
+ self.use_svp64_ldst_dec.eq(i.use_svp64_ldst_dec)
+ self.sv_pred_sm.eq(i.sv_pred_sm)
+ self.sv_pred_dm.eq(i.sv_pred_dm)
+
+
+class CoreOutput:
+ def __init__(self):
# start/stop and terminated signalling
- self.core_stopped_i = Signal(reset_less=True)
- self.core_reset_i = Signal()
self.core_terminate_o = Signal(reset=0) # indicates stopped
+ self.exc_happened = Signal() # exception happened
+
+ def eq(self, i):
+ self.core_terminate_o.eq(i.core_terminate_o)
+ self.exc_happened.eq(i.exc_happened)
+
+
+# derive from ControlBase rather than have a separate Stage instance,
+# this is simpler to do
+class NonProductionCore(ControlBase):
+ def __init__(self, pspec):
+ self.pspec = pspec
+
+ # test if SVP64 is to be enabled
+ self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
+
+ # test to see if regfile ports should be reduced
+ self.regreduce_en = (hasattr(pspec, "regreduce") and
+ (pspec.regreduce == True))
+
+ super().__init__(stage=self)
+
+ # single LD/ST funnel for memory access
+ self.l0 = l0 = TstL0CacheBuffer(pspec, n_units=1)
+ pi = l0.l0.dports[0]
+
+ # function units (only one each)
+ # only include mmu if enabled in pspec
+ self.fus = AllFunctionUnits(pspec, pilist=[pi])
+
+ # link LoadStore1 into MMU
+ mmu = self.fus.get_fu('mmu0')
+ print ("core pspec", pspec.ldst_ifacetype)
+ print ("core mmu", mmu)
+ print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
+ if mmu is not None:
+ mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+
+ # register files (yes plural)
+ self.regs = RegFiles(pspec)
+
+ # set up input and output: unusual requirement to set data directly
+ # (due to the way that the core is set up in a different domain,
+ # see TestIssuer.setup_peripherals)
+ self.i, self.o = self.new_specs(None)
+ self.i, self.o = self.p.i_data, self.n.o_data
# create per-FU instruction decoders (subsetted)
self.decoders = {}
fnunit = fu.fnunit.value
opkls = fu.opsubsetkls
if f_name == 'TRAP':
+ # TRAP decoder is the *main* decoder
self.trapunit = funame
continue
self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
final=True,
- state=self.state)
+ state=self.i.state,
+ svp64_en=self.svp64_en,
+ regreduce_en=self.regreduce_en)
self.des[funame] = self.decoders[funame].do
+ if "mmu0" in self.decoders:
+ self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
+
+ def setup(self, m, i):
+ pass
+
+ def ispec(self):
+ return CoreInput(self.pspec, self.svp64_en, self.regreduce_en)
+
+ def ospec(self):
+ return CoreOutput()
+
def elaborate(self, platform):
- m = Module()
+ m = super().elaborate(platform)
+
# for testing purposes, to cut down on build time in coriolis2
if hasattr(self.pspec, "nocore") and self.pspec.nocore == True:
+ x = Signal() # dummy signal
+ m.d.sync += x.eq(~x)
return m
comb = m.d.comb
# connect decoders
for k, v in self.decoders.items():
+ # connect each satellite decoder and give it the instruction.
+ # as subset decoders this massively reduces wire fanout given
+ # the large number of ALUs
setattr(m.submodules, "dec_%s" % v.fn_name, v)
- comb += v.dec.raw_opcode_in.eq(self.raw_insn_i)
- comb += v.dec.bigendian.eq(self.bigendian_i)
+ comb += v.dec.raw_opcode_in.eq(self.i.raw_insn_i)
+ comb += v.dec.bigendian.eq(self.i.bigendian_i)
+ # sigh: due to SVP64 RA_OR_ZERO detection, connect these too
+ comb += v.sv_a_nz.eq(self.i.sv_a_nz)
+ if self.svp64_en:
+ comb += v.pred_sm.eq(self.i.sv_pred_sm)
+ comb += v.pred_dm.eq(self.i.sv_pred_dm)
+ if k != self.trapunit:
+ comb += v.sv_rm.eq(self.i.sv_rm) # pass through SVP64 ReMap
+ comb += v.is_svp64_mode.eq(self.i.is_svp64_mode)
+ # only the LDST PowerDecodeSubset *actually* needs to
+ # know to use the alternative decoder. this is all
+ # a terrible hack
+ if k.lower().startswith("ldst"):
+ comb += v.use_svp64_ldst_dec.eq(
+ self.i.use_svp64_ldst_dec)
# ssh, cheat: trap uses the main decoder because of the rewriting
- self.des[self.trapunit] = self.e.do
+ self.des[self.trapunit] = self.i.e.do
# connect up Function Units, then read/write ports
fu_bitdict = self.connect_instruction(m)
self.connect_rdports(m, fu_bitdict)
self.connect_wrports(m, fu_bitdict)
- # connect up reset
- m.d.comb += ResetSignal().eq(self.core_reset_i)
+ # note if an exception happened. in a pipelined or OoO design
+ # this needs to be accompanied by "shadowing" (or stalling)
+ el = []
+ for exc in self.fus.excs.values():
+ el.append(exc.happened)
+ if len(el) > 0: # at least one exception
+ comb += self.o.exc_happened.eq(Cat(*el).bool())
return m
comb, sync = m.d.comb, m.d.sync
fus = self.fus.fus
+ # indicate if core is busy
+ busy_o = Signal(name="corebusy_o", reset_less=True)
+
# enable-signals for each FU, get one bit for each FU (by name)
fu_enable = Signal(len(fus), reset_less=True)
fu_bitdict = {}
# enable the required Function Unit based on the opcode decode
# note: this *only* works correctly for simple core when one and
- # *only* one FU is allocated per instruction
+ # *only* one FU is allocated per instruction. what is actually
+ # required is one PriorityPicker per group of matching fnunits,
+ # and for only one actual FU to be "picked". this basically means
+ # when ReservationStations are enabled it will be possible to
+ # monitor multiple outstanding processing properly.
for funame, fu in fus.items():
fnunit = fu.fnunit.value
enable = Signal(name="en_%s" % funame, reset_less=True)
- comb += enable.eq((self.e.do.fn_unit & fnunit).bool())
+ comb += enable.eq((self.i.e.do.fn_unit & fnunit).bool())
comb += fu_bitdict[funame].eq(enable)
# sigh - need a NOP counter
counter = Signal(2)
with m.If(counter != 0):
sync += counter.eq(counter - 1)
- comb += self.busy_o.eq(1)
+ comb += busy_o.eq(1)
- with m.If(self.ivalid_i): # run only when valid
- with m.Switch(self.e.do.insn_type):
+ with m.If(self.p.i_valid): # run only when valid
+ with m.Switch(self.i.e.do.insn_type):
# check for ATTN: halt if true
with m.Case(MicrOp.OP_ATTN):
- m.d.sync += self.core_terminate_o.eq(1)
+ m.d.sync += self.o.core_terminate_o.eq(1)
+ # fake NOP - this isn't really used (Issuer detects NOP)
with m.Case(MicrOp.OP_NOP):
sync += counter.eq(2)
- comb += self.busy_o.eq(1)
+ comb += busy_o.eq(1)
with m.Default():
# connect up instructions. only one enabled at a time
with m.If(enable):
# operand comes from the *local* decoder
comb += fu.oper_i.eq_from(do)
- #comb += fu.oper_i.eq_from_execute1(e)
- comb += fu.issue_i.eq(self.issue_i)
- comb += self.busy_o.eq(fu.busy_o)
+ comb += fu.issue_i.eq(1) # issue when input valid
+ comb += busy_o.eq(fu.busy_o)
# rdmask, which is for registers, needs to come
# from the *main* decoder
- rdmask = get_rdflags(self.e, fu)
+ rdmask = get_rdflags(self.i.e, fu)
comb += fu.rdmaskn.eq(~rdmask)
+ # if instruction is busy, set busy output for core. also
+ # continue to hold each fu rdmask
+ for funame, fu in fus.items():
+ with m.If(fu.busy_o):
+ comb += busy_o.eq(fu.busy_o)
+
+ # set ready/valid signalling. if busy, means refuse incoming issue
+ # XXX note: for an in-order core this is far too simple. busy must
+ # be gated with the *availability* of the incoming (requested)
+ # instruction, where Core must be prepared to store-and-hold
+ # an instruction if no FU is available.
+ comb += self.p.o_ready.eq(~busy_o)
+
return fu_bitdict
def connect_rdport(self, m, fu_bitdict, rdpickers, regfile, regname, fspec):
src = fu.src_i[idx]
print("reg connect widths",
regfile, regname, pi, funame,
- src.shape(), rport.data_o.shape())
+ src.shape(), rport.o_data.shape())
# all FUs connect to same port
- comb += src.eq(rport.data_o)
+ comb += src.eq(rport.o_data)
# or-reduce the muxed read signals
if rfile.unary:
# argh. an experiment to merge RA and RB in the INT regfile
# (we have too many read/write ports)
- #if regfile == 'INT':
- #fuspecs['rabc'] = [fuspecs.pop('rb')]
- #fuspecs['rabc'].append(fuspecs.pop('rc'))
- #fuspecs['rabc'].append(fuspecs.pop('ra'))
- #if regfile == 'FAST':
- # fuspecs['fast1'] = [fuspecs.pop('fast1')]
- # if 'fast2' in fuspecs:
- # fuspecs['fast1'].append(fuspecs.pop('fast2'))
+ if self.regreduce_en:
+ if regfile == 'INT':
+ fuspecs['rabc'] = [fuspecs.pop('rb')]
+ fuspecs['rabc'].append(fuspecs.pop('rc'))
+ fuspecs['rabc'].append(fuspecs.pop('ra'))
+ if regfile == 'FAST':
+ fuspecs['fast1'] = [fuspecs.pop('fast1')]
+ if 'fast2' in fuspecs:
+ fuspecs['fast1'].append(fuspecs.pop('fast2'))
+ if 'fast3' in fuspecs:
+ fuspecs['fast1'].append(fuspecs.pop('fast3'))
# for each named regfile port, connect up all FUs to that port
for (regname, fspec) in sort_fuspecs(fuspecs):
pick = fu.wr.rel_o[idx] & fu_active # & wrflag
comb += wrpick.i[pi].eq(pick)
# create a single-pulse go write from the picker output
- wr_pick = Signal()
+ wr_pick = Signal(name="wpick_%s_%s_%d" % (funame, regname, idx))
comb += wr_pick.eq(wrpick.o[pi] & wrpick.en_o)
comb += fu.go_wr_i[idx].eq(rising_edge(m, wr_pick))
# connect regfile port to input
print("reg connect widths",
regfile, regname, pi, funame,
- dest.shape(), wport.data_i.shape())
+ dest.shape(), wport.i_data.shape())
wsigs.append(fu_dest_latch)
# here is where we create the Write Broadcast Bus. simple, eh?
- comb += wport.data_i.eq(ortreereduce_sig(wsigs))
+ comb += wport.i_data.eq(ortreereduce_sig(wsigs))
if rfile.unary:
# for unary-addressed
comb += wport.wen.eq(ortreereduce_sig(wens))
fuspecs = byregfiles_wrspec[regfile]
wrpickers[regfile] = {}
- # argh, more port-merging
- if regfile == 'INT':
- fuspecs['o'] = [fuspecs.pop('o')]
- fuspecs['o'].append(fuspecs.pop('o1'))
- if regfile == 'FAST':
- fuspecs['fast1'] = [fuspecs.pop('fast1')]
- if 'fast2' in fuspecs:
- fuspecs['fast1'].append(fuspecs.pop('fast2'))
+ if self.regreduce_en:
+ # argh, more port-merging
+ if regfile == 'INT':
+ fuspecs['o'] = [fuspecs.pop('o')]
+ fuspecs['o'].append(fuspecs.pop('o1'))
+ if regfile == 'FAST':
+ fuspecs['fast1'] = [fuspecs.pop('fast1')]
+ if 'fast2' in fuspecs:
+ fuspecs['fast1'].append(fuspecs.pop('fast2'))
+ if 'fast3' in fuspecs:
+ fuspecs['fast1'].append(fuspecs.pop('fast3'))
for (regname, fspec) in sort_fuspecs(fuspecs):
self.connect_wrport(m, fu_bitdict, wrpickers,
mode = "read" if readmode else "write"
regs = self.regs
fus = self.fus.fus
- e = self.e # decoded instruction to execute
+ e = self.i.e # decoded instruction to execute
# dictionary of lists of regfile ports
byregfiles = {}
def __iter__(self):
yield from self.fus.ports()
- yield from self.e.ports()
+ yield from self.i.e.ports()
yield from self.l0.ports()
# TODO: regs