MultiCompUnit fixed to not need rdmask to be sustained indefinitely
[soc.git] / src / soc / simple / core.py
index 7fa96c0a3d8235e846f2ae8694212ce63ebae33d..1c67c87b3f5c783c72131918e4efff317babdb4c 100644 (file)
@@ -22,22 +22,24 @@ before allowing a new instruction to proceed.
 from nmigen import Elaboratable, Module, Signal, ResetSignal, Cat, Mux
 from nmigen.cli import rtlil
 
-from soc.decoder.power_decoder2 import PowerDecodeSubset
-from soc.decoder.power_regspec_map import regspec_decode_read
-from soc.decoder.power_regspec_map import regspec_decode_write
+from openpower.decoder.power_decoder2 import PowerDecodeSubset
+from openpower.decoder.power_regspec_map import regspec_decode_read
+from openpower.decoder.power_regspec_map import regspec_decode_write
+from openpower.sv.svp64 import SVP64Rec
 
 from nmutil.picker import PriorityPicker
 from nmutil.util import treereduce
+from nmutil.singlepipe import ControlBase
 
 from soc.fu.compunits.compunits import AllFunctionUnits
 from soc.regfile.regfiles import RegFiles
-from soc.decoder.decode2execute1 import Decode2ToExecute1Type
-from soc.decoder.decode2execute1 import IssuerDecode2ToOperand
-from soc.decoder.power_decoder2 import get_rdflags
-from soc.decoder.decode2execute1 import Data
+from openpower.decoder.decode2execute1 import Decode2ToExecute1Type
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from openpower.decoder.power_decoder2 import get_rdflags
+from openpower.decoder.decode2execute1 import Data
 from soc.experiment.l0_cache import TstL0CacheBuffer  # test only
 from soc.config.test.test_loadstore import TestMemPspec
-from soc.decoder.power_enums import MicrOp
+from openpower.decoder.power_enums import MicrOp
 from soc.config.state import CoreState
 
 import operator
@@ -47,7 +49,7 @@ from nmutil.util import rising_edge
 
 # helper function for reducing a list of signals down to a parallel
 # ORed single signal.
-def ortreereduce(tree, attr="data_o"):
+def ortreereduce(tree, attr="o_data"):
     return treereduce(tree, operator.or_, lambda x: getattr(x, attr))
 
 
@@ -67,61 +69,148 @@ def sort_fuspecs(fuspecs):
     return res  # enumerate(res)
 
 
-class NonProductionCore(Elaboratable):
-    def __init__(self, pspec):
+class CoreInput:
+    """CoreInput: this is the input specification for Signals coming into core.
 
-        # single LD/ST funnel for memory access
-        self.l0 = TstL0CacheBuffer(pspec, n_units=1)
-        pi = self.l0.l0.dports[0]
+    * state.  this contains PC, MSR, and SVSTATE. this is crucial information.
+      (TODO: bigendian_i should really be read from the relevant MSR bit)
 
-        if False:
-            # MMU / DCache
-            self.mmu = MMU()
-            self.dcache = DCache()
+    * the previously-decoded instruction goes into the Decode2Execute1Type
+      data structure. no need for Core to re-decode that.  however note
+      that *satellite* decoders *are* part of Core.
 
-        # function units (only one each)
-        self.fus = AllFunctionUnits(pspec, pilist=[pi])
+    * the raw instruction. this is used by satellite decoders internal to
+      Core, to provide Function-Unit-specific information.  really, they
+      should be part of the actual ALU itself (in order to reduce wires),
+      but hey.
 
-        # register files (yes plural)
-        self.regs = RegFiles()
+    * other stuff is related to SVP64.  the 24-bit SV REMAP field containing
+      Vector context, etc.
+    """
+    def __init__(self, pspec, svp64_en, regreduce_en):
+        self.pspec = pspec
+        self.svp64_en = svp64_en
+        self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand,
+                                regreduce_en=regreduce_en)
 
-        # instruction decoder - needs a Trap-capable Record (captures EINT etc.)
-        self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand)
+        # SVP64 RA_OR_ZERO needs to know if the relevant EXTRA2/3 field is zero
+        self.sv_a_nz = Signal()
 
+        # state and raw instruction (and SVP64 ReMap fields)
         self.state = CoreState("core")
         self.raw_insn_i = Signal(32) # raw instruction
-        self.bigendian_i = Signal() # bigendian
-
-        # issue/valid/busy signalling
-        self.ivalid_i = Signal(reset_less=True) # instruction is valid
-        self.issue_i = Signal(reset_less=True)
-        self.busy_o = Signal(name="corebusy_o", reset_less=True)
-
+        self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
+        if svp64_en:
+            self.sv_rm = SVP64Rec(name="core_svp64_rm") # SVP64 RM field
+            self.is_svp64_mode = Signal() # set if SVP64 mode is enabled
+            self.use_svp64_ldst_dec = Signal() # use alternative LDST decoder
+            self.sv_pred_sm = Signal() # TODO: SIMD width
+            self.sv_pred_dm = Signal() # TODO: SIMD width
+
+    def eq(self, i):
+        self.e.eq(i.e)
+        self.sv_a_nz.eq(i.sv_a_nz)
+        self.state.eq(i.state)
+        self.raw_insn_i.eq(i.raw_insn_i)
+        self.bigendian_i.eq(i.bigendian_i)
+        if not self.svp64_en:
+            return
+        self.sv_rm.eq(i.sv_rm)
+        self.is_svp64_mode.eq(i.is_svp64_mode)
+        self.use_svp64_ldst_dec.eq(i.use_svp64_ldst_dec)
+        self.sv_pred_sm.eq(i.sv_pred_sm)
+        self.sv_pred_dm.eq(i.sv_pred_dm)
+
+
+class CoreOutput:
+    def __init__(self):
         # start/stop and terminated signalling
-        self.core_stopped_i = Signal(reset_less=True)
-        self.core_reset_i = Signal()
         self.core_terminate_o = Signal(reset=0)  # indicates stopped
+        self.exc_happened = Signal()             # exception happened
+
+    def eq(self, i):
+        self.core_terminate_o.eq(i.core_terminate_o)
+        self.exc_happened.eq(i.exc_happened)
+
+
+# derive from ControlBase rather than have a separate Stage instance,
+# this is simpler to do
+class NonProductionCore(ControlBase):
+    def __init__(self, pspec):
+        self.pspec = pspec
+
+        # test if SVP64 is to be enabled
+        self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
+
+        # test to see if regfile ports should be reduced
+        self.regreduce_en = (hasattr(pspec, "regreduce") and
+                             (pspec.regreduce == True))
+
+        super().__init__(stage=self)
+
+        # single LD/ST funnel for memory access
+        self.l0 = l0 = TstL0CacheBuffer(pspec, n_units=1)
+        pi = l0.l0.dports[0]
+
+        # function units (only one each)
+        # only include mmu if enabled in pspec
+        self.fus = AllFunctionUnits(pspec, pilist=[pi])
+
+        # link LoadStore1 into MMU
+        mmu = self.fus.get_fu('mmu0')
+        print ("core pspec", pspec.ldst_ifacetype)
+        print ("core mmu", mmu)
+        print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
+        if mmu is not None:
+            mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+
+        # register files (yes plural)
+        self.regs = RegFiles(pspec)
+
+        # set up input and output: unusual requirement to set data directly
+        # (due to the way that the core is set up in a different domain,
+        # see TestIssuer.setup_peripherals)
+        self.i, self.o = self.new_specs(None)
+        self.i, self.o = self.p.i_data, self.n.o_data
 
         # create per-FU instruction decoders (subsetted)
         self.decoders = {}
-        self.ees = {}
+        self.des = {}
 
         for funame, fu in self.fus.fus.items():
             f_name = fu.fnunit.name
             fnunit = fu.fnunit.value
             opkls = fu.opsubsetkls
             if f_name == 'TRAP':
+                # TRAP decoder is the *main* decoder
                 self.trapunit = funame
                 continue
             self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
                                                       final=True,
-                                                      state=self.state)
-            self.ees[funame] = self.decoders[funame].e
+                                                      state=self.i.state,
+                                            svp64_en=self.svp64_en,
+                                            regreduce_en=self.regreduce_en)
+            self.des[funame] = self.decoders[funame].do
+
+        if "mmu0" in self.decoders:
+            self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
+
+    def setup(self, m, i):
+        pass
+
+    def ispec(self):
+        return CoreInput(self.pspec, self.svp64_en, self.regreduce_en)
+
+    def ospec(self):
+        return CoreOutput()
 
     def elaborate(self, platform):
-        m = Module()
+        m = super().elaborate(platform)
+
         # for testing purposes, to cut down on build time in coriolis2
         if hasattr(self.pspec, "nocore") and self.pspec.nocore == True:
+            x = Signal() # dummy signal
+            m.d.sync += x.eq(~x)
             return m
         comb = m.d.comb
 
@@ -133,20 +222,42 @@ class NonProductionCore(Elaboratable):
 
         # connect decoders
         for k, v in self.decoders.items():
+            # connect each satellite decoder and give it the instruction.
+            # as subset decoders this massively reduces wire fanout given
+            # the large number of ALUs
             setattr(m.submodules, "dec_%s" % v.fn_name, v)
-            comb += v.dec.raw_opcode_in.eq(self.raw_insn_i)
-            comb += v.dec.bigendian.eq(self.bigendian_i)
+            comb += v.dec.raw_opcode_in.eq(self.i.raw_insn_i)
+            comb += v.dec.bigendian.eq(self.i.bigendian_i)
+            # sigh due to SVP64 RA_OR_ZERO detection connect these too
+            comb += v.sv_a_nz.eq(self.i.sv_a_nz)
+            if self.svp64_en:
+                comb += v.pred_sm.eq(self.i.sv_pred_sm)
+                comb += v.pred_dm.eq(self.i.sv_pred_dm)
+                if k != self.trapunit:
+                    comb += v.sv_rm.eq(self.i.sv_rm) # pass through SVP64 ReMap
+                    comb += v.is_svp64_mode.eq(self.i.is_svp64_mode)
+                    # only the LDST PowerDecodeSubset *actually* needs to
+                    # know to use the alternative decoder.  this is all
+                    # a terrible hack
+                    if k.lower().startswith("ldst"):
+                        comb += v.use_svp64_ldst_dec.eq(
+                                        self.i.use_svp64_ldst_dec)
 
         # ssh, cheat: trap uses the main decoder because of the rewriting
-        self.ees[self.trapunit] = self.e
+        self.des[self.trapunit] = self.i.e.do
 
         # connect up Function Units, then read/write ports
         fu_bitdict = self.connect_instruction(m)
         self.connect_rdports(m, fu_bitdict)
         self.connect_wrports(m, fu_bitdict)
 
-        # connect up reset
-        m.d.comb += ResetSignal().eq(self.core_reset_i)
+        # note if an exception happened.  in a pipelined or OoO design
+        # this needs to be accompanied by "shadowing" (or stalling)
+        el = []
+        for exc in self.fus.excs.values():
+            el.append(exc.happened)
+        if len(el) > 0: # at least one exception
+            comb += self.o.exc_happened.eq(Cat(*el).bool())
 
         return m
 
@@ -165,6 +276,9 @@ class NonProductionCore(Elaboratable):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
 
+        # indicate if core is busy
+        busy_o = Signal(name="corebusy_o", reset_less=True)
+
         # enable-signals for each FU, get one bit for each FU (by name)
         fu_enable = Signal(len(fus), reset_less=True)
         fu_bitdict = {}
@@ -173,48 +287,65 @@ class NonProductionCore(Elaboratable):
 
         # enable the required Function Unit based on the opcode decode
         # note: this *only* works correctly for simple core when one and
-        # *only* one FU is allocated per instruction
+        # *only* one FU is allocated per instruction.  what is actually
+        # required is one PriorityPicker per group of matching fnunits,
+        # and for only one actual FU to be "picked".  this basically means
+        # when ReservationStations are enabled it will be possible to
+        # monitor multiple outstanding processing properly.
         for funame, fu in fus.items():
             fnunit = fu.fnunit.value
             enable = Signal(name="en_%s" % funame, reset_less=True)
-            comb += enable.eq((self.e.do.fn_unit & fnunit).bool())
+            comb += enable.eq((self.i.e.do.fn_unit & fnunit).bool())
             comb += fu_bitdict[funame].eq(enable)
 
         # sigh - need a NOP counter
         counter = Signal(2)
         with m.If(counter != 0):
             sync += counter.eq(counter - 1)
-            comb += self.busy_o.eq(1)
+            comb += busy_o.eq(1)
 
-        with m.If(self.ivalid_i): # run only when valid
-            with m.Switch(self.e.do.insn_type):
+        with m.If(self.p.i_valid): # run only when valid
+            with m.Switch(self.i.e.do.insn_type):
                 # check for ATTN: halt if true
                 with m.Case(MicrOp.OP_ATTN):
-                    m.d.sync += self.core_terminate_o.eq(1)
+                    m.d.sync += self.o.core_terminate_o.eq(1)
 
+                # fake NOP - this isn't really used (Issuer detects NOP)
                 with m.Case(MicrOp.OP_NOP):
                     sync += counter.eq(2)
-                    comb += self.busy_o.eq(1)
+                    comb += busy_o.eq(1)
 
                 with m.Default():
                     # connect up instructions.  only one enabled at a time
                     for funame, fu in fus.items():
-                        e = self.ees[funame]
+                        do = self.des[funame]
                         enable = fu_bitdict[funame]
 
                         # run this FunctionUnit if enabled
                         # route op, issue, busy, read flags and mask to FU
                         with m.If(enable):
                             # operand comes from the *local*  decoder
-                            comb += fu.oper_i.eq_from(e.do)
-                            #comb += fu.oper_i.eq_from_execute1(e)
-                            comb += fu.issue_i.eq(self.issue_i)
-                            comb += self.busy_o.eq(fu.busy_o)
+                            comb += fu.oper_i.eq_from(do)
+                            comb += fu.issue_i.eq(1) # issue when input valid
+                            comb += busy_o.eq(fu.busy_o)
                             # rdmask, which is for registers, needs to come
                             # from the *main* decoder
-                            rdmask = get_rdflags(self.e, fu)
+                            rdmask = get_rdflags(self.i.e, fu)
                             comb += fu.rdmaskn.eq(~rdmask)
 
+        # if any FU is busy, set busy output for core.  rdmask is not
+        # held here: it is only driven in the issue logic above
+        for funame, fu in fus.items():
+            with m.If(fu.busy_o):
+                comb += busy_o.eq(fu.busy_o)
+
+        # set ready/valid signalling.  if busy, means refuse incoming issue
+        # XXX note: for an in-order core this is far too simple.  busy must
+        # be gated with the *availability* of the incoming (requested)
+        # instruction, where Core must be prepared to store-and-hold
+        # an instruction if no FU is available.
+        comb += self.p.o_ready.eq(~busy_o)
+
         return fu_bitdict
 
     def connect_rdport(self, m, fu_bitdict, rdpickers, regfile, regname, fspec):
@@ -298,9 +429,9 @@ class NonProductionCore(Elaboratable):
                     src = fu.src_i[idx]
                     print("reg connect widths",
                           regfile, regname, pi, funame,
-                          src.shape(), rport.data_o.shape())
+                          src.shape(), rport.o_data.shape())
                     # all FUs connect to same port
-                    comb += src.eq(rport.data_o)
+                    comb += src.eq(rport.o_data)
 
         # or-reduce the muxed read signals
         if rfile.unary:
@@ -335,14 +466,17 @@ class NonProductionCore(Elaboratable):
 
             # argh.  an experiment to merge RA and RB in the INT regfile
             # (we have too many read/write ports)
-            #if regfile == 'INT':
-                #fuspecs['rabc'] = [fuspecs.pop('rb')]
-                #fuspecs['rabc'].append(fuspecs.pop('rc'))
-                #fuspecs['rabc'].append(fuspecs.pop('ra'))
-            #if regfile == 'FAST':
-            #    fuspecs['fast1'] = [fuspecs.pop('fast1')]
-            #    if 'fast2' in fuspecs:
-            #        fuspecs['fast1'].append(fuspecs.pop('fast2'))
+            if self.regreduce_en:
+                if regfile == 'INT':
+                    fuspecs['rabc'] = [fuspecs.pop('rb')]
+                    fuspecs['rabc'].append(fuspecs.pop('rc'))
+                    fuspecs['rabc'].append(fuspecs.pop('ra'))
+                if regfile == 'FAST':
+                    fuspecs['fast1'] = [fuspecs.pop('fast1')]
+                    if 'fast2' in fuspecs:
+                        fuspecs['fast1'].append(fuspecs.pop('fast2'))
+                    if 'fast3' in fuspecs:
+                        fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
             # for each named regfile port, connect up all FUs to that port
             for (regname, fspec) in sort_fuspecs(fuspecs):
@@ -403,7 +537,7 @@ class NonProductionCore(Elaboratable):
                 pick = fu.wr.rel_o[idx] & fu_active  # & wrflag
                 comb += wrpick.i[pi].eq(pick)
                 # create a single-pulse go write from the picker output
-                wr_pick = Signal()
+                wr_pick = Signal(name="wpick_%s_%s_%d" % (funame, regname, idx))
                 comb += wr_pick.eq(wrpick.o[pi] & wrpick.en_o)
                 comb += fu.go_wr_i[idx].eq(rising_edge(m, wr_pick))
 
@@ -423,11 +557,11 @@ class NonProductionCore(Elaboratable):
                 # connect regfile port to input
                 print("reg connect widths",
                       regfile, regname, pi, funame,
-                      dest.shape(), wport.data_i.shape())
+                      dest.shape(), wport.i_data.shape())
                 wsigs.append(fu_dest_latch)
 
         # here is where we create the Write Broadcast Bus. simple, eh?
-        comb += wport.data_i.eq(ortreereduce_sig(wsigs))
+        comb += wport.i_data.eq(ortreereduce_sig(wsigs))
         if rfile.unary:
             # for unary-addressed
             comb += wport.wen.eq(ortreereduce_sig(wens))
@@ -460,14 +594,17 @@ class NonProductionCore(Elaboratable):
             fuspecs = byregfiles_wrspec[regfile]
             wrpickers[regfile] = {}
 
-            # argh, more port-merging
-            if regfile == 'INT':
-                fuspecs['o'] = [fuspecs.pop('o')]
-                fuspecs['o'].append(fuspecs.pop('o1'))
-            if regfile == 'FAST':
-                fuspecs['fast1'] = [fuspecs.pop('fast1')]
-                if 'fast2' in fuspecs:
-                    fuspecs['fast1'].append(fuspecs.pop('fast2'))
+            if self.regreduce_en:
+                # argh, more port-merging
+                if regfile == 'INT':
+                    fuspecs['o'] = [fuspecs.pop('o')]
+                    fuspecs['o'].append(fuspecs.pop('o1'))
+                if regfile == 'FAST':
+                    fuspecs['fast1'] = [fuspecs.pop('fast1')]
+                    if 'fast2' in fuspecs:
+                        fuspecs['fast1'].append(fuspecs.pop('fast2'))
+                    if 'fast3' in fuspecs:
+                        fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
             for (regname, fspec) in sort_fuspecs(fuspecs):
                 self.connect_wrport(m, fu_bitdict, wrpickers,
@@ -478,7 +615,7 @@ class NonProductionCore(Elaboratable):
         mode = "read" if readmode else "write"
         regs = self.regs
         fus = self.fus.fus
-        e = self.e # decoded instruction to execute
+        e = self.i.e # decoded instruction to execute
 
         # dictionary of lists of regfile ports
         byregfiles = {}
@@ -527,7 +664,7 @@ class NonProductionCore(Elaboratable):
 
     def __iter__(self):
         yield from self.fus.ports()
-        yield from self.e.ports()
+        yield from self.i.e.ports()
         yield from self.l0.ports()
         # TODO: regs