issuer.py: add microwatt_old and microwatt_debug options
[soc.git] / src / soc / simple / core.py
index e0ad6763a54cb376d24d415193795f31965ba2d1..9a4abacc3135e647ae4be3d9a8b7882e7ce68fe4 100644 (file)
@@ -17,21 +17,23 @@ the brain-dead part of this module is that even though there is no
 conflict of access, regfile read/write hazards are *not* analysed,
 and consequently it is safer to wait for the Function Unit to complete
 before allowing a new instruction to proceed.
 conflict of access, regfile read/write hazards are *not* analysed,
 and consequently it is safer to wait for the Function Unit to complete
 before allowing a new instruction to proceed.
+(update: actually this is being added now:
+https://bugs.libre-soc.org/show_bug.cgi?id=737)
 """
 
 """
 
-from nmigen import Elaboratable, Module, Signal, ResetSignal, Cat, Mux
+from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
+                    Const)
 from nmigen.cli import rtlil
 
 from openpower.decoder.power_decoder2 import PowerDecodeSubset
 from nmigen.cli import rtlil
 
 from openpower.decoder.power_decoder2 import PowerDecodeSubset
-from openpower.decoder.power_regspec_map import regspec_decode_read
-from openpower.decoder.power_regspec_map import regspec_decode_write
+from openpower.decoder.power_regspec_map import regspec_decode
 from openpower.sv.svp64 import SVP64Rec
 
 from nmutil.picker import PriorityPicker
 from nmutil.util import treereduce
 from nmutil.singlepipe import ControlBase
 
 from openpower.sv.svp64 import SVP64Rec
 
 from nmutil.picker import PriorityPicker
 from nmutil.util import treereduce
 from nmutil.singlepipe import ControlBase
 
-from soc.fu.compunits.compunits import AllFunctionUnits
+from soc.fu.compunits.compunits import AllFunctionUnits, LDSTFunctionUnit
 from soc.regfile.regfiles import RegFiles
 from openpower.decoder.power_decoder2 import get_rdflags
 from soc.experiment.l0_cache import TstL0CacheBuffer  # test only
 from soc.regfile.regfiles import RegFiles
 from openpower.decoder.power_decoder2 import get_rdflags
 from soc.experiment.l0_cache import TstL0CacheBuffer  # test only
@@ -39,11 +41,13 @@ from soc.config.test.test_loadstore import TestMemPspec
 from openpower.decoder.power_enums import MicrOp, Function
 from soc.simple.core_data import CoreInput, CoreOutput
 
 from openpower.decoder.power_enums import MicrOp, Function
 from soc.simple.core_data import CoreInput, CoreOutput
 
-from collections import defaultdict
+from collections import defaultdict, namedtuple
 import operator
 
 from nmutil.util import rising_edge
 
 import operator
 
 from nmutil.util import rising_edge
 
+FUSpec = namedtuple("FUSpec", ["funame", "fu", "idx"])
+ByRegSpec = namedtuple("ByRegSpec", ["okflag", "regport", "wid", "specs"])
 
 # helper function for reducing a list of signals down to a parallel
 # ORed single signal.
 
 # helper function for reducing a list of signals down to a parallel
 # ORed single signal.
@@ -67,6 +71,47 @@ def sort_fuspecs(fuspecs):
     return res  # enumerate(res)
 
 
     return res  # enumerate(res)
 
 
+# a hazard bitvector "remap" function which returns an AST expression
+# that remaps read/write hazard regfile port numbers to either a full
+# bitvector or a reduced subset one.  SPR for example is reduced to a
+# single bit.
+# CRITICALLY-IMPORTANT NOTE: these bitvectors *have* to match up per
+# regfile!  therefore the remapping is per regfile, *NOT* per regfile
+# port and certainly not based on whether it is a read port or write port.
+# note that any reductions here will result in degraded performance due
+# to conflicts, but at least it keeps the hazard matrix sizes down to "sane"
+def bitvector_remap(regfile, rfile, port):
+    # 8-bits (at the moment, no SVP64), CR is unary: no remap
+    if regfile == 'CR':
+        return port
+    # 3 bits, unary alrady: return the port
+    if regfile == 'XER':
+        return port
+    # 3 bits, unary: return the port
+    if regfile == 'XER':
+        return port
+    # 5 bits, unary: return the port
+    if regfile == 'STATE':
+        return port
+    # 9 bits (9 entries), might be unary already
+    if regfile == 'FAST':
+        if rfile.unary: # FAST might be unary already
+            return port
+        else:
+            return 1 << port
+    # 10 bits (!!) - reduce to one
+    if regfile == 'SPR':
+        if rfile.unary: # FAST might be unary already
+            return port
+        else:
+            return 1 << port
+    if regfile == 'INT':
+        if rfile.unary: # INT, check if unary/binary
+            return port
+        else:
+            return 1 << port
+
+
 # derive from ControlBase rather than have a separate Stage instance,
 # this is simpler to do
 class NonProductionCore(ControlBase):
 # derive from ControlBase rather than have a separate Stage instance,
 # this is simpler to do
 class NonProductionCore(ControlBase):
@@ -80,8 +125,14 @@ class NonProductionCore(ControlBase):
         self.regreduce_en = (hasattr(pspec, "regreduce") and
                              (pspec.regreduce == True))
 
         self.regreduce_en = (hasattr(pspec, "regreduce") and
                              (pspec.regreduce == True))
 
+        # test to see if overlapping of instructions is allowed
+        # (not normally enabled for TestIssuer FSM but useful for checking
+        # the bitvector hazard detection, before doing In-Order)
+        self.allow_overlap = (hasattr(pspec, "allow_overlap") and
+                             (pspec.allow_overlap == True))
+
         # test core type
         # test core type
-        self.make_hazard_vecs = True
+        self.make_hazard_vecs = self.allow_overlap
         self.core_type = "fsm"
         if hasattr(pspec, "core_type"):
             self.core_type = pspec.core_type
         self.core_type = "fsm"
         if hasattr(pspec, "core_type"):
             self.core_type = pspec.core_type
@@ -98,21 +149,44 @@ class NonProductionCore(ControlBase):
 
         # link LoadStore1 into MMU
         mmu = self.fus.get_fu('mmu0')
 
         # link LoadStore1 into MMU
         mmu = self.fus.get_fu('mmu0')
+        ldst0 = self.fus.get_fu('ldst0')
         print ("core pspec", pspec.ldst_ifacetype)
         print ("core mmu", mmu)
         if mmu is not None:
         print ("core pspec", pspec.ldst_ifacetype)
         print ("core mmu", mmu)
         if mmu is not None:
-            print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
-            mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+            lsi = l0.cmpi.lsmem.lsi # a LoadStore1 Interface object
+            print ("core lsmem.lsi", lsi)
+            mmu.alu.set_ldst_interface(lsi)
+            # urr store I-Cache in core so it is easier to get at
+            self.icache = lsi.icache
+
+        # alternative reset values for STATE regs. these probably shouldn't
+        # be set, here, instead have them done by Issuer. which they are.
+        # as well. because core.state overrides them. sigh.
+        self.msr_at_reset = 0x0
+        self.pc_at_reset = 0x0
+        if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int):
+            self.msr_at_reset = pspec.msr_reset
+        if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int):
+            self.pc_at_reset = pspec.pc_reset
+        state_resets = [self.pc_at_reset,  # PC at reset
+                        self.msr_at_reset, # MSR at reset
+                        0x0,               # SVSTATE at reset
+                        0x0,               # DEC at reset
+                        0x0]               # TB at reset
 
         # register files (yes plural)
 
         # register files (yes plural)
-        self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs)
+        self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs,
+                                    state_resets=state_resets)
 
         # set up input and output: unusual requirement to set data directly
         # (due to the way that the core is set up in a different domain,
         # see TestIssuer.setup_peripherals
 
         # set up input and output: unusual requirement to set data directly
         # (due to the way that the core is set up in a different domain,
         # see TestIssuer.setup_peripherals
-        self.i, self.o = self.new_specs(None)
+        self.p.i_data, self.n.o_data = self.new_specs(None)
         self.i, self.o = self.p.i_data, self.n.o_data
 
         self.i, self.o = self.p.i_data, self.n.o_data
 
+        # actual internal input data used (captured)
+        self.ireg = self.ispec()
+
         # create per-FU instruction decoders (subsetted).  these "satellite"
         # decoders reduce wire fan-out from the one (main) PowerDecoder2
         # (used directly by the trap unit) to the *twelve* (or more)
         # create per-FU instruction decoders (subsetted).  these "satellite"
         # decoders reduce wire fan-out from the one (main) PowerDecoder2
         # (used directly by the trap unit) to the *twelve* (or more)
@@ -122,6 +196,9 @@ class NonProductionCore(ControlBase):
         self.decoders = {}
         self.des = {}
 
         self.decoders = {}
         self.des = {}
 
+        # eep, these should be *per FU* i.e. for FunctionUnitBaseMulti
+        # they should be shared (put into the ALU *once*).
+
         for funame, fu in self.fus.fus.items():
             f_name = fu.fnunit.name
             fnunit = fu.fnunit.value
         for funame, fu in self.fus.fus.items():
             f_name = fu.fnunit.name
             fnunit = fu.fnunit.value
@@ -130,17 +207,29 @@ class NonProductionCore(ControlBase):
                 # TRAP decoder is the *main* decoder
                 self.trapunit = funame
                 continue
                 # TRAP decoder is the *main* decoder
                 self.trapunit = funame
                 continue
+            assert funame not in self.decoders
             self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
                                                       final=True,
             self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
                                                       final=True,
-                                                      state=self.i.state,
+                                                      state=self.ireg.state,
                                             svp64_en=self.svp64_en,
                                             regreduce_en=self.regreduce_en)
             self.des[funame] = self.decoders[funame].do
                                             svp64_en=self.svp64_en,
                                             regreduce_en=self.regreduce_en)
             self.des[funame] = self.decoders[funame].do
+            print ("create decoder subset", funame, opkls, self.des[funame])
+
+        # create per-Function Unit write-after-write hazard signals
+        # yes, really, this should have been added in ReservationStations
+        # but hey.
+        for funame, fu in self.fus.fus.items():
+            fu._waw_hazard = Signal(name="waw_%s" % funame)
 
         # share the SPR decoder with the MMU if it exists
         if "mmu0" in self.decoders:
             self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
 
 
         # share the SPR decoder with the MMU if it exists
         if "mmu0" in self.decoders:
             self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
 
+        # allow pausing of the DEC/TB FSM back in Issuer, by spotting
+        # if there is an MTSPR instruction
+        self.pause_dec_tb = Signal()
+
     # next 3 functions are Stage API Compliance
     def setup(self, m, i):
         pass
     # next 3 functions are Stage API Compliance
     def setup(self, m, i):
         pass
@@ -168,16 +257,26 @@ class NonProductionCore(ControlBase):
         regs = self.regs
         fus = self.fus.fus
 
         regs = self.regs
         fus = self.fus.fus
 
+        # amalgamate write-hazards into a single top-level Signal
+        self.waw_hazard = Signal()
+        whaz = []
+        for funame, fu in self.fus.fus.items():
+            whaz.append(fu._waw_hazard)
+        comb += self.waw_hazard.eq(Cat(*whaz).bool())
+
         # connect decoders
         self.connect_satellite_decoders(m)
 
         # ssh, cheat: trap uses the main decoder because of the rewriting
         # connect decoders
         self.connect_satellite_decoders(m)
 
         # ssh, cheat: trap uses the main decoder because of the rewriting
-        self.des[self.trapunit] = self.i.e.do
+        self.des[self.trapunit] = self.ireg.e.do
 
 
-        # connect up Function Units, then read/write ports
+        # connect up Function Units, then read/write ports, and hazard conflict
+        self.issue_conflict = Signal()
         fu_bitdict, fu_selected = self.connect_instruction(m)
         fu_bitdict, fu_selected = self.connect_instruction(m)
-        self.connect_rdports(m, fu_selected)
-        self.connect_wrports(m, fu_selected)
+        raw_hazard = self.connect_rdports(m, fu_bitdict, fu_selected)
+        self.connect_wrports(m, fu_bitdict, fu_selected)
+        if self.allow_overlap:
+            comb += self.issue_conflict.eq(raw_hazard)
 
         # note if an exception happened.  in a pipelined or OoO design
         # this needs to be accompanied by "shadowing" (or stalling)
 
         # note if an exception happened.  in a pipelined or OoO design
         # this needs to be accompanied by "shadowing" (or stalling)
@@ -195,23 +294,25 @@ class NonProductionCore(ControlBase):
             # connect each satellite decoder and give it the instruction.
             # as subset decoders this massively reduces wire fanout given
             # the large number of ALUs
             # connect each satellite decoder and give it the instruction.
             # as subset decoders this massively reduces wire fanout given
             # the large number of ALUs
-            setattr(m.submodules, "dec_%s" % v.fn_name, v)
-            comb += v.dec.raw_opcode_in.eq(self.i.raw_insn_i)
-            comb += v.dec.bigendian.eq(self.i.bigendian_i)
+            m.submodules["dec_%s" % k] = v
+            comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
+            comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
             # sigh due to SVP64 RA_OR_ZERO detection connect these too
             # sigh due to SVP64 RA_OR_ZERO detection connect these too
-            comb += v.sv_a_nz.eq(self.i.sv_a_nz)
-            if self.svp64_en:
-                comb += v.pred_sm.eq(self.i.sv_pred_sm)
-                comb += v.pred_dm.eq(self.i.sv_pred_dm)
-                if k != self.trapunit:
-                    comb += v.sv_rm.eq(self.i.sv_rm) # pass through SVP64 ReMap
-                    comb += v.is_svp64_mode.eq(self.i.is_svp64_mode)
-                    # only the LDST PowerDecodeSubset *actually* needs to
-                    # know to use the alternative decoder.  this is all
-                    # a terrible hack
-                    if k.lower().startswith("ldst"):
-                        comb += v.use_svp64_ldst_dec.eq(
-                                        self.i.use_svp64_ldst_dec)
+            comb += v.sv_a_nz.eq(self.ireg.sv_a_nz)
+            if not self.svp64_en:
+                continue
+            comb += v.pred_sm.eq(self.ireg.sv_pred_sm)
+            comb += v.pred_dm.eq(self.ireg.sv_pred_dm)
+            if k == self.trapunit:
+                continue
+            comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM
+            comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode)
+            # only the LDST PowerDecodeSubset *actually* needs to
+            # know to use the alternative decoder.  this is all
+            # a terrible hack
+            if not k.lower().startswith("ldst"):
+                continue
+            comb += v.use_svp64_ldst_dec.eq( self.ireg.use_svp64_ldst_dec)
 
     def connect_instruction(self, m):
         """connect_instruction
 
     def connect_instruction(self, m):
         """connect_instruction
@@ -230,6 +331,14 @@ class NonProductionCore(ControlBase):
 
         # indicate if core is busy
         busy_o = self.o.busy_o
 
         # indicate if core is busy
         busy_o = self.o.busy_o
+        any_busy_o = self.o.any_busy_o
+
+        # connect up temporary copy of incoming instruction. the FSM will
+        # either blat the incoming instruction (if valid) into self.ireg
+        # or if the instruction could not be delivered, keep dropping the
+        # latched copy into ireg
+        ilatch = self.ispec()
+        self.instr_active = Signal()
 
         # enable/busy-signals for each FU, get one bit for each FU (by name)
         fu_enable = Signal(len(fus), reset_less=True)
 
         # enable/busy-signals for each FU, get one bit for each FU (by name)
         fu_enable = Signal(len(fus), reset_less=True)
@@ -271,8 +380,9 @@ class NonProductionCore(ControlBase):
                 # instruction.
                 fnunit = fu.fnunit.value
                 en_req = Signal(name="issue_en_%s" % funame, reset_less=True)
                 # instruction.
                 fnunit = fu.fnunit.value
                 en_req = Signal(name="issue_en_%s" % funame, reset_less=True)
-                fnmatch = (self.i.e.do.fn_unit & fnunit).bool()
-                comb += en_req.eq(fnmatch & ~fu.busy_o & self.p.i_valid)
+                fnmatch = (self.ireg.e.do.fn_unit & fnunit).bool()
+                comb += en_req.eq(fnmatch & ~fu.busy_o &
+                                    self.instr_active)
                 i_l.append(en_req) # store in list for doing the Cat-trick
                 # picker output, gated by enable: store in fu_bitdict
                 po = Signal(name="o_issue_pick_"+funame) # picker output
                 i_l.append(en_req) # store in list for doing the Cat-trick
                 # picker output, gated by enable: store in fu_bitdict
                 po = Signal(name="o_issue_pick_"+funame) # picker output
@@ -282,63 +392,153 @@ class NonProductionCore(ControlBase):
                 # if we don't do this, then when there are no FUs available,
                 # the "p.o_ready" signal will go back "ok we accepted this
                 # instruction" which of course isn't true.
                 # if we don't do this, then when there are no FUs available,
                 # the "p.o_ready" signal will go back "ok we accepted this
                 # instruction" which of course isn't true.
-                comb += fu_found.eq(~fnmatch | i_pp.en_o)
+                with m.If(i_pp.en_o):
+                    comb += fu_found.eq(1)
             # for each input, Cat them together and drop them into the picker
             comb += i_pp.i.eq(Cat(*i_l))
 
             # for each input, Cat them together and drop them into the picker
             comb += i_pp.i.eq(Cat(*i_l))
 
+        # rdmask, which is for registers needs to come from the *main* decoder
+        for funame, fu in fus.items():
+            rdmask = get_rdflags(m, self.ireg.e, fu)
+            comb += fu.rdmaskn.eq(~rdmask)
+
         # sigh - need a NOP counter
         counter = Signal(2)
         with m.If(counter != 0):
             sync += counter.eq(counter - 1)
             comb += busy_o.eq(1)
 
         # sigh - need a NOP counter
         counter = Signal(2)
         with m.If(counter != 0):
             sync += counter.eq(counter - 1)
             comb += busy_o.eq(1)
 
-        with m.If(self.p.i_valid): # run only when valid
-            with m.Switch(self.i.e.do.insn_type):
-                # check for ATTN: halt if true
-                with m.Case(MicrOp.OP_ATTN):
-                    m.d.sync += self.o.core_terminate_o.eq(1)
-
-                # fake NOP - this isn't really used (Issuer detects NOP)
-                with m.Case(MicrOp.OP_NOP):
-                    sync += counter.eq(2)
-                    comb += busy_o.eq(1)
-
-                with m.Default():
-                    # connect up instructions.  only one enabled at a time
+        # default to reading from incoming instruction: may be overridden
+        # by copy from latch when "waiting"
+        comb += self.ireg.eq(self.i)
+        # always say "ready" except if overridden
+        comb += self.p.o_ready.eq(1)
+
+        with m.FSM():
+            with m.State("READY"):
+                with m.If(self.p.i_valid): # run only when valid
+                    with m.Switch(self.ireg.e.do.insn_type):
+                        # check for ATTN: halt if true
+                        with m.Case(MicrOp.OP_ATTN):
+                            m.d.sync += self.o.core_terminate_o.eq(1)
+
+                        # fake NOP - this isn't really used (Issuer detects NOP)
+                        with m.Case(MicrOp.OP_NOP):
+                            sync += counter.eq(2)
+                            comb += busy_o.eq(1)
+
+                        with m.Default():
+                            comb += self.instr_active.eq(1)
+                            comb += self.p.o_ready.eq(0)
+                            # connect instructions. only one enabled at a time
+                            for funame, fu in fus.items():
+                                do = self.des[funame]
+                                enable = fu_bitdict[funame]
+
+                                # run this FunctionUnit if enabled route op,
+                                # issue, busy, read flags and mask to FU
+                                with m.If(enable):
+                                    # operand comes from the *local*  decoder
+                                    # do not actually issue, though, if there
+                                    # is a waw hazard. decoder has to still
+                                    # be asserted in order to detect that, tho
+                                    comb += fu.oper_i.eq_from(do)
+                                    if funame == 'mmu0':
+                                        # URRR this is truly dreadful.
+                                        # OP_FETCH_FAILED is a "fake" op.
+                                        # no instruction creates it.  OP_TRAP
+                                        # uses the *main* decoder: this is
+                                        # a *Satellite* decoder that reacts
+                                        # on *insn_in*... not fake ops. gaah.
+                                        main_op = self.ireg.e.do
+                                        with m.If(main_op.insn_type ==
+                                                  MicrOp.OP_FETCH_FAILED):
+                                            comb += fu.oper_i.insn_type.eq(
+                                                  MicrOp.OP_FETCH_FAILED)
+                                            comb += fu.oper_i.fn_unit.eq(
+                                                  Function.MMU)
+                                    # issue when valid (and no write-hazard)
+                                    comb += fu.issue_i.eq(~self.waw_hazard)
+                                    # instruction ok, indicate ready
+                                    comb += self.p.o_ready.eq(1)
+
+                            if self.allow_overlap:
+                                with m.If(~fu_found | self.waw_hazard):
+                                    # latch copy of instruction
+                                    sync += ilatch.eq(self.i)
+                                    comb += self.p.o_ready.eq(1) # accept
+                                    comb += busy_o.eq(1)
+                                    m.next = "WAITING"
+
+            with m.State("WAITING"):
+                comb += self.instr_active.eq(1)
+                comb += self.p.o_ready.eq(0)
+                comb += busy_o.eq(1)
+                # using copy of instruction, keep waiting until an FU is free
+                comb += self.ireg.eq(ilatch)
+                with m.If(fu_found): # wait for conflict to clear
+                    # connect instructions. only one enabled at a time
                     for funame, fu in fus.items():
                         do = self.des[funame]
                         enable = fu_bitdict[funame]
 
                     for funame, fu in fus.items():
                         do = self.des[funame]
                         enable = fu_bitdict[funame]
 
-                        # run this FunctionUnit if enabled
-                        # route op, issue, busy, read flags and mask to FU
+                        # run this FunctionUnit if enabled route op,
+                        # issue, busy, read flags and mask to FU
                         with m.If(enable):
                         with m.If(enable):
-                            # operand comes from the *local*  decoder
+                            # operand comes from the *local* decoder,
+                            # which is asserted even if not issued,
+                            # so that WaW-detection can check for hazards.
+                            # only if the waw hazard is clear does the
+                            # instruction actually get issued
                             comb += fu.oper_i.eq_from(do)
                             comb += fu.oper_i.eq_from(do)
-                            comb += fu.issue_i.eq(1) # issue when input valid
-                            # rdmask, which is for registers, needs to come
-                            # from the *main* decoder
-                            rdmask = get_rdflags(self.i.e, fu)
-                            comb += fu.rdmaskn.eq(~rdmask)
-
-        # if instruction is busy, set busy output for core.
-        busys = map(lambda fu: fu.busy_o, fus.values())
-        comb += busy_o.eq(Cat(*busys).bool())
-
-        # ready/valid signalling.  if busy, means refuse incoming issue.
-        # (this is a global signal, TODO, change to one which allows
-        # overlapping instructions)
-        # also, if there was no fu found we must not send back a valid
-        # indicator.  BUT, of course, when there is no instruction
-        # we must ignore the fu_found flag, otherwise o_ready will never
-        # be set when everything is idle
-        comb += self.p.o_ready.eq(fu_found | ~self.p.i_valid)
+                            # issue when valid
+                            comb += fu.issue_i.eq(~self.waw_hazard)
+                            with m.If(~self.waw_hazard):
+                                comb += self.p.o_ready.eq(1)
+                                comb += busy_o.eq(0)
+                                m.next = "READY"
+
+        print ("core: overlap allowed", self.allow_overlap)
+        # true when any FU is busy (including the cycle where it is perhaps
+        # to be issued - because that's what fu_busy is)
+        comb += any_busy_o.eq(fu_busy.bool())
+        if not self.allow_overlap:
+            # for simple non-overlap, if any instruction is busy, set
+            # busy output for core.
+            comb += busy_o.eq(any_busy_o)
+        else:
+            # sigh deal with a fun situation that needs to be investigated
+            # and resolved
+            with m.If(self.issue_conflict):
+                comb += busy_o.eq(1)
+            # make sure that LDST, SPR, MMU, Branch and Trap all say "busy"
+            # and do not allow overlap.  these are all the ones that
+            # are non-forward-progressing: exceptions etc. that otherwise
+            # change CoreState for some reason (MSR, PC, SVSTATE)
+            for funame, fu in fus.items():
+                if (funame.lower().startswith('ldst') or
+                    funame.lower().startswith('branch') or
+                    funame.lower().startswith('mmu') or
+                    funame.lower().startswith('spr') or
+                    funame.lower().startswith('trap')):
+                    with m.If(fu.busy_o):
+                        comb += busy_o.eq(1)
+                # for SPR pipeline pause dec/tb FSM to avoid race condition
+                # TODO: really this should be much more sophisticated,
+                # spot MTSPR, spot that DEC/TB is what is to be updated.
+                # a job for PowerDecoder2, there
+                if funame.lower().startswith('spr'):
+                    with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR
+                        ):
+                        comb += self.pause_dec_tb.eq(1)
 
         # return both the function unit "enable" dict as well as the "busy".
         # the "busy-or-issued" can be passed in to the Read/Write port
         # connecters to give them permission to request access to regfiles
         return fu_bitdict, fu_selected
 
 
         # return both the function unit "enable" dict as well as the "busy".
         # the "busy-or-issued" can be passed in to the Read/Write port
         # connecters to give them permission to request access to regfiles
         return fu_bitdict, fu_selected
 
-    def connect_rdport(self, m, fu_bitdict, rdpickers, regfile, regname, fspec):
+    def connect_rdport(self, m, fu_bitdict, fu_selected,
+                                rdpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
@@ -351,61 +551,108 @@ class NonProductionCore(ControlBase):
         print("read regfile", rpidx, regfile, regs.rf.keys(),
                               rfile, rfile.unary)
 
         print("read regfile", rpidx, regfile, regs.rf.keys(),
                               rfile, rfile.unary)
 
+        # for checking if the read port has an outstanding write
+        if self.make_hazard_vecs:
+            wv = regs.wv[regfile.lower()]
+            wvchk = wv.q_int # write-vec bit-level hazard check
+
+        # if a hazard is detected on this read port, simply blithely block
+        # every FU from reading on it.  this is complete overkill but very
+        # simple for now.
+        hazard_detected = Signal(name="raw_%s_%s" % (regfile, rpidx))
+
         fspecs = fspec
         if not isinstance(fspecs, list):
             fspecs = [fspecs]
 
         rdflags = []
         pplen = 0
         fspecs = fspec
         if not isinstance(fspecs, list):
             fspecs = [fspecs]
 
         rdflags = []
         pplen = 0
-        reads = []
         ppoffs = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
         ppoffs = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, wf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, fspec, len(fuspec))
+            print ("fpsec", i, fspec, len(fspec.specs))
+            name = "%s_%s_%d" % (regfile, regname, i)
             ppoffs.append(pplen) # record offset for picker
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
-            name = "rdflag_%s_%s_%d" % (regfile, regname, i)
-            rdflag = Signal(name=name, reset_less=True)
-            comb += rdflag.eq(rf)
+            pplen += len(fspec.specs)
+            rdflag = Signal(name="rdflag_"+name, reset_less=True)
+            comb += rdflag.eq(fspec.okflag)
             rdflags.append(rdflag)
             rdflags.append(rdflag)
-            reads.append(read)
 
         print ("pplen", pplen)
 
         # create a priority picker to manage this port
         rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
 
         print ("pplen", pplen)
 
         # create a priority picker to manage this port
         rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
-        setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick)
+        m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick
 
         rens = []
         addrs = []
         wvens = []
 
         for i, fspec in enumerate(fspecs):
 
         rens = []
         addrs = []
         wvens = []
 
         for i, fspec in enumerate(fspecs):
-            (rf, wf, read, write, wid, fuspec) = fspec
+            (rf, _read, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
             # connect up the FU req/go signals, and the reg-read to the FU
             # and create a Read Broadcast Bus
             # connect up the FU req/go signals, and the reg-read to the FU
             # and create a Read Broadcast Bus
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
                 pi += ppoffs[i]
                 pi += ppoffs[i]
+                name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
+                fu_active = fu_selected[funame]
+                fu_issued = fu_bitdict[funame]
+
+                # get (or set up) a latched copy of read register number
+                # and (sigh) also the read-ok flag
+                # TODO: use nmutil latchregister
+                rhname = "%s_%s_%d" % (regfile, regname, i)
+                rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname),
+                                reset_less=True)
+                if rhname not in fu.rf_latches:
+                    rfl = Signal(name="rdflag_latch_%s_%s" % (funame, rhname))
+                    fu.rf_latches[rhname] = rfl
+                    with m.If(fu.issue_i):
+                        sync += rfl.eq(rdflags[i])
+                else:
+                    rfl = fu.rf_latches[rhname]
+
+                # now the register port
+                rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
+                read = Signal.like(_read, name="read_"+rname)
+                if rname not in fu.rd_latches:
+                    rdl = Signal.like(_read, name="rdlatch_"+rname)
+                    fu.rd_latches[rname] = rdl
+                    with m.If(fu.issue_i):
+                        sync += rdl.eq(_read)
+                else:
+                    rdl = fu.rd_latches[rname]
+
+                # make the read immediately available on issue cycle
+                # after the read cycle, otherwies use the latched copy.
+                # this captures the regport and okflag on issue
+                with m.If(fu.issue_i):
+                    comb += read.eq(_read)
+                    comb += rdflag.eq(rdflags[i])
+                with m.Else():
+                    comb += read.eq(rdl)
+                    comb += rdflag.eq(rfl)
 
                 # connect request-read to picker input, and output to go-rd
 
                 # connect request-read to picker input, and output to go-rd
-                fu_active = fu_bitdict[funame]
-                name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
-                addr_en = Signal.like(reads[i], name="addr_en_"+name)
+                addr_en = Signal.like(read, name="addr_en_"+name)
                 pick = Signal(name="pick_"+name)     # picker input
                 rp = Signal(name="rp_"+name)         # picker output
                 delay_pick = Signal(name="dp_"+name) # read-enable "underway"
                 pick = Signal(name="pick_"+name)     # picker input
                 rp = Signal(name="rp_"+name)         # picker output
                 delay_pick = Signal(name="dp_"+name) # read-enable "underway"
+                rhazard = Signal(name="rhaz_"+name)
 
                 # exclude any currently-enabled read-request (mask out active)
 
                 # exclude any currently-enabled read-request (mask out active)
-                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
-                                ~delay_pick)
+                # entirely block anything hazarded from being picked
+                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag &
+                                ~delay_pick & ~rhazard)
                 comb += rdpick.i[pi].eq(pick)
                 comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
 
                 # if picked, select read-port "reg select" number to port
                 comb += rp.eq(rdpick.o[pi] & rdpick.en_o)
                 sync += delay_pick.eq(rp) # delayed "pick"
                 comb += rdpick.i[pi].eq(pick)
                 comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
 
                 # if picked, select read-port "reg select" number to port
                 comb += rp.eq(rdpick.o[pi] & rdpick.en_o)
                 sync += delay_pick.eq(rp) # delayed "pick"
-                comb += addr_en.eq(Mux(rp, reads[i], 0))
+                comb += addr_en.eq(Mux(rp, read, 0))
 
                 # the read-enable happens combinatorially (see mux-bus below)
                 # but it results in the data coming out on a one-cycle delay.
 
                 # the read-enable happens combinatorially (see mux-bus below)
                 # but it results in the data coming out on a one-cycle delay.
@@ -425,6 +672,32 @@ class NonProductionCore(ControlBase):
                     # all FUs connect to same port
                     comb += src.eq(rport.o_data)
 
                     # all FUs connect to same port
                     comb += src.eq(rport.o_data)
 
+                if not self.make_hazard_vecs:
+                    continue
+
+                # read the write-hazard bitvector (wv) for any bit that is
+                wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name)
+                issue_active = Signal(name="rd_iactive_"+name)
+                # XXX combinatorial loop here
+                comb += issue_active.eq(fu_active & rdflag)
+                with m.If(issue_active):
+                    if rfile.unary:
+                        comb += wvchk_en.eq(read)
+                    else:
+                        comb += wvchk_en.eq(1<<read)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~rhazard):
+                        comb += wvchk_en.eq(0)
+
+                # read-hazard is ANDed with (filtered by) what is actually
+                # being requested.
+                comb += rhazard.eq((wvchk & wvchk_en).bool())
+
+                wvens.append(wvchk_en)
+
         # or-reduce the muxed read signals
         if rfile.unary:
             # for unary-addressed
         # or-reduce the muxed read signals
         if rfile.unary:
             # for unary-addressed
@@ -435,7 +708,17 @@ class NonProductionCore(ControlBase):
             comb += rport.ren.eq(Cat(*rens).bool())
             print ("binary", regfile, rpidx, rport, rport.ren, rens, addrs)
 
             comb += rport.ren.eq(Cat(*rens).bool())
             print ("binary", regfile, rpidx, rport, rport.ren, rens, addrs)
 
-    def connect_rdports(self, m, fu_bitdict):
+        if not self.make_hazard_vecs:
+            return Const(0) # declare "no hazards"
+
+        # enable the read bitvectors for this issued instruction
+        # and return whether any write-hazard bit is set
+        wvchk_and = Signal(len(wvchk), name="wv_chk_"+name)
+        comb += wvchk_and.eq(wvchk & ortreereduce_sig(wvens))
+        comb += hazard_detected.eq(wvchk_and.bool())
+        return hazard_detected
+
+    def connect_rdports(self, m, fu_bitdict, fu_selected):
         """connect read ports
 
         orders the read regspecs into a dict-of-dicts, by regfile, by
         """connect read ports
 
         orders the read regspecs into a dict-of-dicts, by regfile, by
@@ -445,15 +728,15 @@ class NonProductionCore(ControlBase):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
+        rd_hazard = []
 
         # dictionary of lists of regfile read ports
 
         # dictionary of lists of regfile read ports
-        byregfiles_rd, byregfiles_rdspec = self.get_byregfiles(True)
+        byregfiles_rdspec = self.get_byregfiles(m, True)
 
         # okaay, now we need a PriorityPicker per regfile per regfile port
         # loootta pickers... peter piper picked a pack of pickled peppers...
         rdpickers = {}
 
         # okaay, now we need a PriorityPicker per regfile per regfile port
         # loootta pickers... peter piper picked a pack of pickled peppers...
         rdpickers = {}
-        for regfile, spec in byregfiles_rd.items():
-            fuspecs = byregfiles_rdspec[regfile]
+        for regfile, fuspecs in byregfiles_rdspec.items():
             rdpickers[regfile] = {}
 
             # argh.  an experiment to merge RA and RB in the INT regfile
             rdpickers[regfile] = {}
 
             # argh.  an experiment to merge RA and RB in the INT regfile
@@ -471,14 +754,20 @@ class NonProductionCore(ControlBase):
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
             # for each named regfile port, connect up all FUs to that port
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
             # for each named regfile port, connect up all FUs to that port
+            # also return (and collate) hazard detection)
             for (regname, fspec) in sort_fuspecs(fuspecs):
                 print("connect rd", regname, fspec)
             for (regname, fspec) in sort_fuspecs(fuspecs):
                 print("connect rd", regname, fspec)
-                self.connect_rdport(m, fu_bitdict, rdpickers, regfile,
+                rh = self.connect_rdport(m, fu_bitdict, fu_selected,
+                                       rdpickers, regfile,
                                        regname, fspec)
                                        regname, fspec)
+                rd_hazard.append(rh)
+
+        return Cat(*rd_hazard).bool()
 
     def make_hazards(self, m, regfile, rfile, wvclr, wvset,
                     funame, regname, idx,
 
     def make_hazards(self, m, regfile, rfile, wvclr, wvset,
                     funame, regname, idx,
-                    addr_en, wp, fu, fu_active, wrflag, write):
+                    addr_en, wp, fu, fu_active, wrflag, write,
+                    fu_wrok):
         """make_hazards: a setter and a clearer for the regfile write ports
 
         setter is at issue time (using PowerDecoder2 regfile write numbers)
         """make_hazards: a setter and a clearer for the regfile write ports
 
         setter is at issue time (using PowerDecoder2 regfile write numbers)
@@ -489,39 +778,64 @@ class NonProductionCore(ControlBase):
         (has its data.ok bit CLEARED).  this is perfectly legitimate.
         and a royal pain.
         """
         (has its data.ok bit CLEARED).  this is perfectly legitimate.
         and a royal pain.
         """
-        comb = m.d.comb
+        comb, sync = m.d.comb, m.d.sync
         name = "%s_%s_%d" % (funame, regname, idx)
 
         name = "%s_%s_%d" % (funame, regname, idx)
 
-        # deal with write vector clear: this kicks in when the regfile
-        # is written to, and clears the corresponding bitvector entry
-        print ("write vector", regfile, wvclr)
-        wvaddr_en = Signal(len(wvclr.wen), name="wvaddr_en_"+name)
-        if rfile.unary:
-            comb += wvaddr_en.eq(addr_en)
-        else:
-            with m.If(wp):
-                comb += wvaddr_en.eq(1<<addr_en)
-
-        # now connect up the bitvector write hazard.  unlike the
+        # connect up the bitvector write hazard.  unlike the
         # regfile writeports, a ONE must be written to the corresponding
         # bit of the hazard bitvector (to indicate the existence of
         # the hazard)
 
         # the detection of what shall be written to is based
         # regfile writeports, a ONE must be written to the corresponding
         # bit of the hazard bitvector (to indicate the existence of
         # the hazard)
 
         # the detection of what shall be written to is based
-        # on *issue*
+        # on *issue*.  it is delayed by 1 cycle so that instructions
+        # "addi 5,5,0x2" do not cause combinatorial loops due to
+        # fake-dependency on *themselves*.  this will totally fail
+        # spectacularly when doing multi-issue
         print ("write vector (for regread)", regfile, wvset)
         print ("write vector (for regread)", regfile, wvset)
-        wviaddr_en = Signal(len(wvset.wen), name="wv_issue_addr_en_"+name)
+        wviaddr_en = Signal(len(wvset), name="wv_issue_addr_en_"+name)
         issue_active = Signal(name="iactive_"+name)
         issue_active = Signal(name="iactive_"+name)
-        comb += issue_active.eq(fu.issue_i & fu_active & wrflag)
+        sync += issue_active.eq(fu.issue_i & fu_active & wrflag)
         with m.If(issue_active):
             if rfile.unary:
                 comb += wviaddr_en.eq(write)
             else:
                 comb += wviaddr_en.eq(1<<write)
 
         with m.If(issue_active):
             if rfile.unary:
                 comb += wviaddr_en.eq(write)
             else:
                 comb += wviaddr_en.eq(1<<write)
 
+        # deal with write vector clear: this kicks in when the regfile
+        # is written to, and clears the corresponding bitvector entry
+        print ("write vector", regfile, wvclr)
+        wvaddr_en = Signal(len(wvclr), name="wvaddr_en_"+name)
+        if rfile.unary:
+            comb += wvaddr_en.eq(addr_en)
+        else:
+            with m.If(wp):
+                comb += wvaddr_en.eq(1<<addr_en)
+
+        # XXX ASSUME that LDSTFunctionUnit always sets the data it intends to
+        # this may NOT be the case when an exception occurs
+        if isinstance(fu, LDSTFunctionUnit):
+            return wvaddr_en, wviaddr_en
+
+        # okaaay, this is preparation for the awkward case.
+        # * latch a copy of wrflag when issue goes high.
+        # * when the fu_wrok (data.ok) flag is NOT set,
+        #   but the FU is done, the FU is NEVER going to write
+        #   so the bitvector has to be cleared.
+        latch_wrflag = Signal(name="latch_wrflag_"+name)
+        with m.If(~fu.busy_o):
+            sync += latch_wrflag.eq(0)
+        with m.If(fu.issue_i & fu_active):
+            sync += latch_wrflag.eq(wrflag)
+        with m.If(fu.alu_done_o & latch_wrflag & ~fu_wrok):
+            if rfile.unary:
+                comb += wvaddr_en.eq(write) # addr_en gated with wp, don't use
+            else:
+                comb += wvaddr_en.eq(1<<addr_en) # binary addr_en not gated
+
         return wvaddr_en, wviaddr_en
 
         return wvaddr_en, wviaddr_en
 
-    def connect_wrport(self, m, fu_bitdict, wrpickers, regfile, regname, fspec):
+    def connect_wrport(self, m, fu_bitdict, fu_selected,
+                                wrpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
@@ -540,8 +854,9 @@ class NonProductionCore(ControlBase):
         # to RAISE the bitvector (set it to 1), which, duh, requires a WRITE
         if self.make_hazard_vecs:
             wv = regs.wv[regfile.lower()]
         # to RAISE the bitvector (set it to 1), which, duh, requires a WRITE
         if self.make_hazard_vecs:
             wv = regs.wv[regfile.lower()]
-            wvset = wv.w_ports["set"] # write-vec bit-level hazard ctrl
-            wvclr = wv.w_ports["clr"] # write-vec bit-level hazard ctrl
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvchk = wv.q # write-after-write hazard check
 
         fspecs = fspec
         if not isinstance(fspecs, list):
 
         fspecs = fspec
         if not isinstance(fspecs, list):
@@ -550,59 +865,73 @@ class NonProductionCore(ControlBase):
         pplen = 0
         writes = []
         ppoffs = []
         pplen = 0
         writes = []
         ppoffs = []
-        rdflags = []
         wrflags = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
         wrflags = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, wf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, "wrflag", wf, fspec, len(fuspec))
+            (wf, _write, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+            print ("fpsec", i, "wrflag", wf, fspec, len(fuspecs))
             ppoffs.append(pplen) # record offset for picker
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
+            pplen += len(fuspecs)
 
             name = "%s_%s_%d" % (regfile, regname, i)
 
             name = "%s_%s_%d" % (regfile, regname, i)
-            rdflag = Signal(name="rd_flag_"+name)
             wrflag = Signal(name="wr_flag_"+name)
             wrflag = Signal(name="wr_flag_"+name)
-            if rf is not None:
-                comb += rdflag.eq(rf)
-            else:
-                comb += rdflag.eq(0)
             if wf is not None:
                 comb += wrflag.eq(wf)
             else:
                 comb += wrflag.eq(0)
             if wf is not None:
                 comb += wrflag.eq(wf)
             else:
                 comb += wrflag.eq(0)
-            rdflags.append(rdflag)
             wrflags.append(wrflag)
 
         # create a priority picker to manage this port
         wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
             wrflags.append(wrflag)
 
         # create a priority picker to manage this port
         wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
-        setattr(m.submodules, "wrpick_%s_%s" % (regfile, rpidx), wrpick)
+        m.submodules["wrpick_%s_%s" % (regfile, rpidx)] = wrpick
 
         wsigs = []
         wens = []
         wvsets = []
         wvseten = []
         wvclren = []
 
         wsigs = []
         wens = []
         wvsets = []
         wvseten = []
         wvclren = []
+        #wvens = [] - not needed: reading of writevec is permanently held hi
         addrs = []
         for i, fspec in enumerate(fspecs):
             # connect up the FU req/go signals and the reg-read to the FU
             # these are arbitrated by Data.ok signals
         addrs = []
         for i, fspec in enumerate(fspecs):
             # connect up the FU req/go signals and the reg-read to the FU
             # these are arbitrated by Data.ok signals
-            (rf, wf, read, _write, wid, fuspec) = fspec
-            wrname = "write_%s_%s_%d" % (regfile, regname, i)
-            write = Signal.like(_write, name=wrname)
-            comb += write.eq(_write)
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            (wf, _write, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
+                fu_requested = fu_bitdict[funame]
                 pi += ppoffs[i]
                 pi += ppoffs[i]
+                name = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
+                # get (or set up) a write-latched copy of write register number
+                write = Signal.like(_write, name="write_"+name)
+                rname = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
+                if rname not in fu.wr_latches:
+                    wrl = Signal.like(_write, name="wrlatch_"+rname)
+                    fu.wr_latches[rname] = write
+                    # do not depend on fu.issue_i here, it creates a
+                    # combinatorial loop on waw checking. using the FU
+                    # "enable" bitdict entry for this FU is sufficient,
+                    # because the PowerDecoder2 read/write nums are
+                    # valid continuously when the instruction is valid
+                    with m.If(fu_requested):
+                        sync += wrl.eq(_write)
+                        comb += write.eq(_write)
+                    with m.Else():
+                        comb += write.eq(wrl)
+                else:
+                    write = fu.wr_latches[rname]
 
                 # write-request comes from dest.ok
                 dest = fu.get_out(idx)
                 fu_dest_latch = fu.get_fu_out(idx)  # latched output
 
                 # write-request comes from dest.ok
                 dest = fu.get_out(idx)
                 fu_dest_latch = fu.get_fu_out(idx)  # latched output
-                name = "wrflag_%s_%s_%d" % (funame, regname, idx)
-                wrflag = Signal(name=name, reset_less=True)
-                comb += wrflag.eq(dest.ok & fu.busy_o)
+                name = "%s_%s_%d" % (funame, regname, idx)
+                fu_wrok = Signal(name="fu_wrok_"+name, reset_less=True)
+                comb += fu_wrok.eq(dest.ok & fu.busy_o)
 
                 # connect request-write to picker input, and output to go-wr
 
                 # connect request-write to picker input, and output to go-wr
-                fu_active = fu_bitdict[funame]
-                pick = fu.wr.rel_o[idx] & fu_active  # & wrflag
+                fu_active = fu_selected[funame]
+                pick = fu.wr.rel_o[idx] & fu_active
                 comb += wrpick.i[pi].eq(pick)
                 # create a single-pulse go write from the picker output
                 wr_pick = Signal(name="wpick_%s_%s_%d" % (funame, regname, idx))
                 comb += wrpick.i[pi].eq(pick)
                 # create a single-pulse go write from the picker output
                 wr_pick = Signal(name="wpick_%s_%s_%d" % (funame, regname, idx))
@@ -635,11 +964,48 @@ class NonProductionCore(ControlBase):
                 res = self.make_hazards(m, regfile, rfile, wvclr, wvset,
                                         funame, regname, idx,
                                         addr_en, wp, fu, fu_active,
                 res = self.make_hazards(m, regfile, rfile, wvclr, wvset,
                                         funame, regname, idx,
                                         addr_en, wp, fu, fu_active,
-                                        wrflags[i], write)
+                                        wrflags[i], write, fu_wrok)
                 wvaddr_en, wv_issue_en = res
                 wvclren.append(wvaddr_en)   # set only: no data => clear bit
                 wvseten.append(wv_issue_en) # set data same as enable
                 wvaddr_en, wv_issue_en = res
                 wvclren.append(wvaddr_en)   # set only: no data => clear bit
                 wvseten.append(wv_issue_en) # set data same as enable
-                wvsets.append(wv_issue_en)  # because enable needs a 1
+
+                # read the write-hazard bitvector (wv) for any bit that is
+                fu_requested = fu_bitdict[funame]
+                wvchk_en = Signal(len(wvchk), name="waw_chk_addr_en_"+name)
+                issue_active = Signal(name="waw_iactive_"+name)
+                whazard = Signal(name="whaz_"+name)
+                if wf is None:
+                    # XXX EEK! STATE regfile (branch) does not have an
+                    # write-active indicator in regspec_decode_write()
+                    print ("XXX FIXME waw_iactive", issue_active,
+                                                    fu_requested, wf)
+                else:
+                    # check bits from the incoming instruction.  note (back
+                    # in connect_instruction) that the decoder is held for
+                    # us to be able to do this, here... *without* issue being
+                    # held HI.  we MUST NOT gate this with fu.issue_i or
+                    # with fu_bitdict "enable": it would create a loop
+                    comb += issue_active.eq(wf)
+                with m.If(issue_active):
+                    if rfile.unary:
+                        comb += wvchk_en.eq(write)
+                    else:
+                        comb += wvchk_en.eq(1<<write)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~whazard):
+                        comb += wvchk_en.eq(0)
+
+                # write-hazard is ANDed with (filtered by) what is actually
+                # being requested.  the wvchk data is on a one-clock delay,
+                # and wvchk_en comes directly from the main decoder
+                comb += whazard.eq((wvchk & wvchk_en).bool())
+                with m.If(whazard):
+                    comb += fu._waw_hazard.eq(1)
+
+                #wvens.append(wvchk_en)
 
         # here is where we create the Write Broadcast Bus. simple, eh?
         comb += wport.i_data.eq(ortreereduce_sig(wsigs))
 
         # here is where we create the Write Broadcast Bus. simple, eh?
         comb += wport.i_data.eq(ortreereduce_sig(wsigs))
@@ -651,12 +1017,22 @@ class NonProductionCore(ControlBase):
             comb += wport.addr.eq(ortreereduce_sig(addrs))
             comb += wport.wen.eq(ortreereduce_sig(wens))
 
             comb += wport.addr.eq(ortreereduce_sig(addrs))
             comb += wport.wen.eq(ortreereduce_sig(wens))
 
-        # for write-vectors
-        comb += wvclr.wen.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
-        comb += wvset.wen.eq(ortreereduce_sig(wvseten)) # set (issue time)
-        comb += wvset.i_data.eq(ortreereduce_sig(wvsets))
-
-    def connect_wrports(self, m, fu_bitdict):
+        if not self.make_hazard_vecs:
+            return [], []
+
+        # return these here rather than set wvclr/wvset directly,
+        # because there may be more than one write-port to a given
+        # regfile.  example: XER has a write-port for SO, CA, and OV
+        # and the *last one added* of those would overwrite the other
+        # two.  solution: have connect_wrports collate all the
+        # or-tree-reduced bitvector set/clear requests and drop them
+        # in as a single "thing".  this can only be done because the
+        # set/get is an unary bitvector.
+        print ("make write-vecs", regfile, regname, wvset, wvclr)
+        return (wvclren, # clear (regfile write)
+                wvseten) # set (issue time)
+
+    def connect_wrports(self, m, fu_bitdict, fu_selected):
         """connect write ports
 
         orders the write regspecs into a dict-of-dicts, by regfile,
         """connect write ports
 
         orders the write regspecs into a dict-of-dicts, by regfile,
@@ -671,13 +1047,14 @@ class NonProductionCore(ControlBase):
         fus = self.fus.fus
         regs = self.regs
         # dictionary of lists of regfile write ports
         fus = self.fus.fus
         regs = self.regs
         # dictionary of lists of regfile write ports
-        byregfiles_wr, byregfiles_wrspec = self.get_byregfiles(False)
+        byregfiles_wrspec = self.get_byregfiles(m, False)
 
         # same for write ports.
         # BLECH!  complex code-duplication! BLECH!
         wrpickers = {}
 
         # same for write ports.
         # BLECH!  complex code-duplication! BLECH!
         wrpickers = {}
-        for regfile, spec in byregfiles_wr.items():
-            fuspecs = byregfiles_wrspec[regfile]
+        wvclrers = defaultdict(list)
+        wvseters = defaultdict(list)
+        for regfile, fuspecs in byregfiles_wrspec.items():
             wrpickers[regfile] = {}
 
             if self.regreduce_en:
             wrpickers[regfile] = {}
 
             if self.regreduce_en:
@@ -692,61 +1069,91 @@ class NonProductionCore(ControlBase):
                     if 'fast3' in fuspecs:
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
                     if 'fast3' in fuspecs:
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
+            # collate these and record them by regfile because there
+            # are sometimes more write-ports per regfile
             for (regname, fspec) in sort_fuspecs(fuspecs):
             for (regname, fspec) in sort_fuspecs(fuspecs):
-                self.connect_wrport(m, fu_bitdict, wrpickers,
+                wvclren, wvseten = self.connect_wrport(m,
+                                        fu_bitdict, fu_selected,
+                                        wrpickers,
                                         regfile, regname, fspec)
                                         regfile, regname, fspec)
-
-    def get_byregfiles(self, readmode):
+                wvclrers[regfile.lower()] += wvclren
+                wvseters[regfile.lower()] += wvseten
+
+        if not self.make_hazard_vecs:
+            return
+
+        # for write-vectors: reduce the clr-ers and set-ers down to
+        # a single set of bits.  otherwise if there are two write
+        # ports (on some regfiles), the last one doing comb += on
+        # the reg.wv[regfile] instance "wins" (and all others are ignored,
+        # whoops).  if there was only one write-port per wv regfile this would
+        # not be an issue.
+        for regfile in wvclrers.keys():
+            wv = regs.wv[regfile]
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvclren = wvclrers[regfile]
+            wvseten = wvseters[regfile]
+            comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+            comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
+
+    def get_byregfiles(self, m, readmode):
 
         mode = "read" if readmode else "write"
         regs = self.regs
         fus = self.fus.fus
 
         mode = "read" if readmode else "write"
         regs = self.regs
         fus = self.fus.fus
-        e = self.i.e # decoded instruction to execute
+        e = self.ireg.e # decoded instruction to execute
+
+        # dictionary of dictionaries of lists/tuples of regfile ports.
+        # first key: regfile.  second key: regfile port name
+        byregfiles_spec = defaultdict(dict)
 
 
-        # dictionary of lists of regfile ports
-        byregfiles = {}
-        byregfiles_spec = {}
         for (funame, fu) in fus.items():
         for (funame, fu) in fus.items():
+            # create in each FU a receptacle for the read/write register
+            # hazard numbers (and okflags for read).  to be latched in
+            # connect_rd/write_ports
+            if readmode:
+                fu.rd_latches = {} # read reg number latches
+                fu.rf_latches = {} # read flag latches
+            else:
+                fu.wr_latches = {}
+
+            # construct regfile specs: read uses inspec, write outspec
             print("%s ports for %s" % (mode, funame))
             for idx in range(fu.n_src if readmode else fu.n_dst):
             print("%s ports for %s" % (mode, funame))
             for idx in range(fu.n_src if readmode else fu.n_dst):
-                if readmode:
-                    (regfile, regname, wid) = fu.get_in_spec(idx)
-                else:
-                    (regfile, regname, wid) = fu.get_out_spec(idx)
+                (regfile, regname, wid) = fu.get_io_spec(readmode, idx)
                 print("    %d %s %s %s" % (idx, regfile, regname, str(wid)))
                 print("    %d %s %s %s" % (idx, regfile, regname, str(wid)))
-                if readmode:
-                    rdflag, read = regspec_decode_read(e, regfile, regname)
-                    wrport, write = None, None
-                else:
-                    rdflag, read = None, None
-                    wrport, write = regspec_decode_write(e, regfile, regname)
-                if regfile not in byregfiles:
-                    byregfiles[regfile] = {}
-                    byregfiles_spec[regfile] = {}
+
+                # the PowerDecoder2 (main one, not the satellites) contains
+                # the decoded regfile numbers. obtain these now
+                decinfo = regspec_decode(m, readmode, e, regfile, regname)
+                okflag, regport = decinfo.okflag, decinfo.regport
+
+                # construct the dictionary of regspec information by regfile
                 if regname not in byregfiles_spec[regfile]:
                     byregfiles_spec[regfile][regname] = \
                 if regname not in byregfiles_spec[regfile]:
                     byregfiles_spec[regfile][regname] = \
-                        (rdflag, wrport, read, write, wid, [])
-                # here we start to create "lanes"
-                if idx not in byregfiles[regfile]:
-                    byregfiles[regfile][idx] = []
-                fuspec = (funame, fu, idx)
-                byregfiles[regfile][idx].append(fuspec)
-                byregfiles_spec[regfile][regname][5].append(fuspec)
-
-        # ok just print that out, for convenience
-        for regfile, spec in byregfiles.items():
+                        ByRegSpec(okflag, regport, wid, [])
+
+                # here we start to create "lanes" where each Function Unit
+                # requiring access to a given [single-contended resource]
+                # regfile port is appended to a list, so that PriorityPickers
+                # can be created to give uncontested access to it
+                fuspec = FUSpec(funame, fu, idx)
+                byregfiles_spec[regfile][regname].specs.append(fuspec)
+
+        # ok just print that all out, for convenience
+        for regfile, fuspecs in byregfiles_spec.items():
             print("regfile %s ports:" % mode, regfile)
             print("regfile %s ports:" % mode, regfile)
-            fuspecs = byregfiles_spec[regfile]
             for regname, fspec in fuspecs.items():
             for regname, fspec in fuspecs.items():
-                [rdflag, wrflag, read, write, wid, fuspec] = fspec
+                [okflag, regport, wid, fuspecs] = fspec
                 print("  rf %s port %s lane: %s" % (mode, regfile, regname))
                 print("  rf %s port %s lane: %s" % (mode, regfile, regname))
-                print("  %s" % regname, wid, read, write, rdflag, wrflag)
-                for (funame, fu, idx) in fuspec:
+                print("  %s" % regname, wid, okflag, regport)
+                for (funame, fu, idx) in fuspecs:
                     fusig = fu.src_i[idx] if readmode else fu.dest[idx]
                     print("    ", funame, fu.__class__.__name__, idx, fusig)
                     print()
 
                     fusig = fu.src_i[idx] if readmode else fu.dest[idx]
                     print("    ", funame, fu.__class__.__name__, idx, fusig)
                     print()
 
-        return byregfiles, byregfiles_spec
+        return byregfiles_spec
 
     def __iter__(self):
         yield from self.fus.ports()
 
     def __iter__(self):
         yield from self.fus.ports()
@@ -761,7 +1168,8 @@ class NonProductionCore(ControlBase):
 if __name__ == '__main__':
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
 if __name__ == '__main__':
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
+                         allow_overlap=True,
                          mask_wid=8,
                          reg_wid=64)
     dut = NonProductionCore(pspec)
                          mask_wid=8,
                          reg_wid=64)
     dut = NonProductionCore(pspec)