issuer.py: add microwatt_old and microwatt_debug options
[soc.git] / src / soc / simple / core.py
index c47a9464402659851a9e2ceffaa4dd7d1c6a6aac..9a4abacc3135e647ae4be3d9a8b7882e7ce68fe4 100644 (file)
@@ -17,6 +17,8 @@ the brain-dead part of this module is that even though there is no
 conflict of access, regfile read/write hazards are *not* analysed,
 and consequently it is safer to wait for the Function Unit to complete
 before allowing a new instruction to proceed.
+(update: actually this is being added now:
+https://bugs.libre-soc.org/show_bug.cgi?id=737)
 """
 
 from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
@@ -24,8 +26,7 @@ from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
 from nmigen.cli import rtlil
 
 from openpower.decoder.power_decoder2 import PowerDecodeSubset
-from openpower.decoder.power_regspec_map import regspec_decode_read
-from openpower.decoder.power_regspec_map import regspec_decode_write
+from openpower.decoder.power_regspec_map import regspec_decode
 from openpower.sv.svp64 import SVP64Rec
 
 from nmutil.picker import PriorityPicker
@@ -40,11 +41,13 @@ from soc.config.test.test_loadstore import TestMemPspec
 from openpower.decoder.power_enums import MicrOp, Function
 from soc.simple.core_data import CoreInput, CoreOutput
 
-from collections import defaultdict
+from collections import defaultdict, namedtuple
 import operator
 
 from nmutil.util import rising_edge
 
+FUSpec = namedtuple("FUSpec", ["funame", "fu", "idx"])
+ByRegSpec = namedtuple("ByRegSpec", ["okflag", "regport", "wid", "specs"])
 
 # helper function for reducing a list of signals down to a parallel
 # ORed single signal.
@@ -68,6 +71,47 @@ def sort_fuspecs(fuspecs):
     return res  # enumerate(res)
 
 
+# a hazard bitvector "remap" function which returns an AST expression
+# that remaps read/write hazard regfile port numbers to either a full
+# bitvector or a reduced subset one.  SPR for example is reduced to a
+# single bit.
+# CRITICALLY-IMPORTANT NOTE: these bitvectors *have* to match up per
+# regfile!  therefore the remapping is per regfile, *NOT* per regfile
+# port and certainly not based on whether it is a read port or write port.
+# note that any reductions here will result in degraded performance due
+# to conflicts, but at least it keeps the hazard matrix sizes down to "sane"
+def bitvector_remap(regfile, rfile, port):
+    # 8-bits (at the moment, no SVP64), CR is unary: no remap
+    if regfile == 'CR':
+        return port
+    # 3 bits, unary alrady: return the port
+    if regfile == 'XER':
+        return port
+    # 3 bits, unary: return the port
+    if regfile == 'XER':
+        return port
+    # 5 bits, unary: return the port
+    if regfile == 'STATE':
+        return port
+    # 9 bits (9 entries), might be unary already
+    if regfile == 'FAST':
+        if rfile.unary: # FAST might be unary already
+            return port
+        else:
+            return 1 << port
+    # 10 bits (!!) - reduce to one
+    if regfile == 'SPR':
+        if rfile.unary: # FAST might be unary already
+            return port
+        else:
+            return 1 << port
+    if regfile == 'INT':
+        if rfile.unary: # INT, check if unary/binary
+            return port
+        else:
+            return 1 << port
+
+
 # derive from ControlBase rather than have a separate Stage instance,
 # this is simpler to do
 class NonProductionCore(ControlBase):
@@ -88,7 +132,7 @@ class NonProductionCore(ControlBase):
                              (pspec.allow_overlap == True))
 
         # test core type
-        self.make_hazard_vecs = True
+        self.make_hazard_vecs = self.allow_overlap
         self.core_type = "fsm"
         if hasattr(pspec, "core_type"):
             self.core_type = pspec.core_type
@@ -105,14 +149,34 @@ class NonProductionCore(ControlBase):
 
         # link LoadStore1 into MMU
         mmu = self.fus.get_fu('mmu0')
+        ldst0 = self.fus.get_fu('ldst0')
         print ("core pspec", pspec.ldst_ifacetype)
         print ("core mmu", mmu)
         if mmu is not None:
-            print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
-            mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+            lsi = l0.cmpi.lsmem.lsi # a LoadStore1 Interface object
+            print ("core lsmem.lsi", lsi)
+            mmu.alu.set_ldst_interface(lsi)
+            # urr store I-Cache in core so it is easier to get at
+            self.icache = lsi.icache
+
+        # alternative reset values for STATE regs. these probably shouldn't
+        # be set, here, instead have them done by Issuer. which they are.
+        # as well. because core.state overrides them. sigh.
+        self.msr_at_reset = 0x0
+        self.pc_at_reset = 0x0
+        if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int):
+            self.msr_at_reset = pspec.msr_reset
+        if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int):
+            self.pc_at_reset = pspec.pc_reset
+        state_resets = [self.pc_at_reset,  # PC at reset
+                        self.msr_at_reset, # MSR at reset
+                        0x0,               # SVSTATE at reset
+                        0x0,               # DEC at reset
+                        0x0]               # TB at reset
 
         # register files (yes plural)
-        self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs)
+        self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs,
+                                    state_resets=state_resets)
 
         # set up input and output: unusual requirement to set data directly
         # (due to the way that the core is set up in a different domain,
@@ -132,6 +196,9 @@ class NonProductionCore(ControlBase):
         self.decoders = {}
         self.des = {}
 
+        # eep, these should be *per FU* i.e. for FunctionUnitBaseMulti
+        # they should be shared (put into the ALU *once*).
+
         for funame, fu in self.fus.fus.items():
             f_name = fu.fnunit.name
             fnunit = fu.fnunit.value
@@ -140,17 +207,29 @@ class NonProductionCore(ControlBase):
                 # TRAP decoder is the *main* decoder
                 self.trapunit = funame
                 continue
+            assert funame not in self.decoders
             self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
                                                       final=True,
                                                       state=self.ireg.state,
                                             svp64_en=self.svp64_en,
                                             regreduce_en=self.regreduce_en)
             self.des[funame] = self.decoders[funame].do
+            print ("create decoder subset", funame, opkls, self.des[funame])
+
+        # create per-Function Unit write-after-write hazard signals
+        # yes, really, this should have been added in ReservationStations
+        # but hey.
+        for funame, fu in self.fus.fus.items():
+            fu._waw_hazard = Signal(name="waw_%s" % funame)
 
         # share the SPR decoder with the MMU if it exists
         if "mmu0" in self.decoders:
             self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
 
+        # allow pausing of the DEC/TB FSM back in Issuer, by spotting
+        # if there is an MTSPR instruction
+        self.pause_dec_tb = Signal()
+
     # next 3 functions are Stage API Compliance
     def setup(self, m, i):
         pass
@@ -178,6 +257,13 @@ class NonProductionCore(ControlBase):
         regs = self.regs
         fus = self.fus.fus
 
+        # amalgamate write-hazards into a single top-level Signal
+        self.waw_hazard = Signal()
+        whaz = []
+        for funame, fu in self.fus.fus.items():
+            whaz.append(fu._waw_hazard)
+        comb += self.waw_hazard.eq(Cat(*whaz).bool())
+
         # connect decoders
         self.connect_satellite_decoders(m)
 
@@ -208,23 +294,25 @@ class NonProductionCore(ControlBase):
             # connect each satellite decoder and give it the instruction.
             # as subset decoders this massively reduces wire fanout given
             # the large number of ALUs
-            setattr(m.submodules, "dec_%s" % v.fn_name, v)
+            m.submodules["dec_%s" % k] = v
             comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
             comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
             # sigh due to SVP64 RA_OR_ZERO detection connect these too
             comb += v.sv_a_nz.eq(self.ireg.sv_a_nz)
-            if self.svp64_en:
-                comb += v.pred_sm.eq(self.ireg.sv_pred_sm)
-                comb += v.pred_dm.eq(self.ireg.sv_pred_dm)
-                if k != self.trapunit:
-                    comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM
-                    comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode)
-                    # only the LDST PowerDecodeSubset *actually* needs to
-                    # know to use the alternative decoder.  this is all
-                    # a terrible hack
-                    if k.lower().startswith("ldst"):
-                        comb += v.use_svp64_ldst_dec.eq(
-                                        self.ireg.use_svp64_ldst_dec)
+            if not self.svp64_en:
+                continue
+            comb += v.pred_sm.eq(self.ireg.sv_pred_sm)
+            comb += v.pred_dm.eq(self.ireg.sv_pred_dm)
+            if k == self.trapunit:
+                continue
+            comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM
+            comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode)
+            # only the LDST PowerDecodeSubset *actually* needs to
+            # know to use the alternative decoder.  this is all
+            # a terrible hack
+            if not k.lower().startswith("ldst"):
+                continue
+            comb += v.use_svp64_ldst_dec.eq( self.ireg.use_svp64_ldst_dec)
 
     def connect_instruction(self, m):
         """connect_instruction
@@ -311,7 +399,7 @@ class NonProductionCore(ControlBase):
 
         # rdmask, which is for registers needs to come from the *main* decoder
         for funame, fu in fus.items():
-            rdmask = get_rdflags(self.ireg.e, fu)
+            rdmask = get_rdflags(m, self.ireg.e, fu)
             comb += fu.rdmaskn.eq(~rdmask)
 
         # sigh - need a NOP counter
@@ -351,13 +439,31 @@ class NonProductionCore(ControlBase):
                                 # issue, busy, read flags and mask to FU
                                 with m.If(enable):
                                     # operand comes from the *local*  decoder
+                                    # do not actually issue, though, if there
+                                    # is a waw hazard. decoder has to still
+                                    # be asserted in order to detect that, tho
                                     comb += fu.oper_i.eq_from(do)
-                                    comb += fu.issue_i.eq(1) # issue when valid
+                                    if funame == 'mmu0':
+                                        # URRR this is truly dreadful.
+                                        # OP_FETCH_FAILED is a "fake" op.
+                                        # no instruction creates it.  OP_TRAP
+                                        # uses the *main* decoder: this is
+                                        # a *Satellite* decoder that reacts
+                                        # on *insn_in*... not fake ops. gaah.
+                                        main_op = self.ireg.e.do
+                                        with m.If(main_op.insn_type ==
+                                                  MicrOp.OP_FETCH_FAILED):
+                                            comb += fu.oper_i.insn_type.eq(
+                                                  MicrOp.OP_FETCH_FAILED)
+                                            comb += fu.oper_i.fn_unit.eq(
+                                                  Function.MMU)
+                                    # issue when valid (and no write-hazard)
+                                    comb += fu.issue_i.eq(~self.waw_hazard)
                                     # instruction ok, indicate ready
                                     comb += self.p.o_ready.eq(1)
 
                             if self.allow_overlap:
-                                with m.If(~fu_found):
+                                with m.If(~fu_found | self.waw_hazard):
                                     # latch copy of instruction
                                     sync += ilatch.eq(self.i)
                                     comb += self.p.o_ready.eq(1) # accept
@@ -379,16 +485,23 @@ class NonProductionCore(ControlBase):
                         # run this FunctionUnit if enabled route op,
                         # issue, busy, read flags and mask to FU
                         with m.If(enable):
-                            # operand comes from the *local*  decoder
+                            # operand comes from the *local* decoder,
+                            # which is asserted even if not issued,
+                            # so that WaW-detection can check for hazards.
+                            # only if the waw hazard is clear does the
+                            # instruction actually get issued
                             comb += fu.oper_i.eq_from(do)
-                            comb += fu.issue_i.eq(1) # issue when valid
-                            comb += self.p.o_ready.eq(1)
-                            comb += busy_o.eq(0)
-                            m.next = "READY"
+                            # issue when valid
+                            comb += fu.issue_i.eq(~self.waw_hazard)
+                            with m.If(~self.waw_hazard):
+                                comb += self.p.o_ready.eq(1)
+                                comb += busy_o.eq(0)
+                                m.next = "READY"
 
         print ("core: overlap allowed", self.allow_overlap)
-        busys = map(lambda fu: fu.busy_o, fus.values())
-        comb += any_busy_o.eq(Cat(*busys).bool())
+        # true when any FU is busy (including the cycle where it is perhaps
+        # to be issued - because that's what fu_busy is)
+        comb += any_busy_o.eq(fu_busy.bool())
         if not self.allow_overlap:
             # for simple non-overlap, if any instruction is busy, set
             # busy output for core.
@@ -398,6 +511,26 @@ class NonProductionCore(ControlBase):
             # and resolved
             with m.If(self.issue_conflict):
                 comb += busy_o.eq(1)
+            # make sure that LDST, SPR, MMU, Branch and Trap all say "busy"
+            # and do not allow overlap.  these are all the ones that
+            # are non-forward-progressing: exceptions etc. that otherwise
+            # change CoreState for some reason (MSR, PC, SVSTATE)
+            for funame, fu in fus.items():
+                if (funame.lower().startswith('ldst') or
+                    funame.lower().startswith('branch') or
+                    funame.lower().startswith('mmu') or
+                    funame.lower().startswith('spr') or
+                    funame.lower().startswith('trap')):
+                    with m.If(fu.busy_o):
+                        comb += busy_o.eq(1)
+                # for SPR pipeline pause dec/tb FSM to avoid race condition
+                # TODO: really this should be much more sophisticated,
+                # spot MTSPR, spot that DEC/TB is what is to be updated.
+                # a job for PowerDecoder2, there
+                if funame.lower().startswith('spr'):
+                    with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR
+                        ):
+                        comb += self.pause_dec_tb.eq(1)
 
         # return both the function unit "enable" dict as well as the "busy".
         # the "busy-or-issued" can be passed in to the Read/Write port
@@ -421,7 +554,7 @@ class NonProductionCore(ControlBase):
         # for checking if the read port has an outstanding write
         if self.make_hazard_vecs:
             wv = regs.wv[regfile.lower()]
-            wvchk = wv.r_ports["issue"] # write-vec bit-level hazard check
+            wvchk = wv.q_int # write-vec bit-level hazard check
 
         # if a hazard is detected on this read port, simply blithely block
         # every FU from reading on it.  this is complete overkill but very
@@ -437,38 +570,53 @@ class NonProductionCore(ControlBase):
         ppoffs = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, wf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, fspec, len(fuspec))
+            print ("fpsec", i, fspec, len(fspec.specs))
+            name = "%s_%s_%d" % (regfile, regname, i)
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
-            name = "rdflag_%s_%s_%d" % (regfile, regname, i)
-            rdflag = Signal(name=name, reset_less=True)
-            comb += rdflag.eq(rf)
+            pplen += len(fspec.specs)
+            rdflag = Signal(name="rdflag_"+name, reset_less=True)
+            comb += rdflag.eq(fspec.okflag)
             rdflags.append(rdflag)
 
         print ("pplen", pplen)
 
         # create a priority picker to manage this port
         rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
-        setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick)
+        m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick
 
         rens = []
         addrs = []
         wvens = []
 
         for i, fspec in enumerate(fspecs):
-            (rf, wf, _read, _write, wid, fuspec) = fspec
+            (rf, _read, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
             # connect up the FU req/go signals, and the reg-read to the FU
             # and create a Read Broadcast Bus
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
                 pi += ppoffs[i]
                 name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
                 fu_active = fu_selected[funame]
                 fu_issued = fu_bitdict[funame]
 
                 # get (or set up) a latched copy of read register number
+                # and (sigh) also the read-ok flag
+                # TODO: use nmutil latchregister
+                rhname = "%s_%s_%d" % (regfile, regname, i)
+                rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname),
+                                reset_less=True)
+                if rhname not in fu.rf_latches:
+                    rfl = Signal(name="rdflag_latch_%s_%s" % (funame, rhname))
+                    fu.rf_latches[rhname] = rfl
+                    with m.If(fu.issue_i):
+                        sync += rfl.eq(rdflags[i])
+                else:
+                    rfl = fu.rf_latches[rhname]
+
+                # now the register port
                 rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
-                read = Signal.like(_read, name="read_"+name)
+                read = Signal.like(_read, name="read_"+rname)
                 if rname not in fu.rd_latches:
                     rdl = Signal.like(_read, name="rdlatch_"+rname)
                     fu.rd_latches[rname] = rdl
@@ -476,23 +624,28 @@ class NonProductionCore(ControlBase):
                         sync += rdl.eq(_read)
                 else:
                     rdl = fu.rd_latches[rname]
-                # latch to make the read immediately available on issue cycle
-                # after the read cycle, use the latched copy
+
+                # make the read immediately available on issue cycle
+                # after the read cycle, otherwies use the latched copy.
+                # this captures the regport and okflag on issue
                 with m.If(fu.issue_i):
                     comb += read.eq(_read)
+                    comb += rdflag.eq(rdflags[i])
                 with m.Else():
                     comb += read.eq(rdl)
+                    comb += rdflag.eq(rfl)
 
                 # connect request-read to picker input, and output to go-rd
                 addr_en = Signal.like(read, name="addr_en_"+name)
                 pick = Signal(name="pick_"+name)     # picker input
                 rp = Signal(name="rp_"+name)         # picker output
                 delay_pick = Signal(name="dp_"+name) # read-enable "underway"
+                rhazard = Signal(name="rhaz_"+name)
 
                 # exclude any currently-enabled read-request (mask out active)
-                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
-                                ~delay_pick & ~hazard_detected)
                 # entirely block anything hazarded from being picked
+                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag &
+                                ~delay_pick & ~rhazard)
                 comb += rdpick.i[pi].eq(pick)
                 comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
 
@@ -523,10 +676,10 @@ class NonProductionCore(ControlBase):
                     continue
 
                 # read the write-hazard bitvector (wv) for any bit that is
-                wvchk_en = Signal(len(wvchk.ren), name="wv_chk_addr_en_"+name)
+                wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name)
                 issue_active = Signal(name="rd_iactive_"+name)
                 # XXX combinatorial loop here
-                comb += issue_active.eq(fu_active & rf)
+                comb += issue_active.eq(fu_active & rdflag)
                 with m.If(issue_active):
                     if rfile.unary:
                         comb += wvchk_en.eq(read)
@@ -534,10 +687,15 @@ class NonProductionCore(ControlBase):
                         comb += wvchk_en.eq(1<<read)
                 # if FU is busy (which doesn't get set at the same time as
                 # issue) and no hazard was detected, clear wvchk_en (i.e.
-                # stop checking for hazards)
-                with m.If(fu.busy_o & ~hazard_detected):
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~rhazard):
                         comb += wvchk_en.eq(0)
 
+                # read-hazard is ANDed with (filtered by) what is actually
+                # being requested.
+                comb += rhazard.eq((wvchk & wvchk_en).bool())
+
                 wvens.append(wvchk_en)
 
         # or-reduce the muxed read signals
@@ -555,8 +713,9 @@ class NonProductionCore(ControlBase):
 
         # enable the read bitvectors for this issued instruction
         # and return whether any write-hazard bit is set
-        comb += wvchk.ren.eq(ortreereduce_sig(wvens))
-        comb += hazard_detected.eq(wvchk.o_data.bool())
+        wvchk_and = Signal(len(wvchk), name="wv_chk_"+name)
+        comb += wvchk_and.eq(wvchk & ortreereduce_sig(wvens))
+        comb += hazard_detected.eq(wvchk_and.bool())
         return hazard_detected
 
     def connect_rdports(self, m, fu_bitdict, fu_selected):
@@ -572,13 +731,12 @@ class NonProductionCore(ControlBase):
         rd_hazard = []
 
         # dictionary of lists of regfile read ports
-        byregfiles_rd, byregfiles_rdspec = self.get_byregfiles(True)
+        byregfiles_rdspec = self.get_byregfiles(m, True)
 
         # okaay, now we need a PriorityPicker per regfile per regfile port
         # loootta pickers... peter piper picked a pack of pickled peppers...
         rdpickers = {}
-        for regfile, spec in byregfiles_rd.items():
-            fuspecs = byregfiles_rdspec[regfile]
+        for regfile, fuspecs in byregfiles_rdspec.items():
             rdpickers[regfile] = {}
 
             # argh.  an experiment to merge RA and RB in the INT regfile
@@ -629,11 +787,14 @@ class NonProductionCore(ControlBase):
         # the hazard)
 
         # the detection of what shall be written to is based
-        # on *issue*
+        # on *issue*.  it is delayed by 1 cycle so that instructions
+        # "addi 5,5,0x2" do not cause combinatorial loops due to
+        # fake-dependency on *themselves*.  this will totally fail
+        # spectacularly when doing multi-issue
         print ("write vector (for regread)", regfile, wvset)
-        wviaddr_en = Signal(len(wvset.wen), name="wv_issue_addr_en_"+name)
+        wviaddr_en = Signal(len(wvset), name="wv_issue_addr_en_"+name)
         issue_active = Signal(name="iactive_"+name)
-        comb += issue_active.eq(fu.issue_i & fu_active & wrflag)
+        sync += issue_active.eq(fu.issue_i & fu_active & wrflag)
         with m.If(issue_active):
             if rfile.unary:
                 comb += wviaddr_en.eq(write)
@@ -643,7 +804,7 @@ class NonProductionCore(ControlBase):
         # deal with write vector clear: this kicks in when the regfile
         # is written to, and clears the corresponding bitvector entry
         print ("write vector", regfile, wvclr)
-        wvaddr_en = Signal(len(wvclr.wen), name="wvaddr_en_"+name)
+        wvaddr_en = Signal(len(wvclr), name="wvaddr_en_"+name)
         if rfile.unary:
             comb += wvaddr_en.eq(addr_en)
         else:
@@ -693,8 +854,9 @@ class NonProductionCore(ControlBase):
         # to RAISE the bitvector (set it to 1), which, duh, requires a WRITE
         if self.make_hazard_vecs:
             wv = regs.wv[regfile.lower()]
-            wvset = wv.w_ports["set"] # write-vec bit-level hazard ctrl
-            wvclr = wv.w_ports["clr"] # write-vec bit-level hazard ctrl
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvchk = wv.q # write-after-write hazard check
 
         fspecs = fspec
         if not isinstance(fspecs, list):
@@ -703,53 +865,56 @@ class NonProductionCore(ControlBase):
         pplen = 0
         writes = []
         ppoffs = []
-        rdflags = []
         wrflags = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, wf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, "wrflag", wf, fspec, len(fuspec))
+            (wf, _write, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+            print ("fpsec", i, "wrflag", wf, fspec, len(fuspecs))
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
+            pplen += len(fuspecs)
 
             name = "%s_%s_%d" % (regfile, regname, i)
-            rdflag = Signal(name="rd_flag_"+name)
             wrflag = Signal(name="wr_flag_"+name)
-            if rf is not None:
-                comb += rdflag.eq(rf)
-            else:
-                comb += rdflag.eq(0)
             if wf is not None:
                 comb += wrflag.eq(wf)
             else:
                 comb += wrflag.eq(0)
-            rdflags.append(rdflag)
             wrflags.append(wrflag)
 
         # create a priority picker to manage this port
         wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
-        setattr(m.submodules, "wrpick_%s_%s" % (regfile, rpidx), wrpick)
+        m.submodules["wrpick_%s_%s" % (regfile, rpidx)] = wrpick
 
         wsigs = []
         wens = []
         wvsets = []
         wvseten = []
         wvclren = []
+        #wvens = [] - not needed: reading of writevec is permanently held hi
         addrs = []
         for i, fspec in enumerate(fspecs):
             # connect up the FU req/go signals and the reg-read to the FU
             # these are arbitrated by Data.ok signals
-            (rf, wf, read, _write, wid, fuspec) = fspec
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            (wf, _write, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
+                fu_requested = fu_bitdict[funame]
                 pi += ppoffs[i]
                 name = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
                 # get (or set up) a write-latched copy of write register number
                 write = Signal.like(_write, name="write_"+name)
-                rname = "%s_%s_%s" % (funame, regfile, regname)
+                rname = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
                 if rname not in fu.wr_latches:
                     wrl = Signal.like(_write, name="wrlatch_"+rname)
                     fu.wr_latches[rname] = write
-                    with m.If(fu.issue_i):
+                    # do not depend on fu.issue_i here, it creates a
+                    # combinatorial loop on waw checking. using the FU
+                    # "enable" bitdict entry for this FU is sufficient,
+                    # because the PowerDecoder2 read/write nums are
+                    # valid continuously when the instruction is valid
+                    with m.If(fu_requested):
                         sync += wrl.eq(_write)
                         comb += write.eq(_write)
                     with m.Else():
@@ -760,8 +925,8 @@ class NonProductionCore(ControlBase):
                 # write-request comes from dest.ok
                 dest = fu.get_out(idx)
                 fu_dest_latch = fu.get_fu_out(idx)  # latched output
-                name = "fu_wrok_%s_%s_%d" % (funame, regname, idx)
-                fu_wrok = Signal(name=name, reset_less=True)
+                name = "%s_%s_%d" % (funame, regname, idx)
+                fu_wrok = Signal(name="fu_wrok_"+name, reset_less=True)
                 comb += fu_wrok.eq(dest.ok & fu.busy_o)
 
                 # connect request-write to picker input, and output to go-wr
@@ -803,7 +968,44 @@ class NonProductionCore(ControlBase):
                 wvaddr_en, wv_issue_en = res
                 wvclren.append(wvaddr_en)   # set only: no data => clear bit
                 wvseten.append(wv_issue_en) # set data same as enable
-                wvsets.append(wv_issue_en)  # because enable needs a 1
+
+                # read the write-hazard bitvector (wv) for any bit that is
+                fu_requested = fu_bitdict[funame]
+                wvchk_en = Signal(len(wvchk), name="waw_chk_addr_en_"+name)
+                issue_active = Signal(name="waw_iactive_"+name)
+                whazard = Signal(name="whaz_"+name)
+                if wf is None:
+                    # XXX EEK! STATE regfile (branch) does not have an
+                    # write-active indicator in regspec_decode_write()
+                    print ("XXX FIXME waw_iactive", issue_active,
+                                                    fu_requested, wf)
+                else:
+                    # check bits from the incoming instruction.  note (back
+                    # in connect_instruction) that the decoder is held for
+                    # us to be able to do this, here... *without* issue being
+                    # held HI.  we MUST NOT gate this with fu.issue_i or
+                    # with fu_bitdict "enable": it would create a loop
+                    comb += issue_active.eq(wf)
+                with m.If(issue_active):
+                    if rfile.unary:
+                        comb += wvchk_en.eq(write)
+                    else:
+                        comb += wvchk_en.eq(1<<write)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~whazard):
+                        comb += wvchk_en.eq(0)
+
+                # write-hazard is ANDed with (filtered by) what is actually
+                # being requested.  the wvchk data is on a one-clock delay,
+                # and wvchk_en comes directly from the main decoder
+                comb += whazard.eq((wvchk & wvchk_en).bool())
+                with m.If(whazard):
+                    comb += fu._waw_hazard.eq(1)
+
+                #wvens.append(wvchk_en)
 
         # here is where we create the Write Broadcast Bus. simple, eh?
         comb += wport.i_data.eq(ortreereduce_sig(wsigs))
@@ -816,12 +1018,19 @@ class NonProductionCore(ControlBase):
             comb += wport.wen.eq(ortreereduce_sig(wens))
 
         if not self.make_hazard_vecs:
-            return
-
-        # for write-vectors
-        comb += wvclr.wen.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
-        comb += wvset.wen.eq(ortreereduce_sig(wvseten)) # set (issue time)
-        comb += wvset.i_data.eq(ortreereduce_sig(wvsets))
+            return [], []
+
+        # return these here rather than set wvclr/wvset directly,
+        # because there may be more than one write-port to a given
+        # regfile.  example: XER has a write-port for SO, CA, and OV
+        # and the *last one added* of those would overwrite the other
+        # two.  solution: have connect_wrports collate all the
+        # or-tree-reduced bitvector set/clear requests and drop them
+        # in as a single "thing".  this can only be done because the
+        # set/get is an unary bitvector.
+        print ("make write-vecs", regfile, regname, wvset, wvclr)
+        return (wvclren, # clear (regfile write)
+                wvseten) # set (issue time)
 
     def connect_wrports(self, m, fu_bitdict, fu_selected):
         """connect write ports
@@ -838,13 +1047,14 @@ class NonProductionCore(ControlBase):
         fus = self.fus.fus
         regs = self.regs
         # dictionary of lists of regfile write ports
-        byregfiles_wr, byregfiles_wrspec = self.get_byregfiles(False)
+        byregfiles_wrspec = self.get_byregfiles(m, False)
 
         # same for write ports.
         # BLECH!  complex code-duplication! BLECH!
         wrpickers = {}
-        for regfile, spec in byregfiles_wr.items():
-            fuspecs = byregfiles_wrspec[regfile]
+        wvclrers = defaultdict(list)
+        wvseters = defaultdict(list)
+        for regfile, fuspecs in byregfiles_wrspec.items():
             wrpickers[regfile] = {}
 
             if self.regreduce_en:
@@ -859,88 +1069,91 @@ class NonProductionCore(ControlBase):
                     if 'fast3' in fuspecs:
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
+            # collate these and record them by regfile because there
+            # are sometimes more write-ports per regfile
             for (regname, fspec) in sort_fuspecs(fuspecs):
-                self.connect_wrport(m, fu_bitdict, fu_selected, wrpickers,
+                wvclren, wvseten = self.connect_wrport(m,
+                                        fu_bitdict, fu_selected,
+                                        wrpickers,
                                         regfile, regname, fspec)
+                wvclrers[regfile.lower()] += wvclren
+                wvseters[regfile.lower()] += wvseten
 
-    def get_byregfiles(self, readmode):
+        if not self.make_hazard_vecs:
+            return
+
+        # for write-vectors: reduce the clr-ers and set-ers down to
+        # a single set of bits.  otherwise if there are two write
+        # ports (on some regfiles), the last one doing comb += on
+        # the reg.wv[regfile] instance "wins" (and all others are ignored,
+        # whoops).  if there was only one write-port per wv regfile this would
+        # not be an issue.
+        for regfile in wvclrers.keys():
+            wv = regs.wv[regfile]
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvclren = wvclrers[regfile]
+            wvseten = wvseters[regfile]
+            comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+            comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
+
+    def get_byregfiles(self, m, readmode):
 
         mode = "read" if readmode else "write"
         regs = self.regs
         fus = self.fus.fus
         e = self.ireg.e # decoded instruction to execute
 
-        # dictionary of dictionaries of lists of regfile ports.
+        # dictionary of dictionaries of lists/tuples of regfile ports.
         # first key: regfile.  second key: regfile port name
-        byregfiles = defaultdict(dict)
         byregfiles_spec = defaultdict(dict)
 
         for (funame, fu) in fus.items():
             # create in each FU a receptacle for the read/write register
-            # hazard numbers.  to be latched in connect_rd/write_ports
-            # XXX better that this is moved into the actual FUs, but
-            # the issue there is that this function is actually better
-            # suited at the moment
+            # hazard numbers (and okflags for read).  to be latched in
+            # connect_rd/write_ports
             if readmode:
-                fu.rd_latches = {}
+                fu.rd_latches = {} # read reg number latches
+                fu.rf_latches = {} # read flag latches
             else:
                 fu.wr_latches = {}
 
+            # construct regfile specs: read uses inspec, write outspec
             print("%s ports for %s" % (mode, funame))
             for idx in range(fu.n_src if readmode else fu.n_dst):
-                # construct regfile specs: read uses inspec, write outspec
-                if readmode:
-                    (regfile, regname, wid) = fu.get_in_spec(idx)
-                else:
-                    (regfile, regname, wid) = fu.get_out_spec(idx)
+                (regfile, regname, wid) = fu.get_io_spec(readmode, idx)
                 print("    %d %s %s %s" % (idx, regfile, regname, str(wid)))
 
                 # the PowerDecoder2 (main one, not the satellites) contains
                 # the decoded regfile numbers. obtain these now
-                if readmode:
-                    rdflag, read = regspec_decode_read(e, regfile, regname)
-                    wrport, write = None, None
-                else:
-                    rdflag, read = None, None
-                    wrport, write = regspec_decode_write(e, regfile, regname)
+                decinfo = regspec_decode(m, readmode, e, regfile, regname)
+                okflag, regport = decinfo.okflag, decinfo.regport
 
                 # construct the dictionary of regspec information by regfile
                 if regname not in byregfiles_spec[regfile]:
                     byregfiles_spec[regfile][regname] = \
-                        (rdflag, wrport, read, write, wid, [])
-                # here we start to create "lanes"
-                if idx not in byregfiles[regfile]:
-                    byregfiles[regfile][idx] = []
-                fuspec = (funame, fu, idx)
-                byregfiles[regfile][idx].append(fuspec)
-                byregfiles_spec[regfile][regname][5].append(fuspec)
+                        ByRegSpec(okflag, regport, wid, [])
 
-                continue
-                # append a latch Signal to the FU's list of latches
-                rname = "%s_%s" % (regfile, regname)
-                if readmode:
-                    if rname not in fu.rd_latches:
-                        rdl = Signal.like(read, name="rdlatch_"+rname)
-                        fu.rd_latches[rname] = rdl
-                else:
-                    if rname not in fu.wr_latches:
-                        wrl = Signal.like(write, name="wrlatch_"+rname)
-                        fu.wr_latches[rname] = wrl
+                # here we start to create "lanes" where each Function Unit
+                # requiring access to a given [single-contended resource]
+                # regfile port is appended to a list, so that PriorityPickers
+                # can be created to give uncontested access to it
+                fuspec = FUSpec(funame, fu, idx)
+                byregfiles_spec[regfile][regname].specs.append(fuspec)
 
         # ok just print that all out, for convenience
-        for regfile, spec in byregfiles.items():
+        for regfile, fuspecs in byregfiles_spec.items():
             print("regfile %s ports:" % mode, regfile)
-            fuspecs = byregfiles_spec[regfile]
             for regname, fspec in fuspecs.items():
-                [rdflag, wrflag, read, write, wid, fuspec] = fspec
+                [okflag, regport, wid, fuspecs] = fspec
                 print("  rf %s port %s lane: %s" % (mode, regfile, regname))
-                print("  %s" % regname, wid, read, write, rdflag, wrflag)
-                for (funame, fu, idx) in fuspec:
+                print("  %s" % regname, wid, okflag, regport)
+                for (funame, fu, idx) in fuspecs:
                     fusig = fu.src_i[idx] if readmode else fu.dest[idx]
                     print("    ", funame, fu.__class__.__name__, idx, fusig)
                     print()
 
-        return byregfiles, byregfiles_spec
+        return byregfiles_spec
 
     def __iter__(self):
         yield from self.fus.ports()
@@ -955,7 +1168,8 @@ class NonProductionCore(ControlBase):
 if __name__ == '__main__':
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
+                         allow_overlap=True,
                          mask_wid=8,
                          reg_wid=64)
     dut = NonProductionCore(pspec)