fix write-after-write hazard checking
[soc.git] / src / soc / simple / core.py
index 604624c4b9d59c6598273006ad849624fc4c537b..debff9da1d6b37ea55dcb018bc9476b5fe25a25f 100644 (file)
@@ -17,9 +17,12 @@ the brain-dead part of this module is that even though there is no
 conflict of access, regfile read/write hazards are *not* analysed,
 and consequently it is safer to wait for the Function Unit to complete
 before allowing a new instruction to proceed.
+(update: actually this is being added now:
+https://bugs.libre-soc.org/show_bug.cgi?id=737)
 """
 
-from nmigen import Elaboratable, Module, Signal, ResetSignal, Cat, Mux
+from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
+                    Const)
 from nmigen.cli import rtlil
 
 from openpower.decoder.power_decoder2 import PowerDecodeSubset
@@ -39,11 +42,14 @@ from soc.config.test.test_loadstore import TestMemPspec
 from openpower.decoder.power_enums import MicrOp, Function
 from soc.simple.core_data import CoreInput, CoreOutput
 
-from collections import defaultdict
+from collections import defaultdict, namedtuple
 import operator
 
 from nmutil.util import rising_edge
 
+FUSpec = namedtuple("FUSpec", ["funame", "fu", "idx"])
+ByRegSpec = namedtuple("ByRegSpec", ["rdport", "wrport", "read",
+                                     "write", "wid", "specs"])
 
 # helper function for reducing a list of signals down to a parallel
 # ORed single signal.
@@ -87,7 +93,7 @@ class NonProductionCore(ControlBase):
                              (pspec.allow_overlap == True))
 
         # test core type
-        self.make_hazard_vecs = True
+        self.make_hazard_vecs = self.allow_overlap
         self.core_type = "fsm"
         if hasattr(pspec, "core_type"):
             self.core_type = pspec.core_type
@@ -146,6 +152,12 @@ class NonProductionCore(ControlBase):
                                             regreduce_en=self.regreduce_en)
             self.des[funame] = self.decoders[funame].do
 
+        # create per-Function Unit write-after-write hazard signals
+        # yes, really, this should have been added in ReservationStations
+        # but hey.
+        for funame, fu in self.fus.fus.items():
+            fu._waw_hazard = Signal(name="waw_%s" % funame)
+
         # share the SPR decoder with the MMU if it exists
         if "mmu0" in self.decoders:
             self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
@@ -177,6 +189,13 @@ class NonProductionCore(ControlBase):
         regs = self.regs
         fus = self.fus.fus
 
+        # amalgamate write-hazards into a single top-level Signal
+        self.waw_hazard = Signal()
+        whaz = []
+        for funame, fu in self.fus.fus.items():
+            whaz.append(fu._waw_hazard)
+        comb += self.waw_hazard.eq(Cat(*whaz).bool())
+
         # connect decoders
         self.connect_satellite_decoders(m)
 
@@ -184,11 +203,12 @@ class NonProductionCore(ControlBase):
         self.des[self.trapunit] = self.ireg.e.do
 
         # connect up Function Units, then read/write ports, and hazard conflict
-        issue_conflict = Signal()
-        fu_bitdict, fu_selected = self.connect_instruction(m, issue_conflict)
-        raw_hazard = self.connect_rdports(m, fu_selected)
-        self.connect_wrports(m, fu_selected)
-        comb += issue_conflict.eq(raw_hazard)
+        self.issue_conflict = Signal()
+        fu_bitdict, fu_selected = self.connect_instruction(m)
+        raw_hazard = self.connect_rdports(m, fu_bitdict, fu_selected)
+        self.connect_wrports(m, fu_bitdict, fu_selected)
+        if self.allow_overlap:
+            comb += self.issue_conflict.eq(raw_hazard)
 
         # note if an exception happened.  in a pipelined or OoO design
         # this needs to be accompanied by "shadowing" (or stalling)
@@ -206,7 +226,7 @@ class NonProductionCore(ControlBase):
             # connect each satellite decoder and give it the instruction.
             # as subset decoders this massively reduces wire fanout given
             # the large number of ALUs
-            setattr(m.submodules, "dec_%s" % v.fn_name, v)
+            m.submodules["dec_%s" % v.fn_name] = v
             comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
             comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
             # sigh due to SVP64 RA_OR_ZERO detection connect these too
@@ -224,7 +244,7 @@ class NonProductionCore(ControlBase):
                         comb += v.use_svp64_ldst_dec.eq(
                                         self.ireg.use_svp64_ldst_dec)
 
-    def connect_instruction(self, m, issue_conflict):
+    def connect_instruction(self, m):
         """connect_instruction
 
         uses decoded (from PowerOp) function unit information from CSV files
@@ -241,6 +261,7 @@ class NonProductionCore(ControlBase):
 
         # indicate if core is busy
         busy_o = self.o.busy_o
+        any_busy_o = self.o.any_busy_o
 
         # connect up temporary copy of incoming instruction. the FSM will
         # either blat the incoming instruction (if valid) into self.ireg
@@ -290,7 +311,8 @@ class NonProductionCore(ControlBase):
                 fnunit = fu.fnunit.value
                 en_req = Signal(name="issue_en_%s" % funame, reset_less=True)
                 fnmatch = (self.ireg.e.do.fn_unit & fnunit).bool()
-                comb += en_req.eq(fnmatch & ~fu.busy_o & self.instr_active)
+                comb += en_req.eq(fnmatch & ~fu.busy_o &
+                                    self.instr_active)
                 i_l.append(en_req) # store in list for doing the Cat-trick
                 # picker output, gated by enable: store in fu_bitdict
                 po = Signal(name="o_issue_pick_"+funame) # picker output
@@ -300,11 +322,16 @@ class NonProductionCore(ControlBase):
                 # if we don't do this, then when there are no FUs available,
                 # the "p.o_ready" signal will go back "ok we accepted this
                 # instruction" which of course isn't true.
-                with m.If(~issue_conflict & i_pp.en_o):
+                with m.If(i_pp.en_o):
                     comb += fu_found.eq(1)
             # for each input, Cat them together and drop them into the picker
             comb += i_pp.i.eq(Cat(*i_l))
 
+        # rdmask, which is for registers needs to come from the *main* decoder
+        for funame, fu in fus.items():
+            rdmask = get_rdflags(self.ireg.e, fu)
+            comb += fu.rdmaskn.eq(~rdmask)
+
         # sigh - need a NOP counter
         counter = Signal(2)
         with m.If(counter != 0):
@@ -317,12 +344,9 @@ class NonProductionCore(ControlBase):
         # always say "ready" except if overridden
         comb += self.p.o_ready.eq(1)
 
-        l_issue_conflict = Signal()
-
         with m.FSM():
             with m.State("READY"):
                 with m.If(self.p.i_valid): # run only when valid
-                    comb += self.instr_active.eq(1)
                     with m.Switch(self.ireg.e.do.insn_type):
                         # check for ATTN: halt if true
                         with m.Case(MicrOp.OP_ATTN):
@@ -334,6 +358,7 @@ class NonProductionCore(ControlBase):
                             comb += busy_o.eq(1)
 
                         with m.Default():
+                            comb += self.instr_active.eq(1)
                             comb += self.p.o_ready.eq(0)
                             # connect instructions. only one enabled at a time
                             for funame, fu in fus.items():
@@ -342,35 +367,32 @@ class NonProductionCore(ControlBase):
 
                                 # run this FunctionUnit if enabled route op,
                                 # issue, busy, read flags and mask to FU
-                                with m.If(enable & fu_found):
+                                with m.If(enable):
                                     # operand comes from the *local*  decoder
+                                    # do not actually issue, though, if there
+                                    # is a waw hazard. decoder has to still
+                                    # be asserted in order to detect that, tho
                                     comb += fu.oper_i.eq_from(do)
-                                    comb += fu.issue_i.eq(1) # issue when valid
-                                    # rdmask, which is for registers,
-                                    # needs to come
-                                    # from the *main* decoder
-                                    rdmask = get_rdflags(self.ireg.e, fu)
-                                    comb += fu.rdmaskn.eq(~rdmask)
+                                    # issue when valid (and no write-hazard)
+                                    comb += fu.issue_i.eq(~self.waw_hazard)
                                     # instruction ok, indicate ready
                                     comb += self.p.o_ready.eq(1)
 
-                            with m.If(~fu_found):
-                                # latch copy of instruction
-                                sync += ilatch.eq(self.i)
-                                sync += l_issue_conflict.eq(issue_conflict)
-                                comb += self.p.o_ready.eq(1) # accept
-                                comb += busy_o.eq(1)
-                                m.next = "WAITING"
+                            if self.allow_overlap:
+                                with m.If(~fu_found | self.waw_hazard):
+                                    # latch copy of instruction
+                                    sync += ilatch.eq(self.i)
+                                    comb += self.p.o_ready.eq(1) # accept
+                                    comb += busy_o.eq(1)
+                                    m.next = "WAITING"
 
             with m.State("WAITING"):
                 comb += self.instr_active.eq(1)
-                with m.If(fu_found):
-                    sync += l_issue_conflict.eq(0)
                 comb += self.p.o_ready.eq(0)
                 comb += busy_o.eq(1)
                 # using copy of instruction, keep waiting until an FU is free
                 comb += self.ireg.eq(ilatch)
-                with m.If(~l_issue_conflict): # wait for conflict to clear
+                with m.If(fu_found): # wait for conflict to clear
                     # connect instructions. only one enabled at a time
                     for funame, fu in fus.items():
                         do = self.des[funame]
@@ -379,31 +401,52 @@ class NonProductionCore(ControlBase):
                         # run this FunctionUnit if enabled route op,
                         # issue, busy, read flags and mask to FU
                         with m.If(enable):
-                            # operand comes from the *local*  decoder
+                            # operand comes from the *local* decoder,
+                            # which is asserted even if not issued,
+                            # so that WaW-detection can check for hazards.
+                            # only if the waw hazard is clear does the
+                            # instruction actually get issued
                             comb += fu.oper_i.eq_from(do)
-                            comb += fu.issue_i.eq(1) # issue when valid
-                            # rdmask, which is for registers,
-                            # needs to come
-                            # from the *main* decoder
-                            rdmask = get_rdflags(self.ireg.e, fu)
-                            comb += fu.rdmaskn.eq(~rdmask)
-                            comb += self.p.o_ready.eq(1)
-                            comb += busy_o.eq(0)
-                            m.next = "READY"
+                            # issue when valid
+                            comb += fu.issue_i.eq(~self.waw_hazard)
+                            with m.If(~self.waw_hazard):
+                                comb += self.p.o_ready.eq(1)
+                                comb += busy_o.eq(0)
+                                m.next = "READY"
 
         print ("core: overlap allowed", self.allow_overlap)
+        # true when any FU is busy (including the cycle where it is perhaps
+        # to be issued - because that's what fu_busy is)
+        comb += any_busy_o.eq(fu_busy.bool())
         if not self.allow_overlap:
             # for simple non-overlap, if any instruction is busy, set
             # busy output for core.
-            busys = map(lambda fu: fu.busy_o, fus.values())
-            comb += busy_o.eq(Cat(*busys).bool())
+            comb += busy_o.eq(any_busy_o)
+        else:
+            # sigh deal with a fun situation that needs to be investigated
+            # and resolved
+            with m.If(self.issue_conflict):
+                comb += busy_o.eq(1)
+            # make sure that LDST, SPR, MMU, Branch and Trap all say "busy"
+            # and do not allow overlap.  these are all the ones that
+            # are non-forward-progressing: exceptions etc. that otherwise
+            # change CoreState for some reason (MSR, PC, SVSTATE)
+            for funame, fu in fus.items():
+                if (funame.lower().startswith('ldst') or
+                    funame.lower().startswith('branch') or
+                    funame.lower().startswith('mmu') or
+                    funame.lower().startswith('spr') or
+                    funame.lower().startswith('trap')):
+                    with m.If(fu.busy_o):
+                        comb += busy_o.eq(1)
 
         # return both the function unit "enable" dict as well as the "busy".
         # the "busy-or-issued" can be passed in to the Read/Write port
         # connecters to give them permission to request access to regfiles
         return fu_bitdict, fu_selected
 
-    def connect_rdport(self, m, fu_bitdict, rdpickers, regfile, regname, fspec):
+    def connect_rdport(self, m, fu_bitdict, fu_selected,
+                                rdpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
@@ -419,7 +462,12 @@ class NonProductionCore(ControlBase):
         # for checking if the read port has an outstanding write
         if self.make_hazard_vecs:
             wv = regs.wv[regfile.lower()]
-            wvchk = wv.r_ports["issue"] # write-vec bit-level hazard check
+            wvchk = wv.q_int # write-vec bit-level hazard check
+
+        # if a hazard is detected on this read port, simply blithely block
+        # every FU from reading on it.  this is complete overkill but very
+        # simple for now.
+        hazard_detected = Signal(name="raw_%s_%s" % (regfile, rpidx))
 
         fspecs = fspec
         if not isinstance(fspecs, list):
@@ -430,33 +478,39 @@ class NonProductionCore(ControlBase):
         ppoffs = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, wf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, fspec, len(fuspec))
+            (rf, wf, _read, _write, wid, fuspecs) = \
+                (fspec.rdport, fspec.wrport, fspec.read, fspec.write,
+                 fspec.wid, fspec.specs)
+            print ("fpsec", i, fspec, len(fuspecs))
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
+            pplen += len(fspec.specs)
             name = "rdflag_%s_%s_%d" % (regfile, regname, i)
             rdflag = Signal(name=name, reset_less=True)
-            comb += rdflag.eq(rf)
+            comb += rdflag.eq(fspec.rdport)
             rdflags.append(rdflag)
 
         print ("pplen", pplen)
 
         # create a priority picker to manage this port
         rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
-        setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick)
+        m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick
 
         rens = []
         addrs = []
         wvens = []
 
         for i, fspec in enumerate(fspecs):
-            (rf, wf, _read, _write, wid, fuspec) = fspec
+            (rf, wf, _read, _write, wid, fuspecs) = \
+                (fspec.rdport, fspec.wrport, fspec.read, fspec.write,
+                 fspec.wid, fspec.specs)
             # connect up the FU req/go signals, and the reg-read to the FU
             # and create a Read Broadcast Bus
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
                 pi += ppoffs[i]
                 name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
-                fu_active = fu_bitdict[funame]
+                fu_active = fu_selected[funame]
+                fu_issued = fu_bitdict[funame]
 
                 # get (or set up) a latched copy of read register number
                 rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
@@ -480,10 +534,12 @@ class NonProductionCore(ControlBase):
                 pick = Signal(name="pick_"+name)     # picker input
                 rp = Signal(name="rp_"+name)         # picker output
                 delay_pick = Signal(name="dp_"+name) # read-enable "underway"
+                rhazard = Signal(name="rhaz_"+name)
 
                 # exclude any currently-enabled read-request (mask out active)
+                # entirely block anything hazarded from being picked
                 comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
-                                ~delay_pick)
+                                ~delay_pick & ~rhazard)
                 comb += rdpick.i[pi].eq(pick)
                 comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
 
@@ -514,15 +570,26 @@ class NonProductionCore(ControlBase):
                     continue
 
                 # read the write-hazard bitvector (wv) for any bit that is
-                wvchk_en = Signal(len(wvchk.ren), name="wv_chk_addr_en_"+name)
+                wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name)
                 issue_active = Signal(name="rd_iactive_"+name)
                 # XXX combinatorial loop here
-                #comb += issue_active.eq(self.instr_active & rdflags[i])
+                comb += issue_active.eq(fu_active & rf)
                 with m.If(issue_active):
                     if rfile.unary:
                         comb += wvchk_en.eq(read)
                     else:
                         comb += wvchk_en.eq(1<<read)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~rhazard):
+                        comb += wvchk_en.eq(0)
+
+                # read-hazard is ANDed with (filtered by) what is actually
+                # being requested.
+                comb += rhazard.eq((wvchk & wvchk_en).bool())
+
                 wvens.append(wvchk_en)
 
         # or-reduce the muxed read signals
@@ -540,12 +607,12 @@ class NonProductionCore(ControlBase):
 
         # enable the read bitvectors for this issued instruction
         # and return whether any write-hazard bit is set
-        comb += wvchk.ren.eq(ortreereduce_sig(wvens))
-        hazard_detected = Signal(name="raw_%s_%s" % (regfile, rpidx))
-        comb += hazard_detected.eq(wvchk.o_data.bool())
+        wvchk_and = Signal(len(wvchk), name="wv_chk_"+name)
+        comb += wvchk_and.eq(wvchk & ortreereduce_sig(wvens))
+        comb += hazard_detected.eq(wvchk_and.bool())
         return hazard_detected
 
-    def connect_rdports(self, m, fu_bitdict):
+    def connect_rdports(self, m, fu_bitdict, fu_selected):
         """connect read ports
 
         orders the read regspecs into a dict-of-dicts, by regfile, by
@@ -585,7 +652,8 @@ class NonProductionCore(ControlBase):
             # also return (and collate) hazard detection)
             for (regname, fspec) in sort_fuspecs(fuspecs):
                 print("connect rd", regname, fspec)
-                rh = self.connect_rdport(m, fu_bitdict, rdpickers, regfile,
+                rh = self.connect_rdport(m, fu_bitdict, fu_selected,
+                                       rdpickers, regfile,
                                        regname, fspec)
                 rd_hazard.append(rh)
 
@@ -614,11 +682,14 @@ class NonProductionCore(ControlBase):
         # the hazard)
 
         # the detection of what shall be written to is based
-        # on *issue*
+        # on *issue*.  it is delayed by 1 cycle so that instructions
+        # "addi 5,5,0x2" do not cause combinatorial loops due to
+        # fake-dependency on *themselves*.  this will totally fail
+        # spectacularly when doing multi-issue
         print ("write vector (for regread)", regfile, wvset)
-        wviaddr_en = Signal(len(wvset.wen), name="wv_issue_addr_en_"+name)
+        wviaddr_en = Signal(len(wvset), name="wv_issue_addr_en_"+name)
         issue_active = Signal(name="iactive_"+name)
-        comb += issue_active.eq(fu.issue_i & fu_active & wrflag)
+        sync += issue_active.eq(fu.issue_i & fu_active & wrflag)
         with m.If(issue_active):
             if rfile.unary:
                 comb += wviaddr_en.eq(write)
@@ -628,7 +699,7 @@ class NonProductionCore(ControlBase):
         # deal with write vector clear: this kicks in when the regfile
         # is written to, and clears the corresponding bitvector entry
         print ("write vector", regfile, wvclr)
-        wvaddr_en = Signal(len(wvclr.wen), name="wvaddr_en_"+name)
+        wvaddr_en = Signal(len(wvclr), name="wvaddr_en_"+name)
         if rfile.unary:
             comb += wvaddr_en.eq(addr_en)
         else:
@@ -658,7 +729,8 @@ class NonProductionCore(ControlBase):
 
         return wvaddr_en, wviaddr_en
 
-    def connect_wrport(self, m, fu_bitdict, wrpickers, regfile, regname, fspec):
+    def connect_wrport(self, m, fu_bitdict, fu_selected,
+                                wrpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
@@ -677,8 +749,10 @@ class NonProductionCore(ControlBase):
         # to RAISE the bitvector (set it to 1), which, duh, requires a WRITE
         if self.make_hazard_vecs:
             wv = regs.wv[regfile.lower()]
-            wvset = wv.w_ports["set"] # write-vec bit-level hazard ctrl
-            wvclr = wv.w_ports["clr"] # write-vec bit-level hazard ctrl
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvchk = wv.q # write-after-write hazard check
+            wvchk_qint = wv.q # write-after-write hazard check, NOT delayed
 
         fspecs = fspec
         if not isinstance(fspecs, list):
@@ -691,10 +765,12 @@ class NonProductionCore(ControlBase):
         wrflags = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, wf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, "wrflag", wf, fspec, len(fuspec))
+            (rf, wf, _read, _write, wid, fuspecs) = \
+                (fspec.rdport, fspec.wrport, fspec.read, fspec.write,
+                 fspec.wid, fspec.specs)
+            print ("fpsec", i, "wrflag", wf, fspec, len(fuspecs))
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
+            pplen += len(fuspecs)
 
             name = "%s_%s_%d" % (regfile, regname, i)
             rdflag = Signal(name="rd_flag_"+name)
@@ -712,40 +788,54 @@ class NonProductionCore(ControlBase):
 
         # create a priority picker to manage this port
         wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
-        setattr(m.submodules, "wrpick_%s_%s" % (regfile, rpidx), wrpick)
+        m.submodules["wrpick_%s_%s" % (regfile, rpidx)] = wrpick
 
         wsigs = []
         wens = []
         wvsets = []
         wvseten = []
         wvclren = []
+        #wvens = [] - not needed: reading of writevec is permanently held hi
         addrs = []
         for i, fspec in enumerate(fspecs):
             # connect up the FU req/go signals and the reg-read to the FU
             # these are arbitrated by Data.ok signals
-            (rf, wf, read, _write, wid, fuspec) = fspec
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            (rf, wf, _read, _write, wid, fuspecs) = \
+                (fspec.rdport, fspec.wrport, fspec.read, fspec.write,
+                 fspec.wid, fspec.specs)
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
+                fu_requested = fu_bitdict[funame]
+                pi += ppoffs[i]
+                name = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
                 # get (or set up) a write-latched copy of write register number
-                rname = "%s_%s_%s" % (funame, regfile, regname)
+                write = Signal.like(_write, name="write_"+name)
+                rname = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
                 if rname not in fu.wr_latches:
-                    write = Signal.like(_write, name="wrlatch_"+rname)
+                    wrl = Signal.like(_write, name="wrlatch_"+rname)
                     fu.wr_latches[rname] = write
-                    with m.If(fu.issue_i):
-                        sync += write.eq(_write)
+                    # do not depend on fu.issue_i here, it creates a
+                    # combinatorial loop on waw checking. using the FU
+                    # "enable" bitdict entry for this FU is sufficient,
+                    # because the PowerDecoder2 read/write nums are
+                    # valid continuously when the instruction is valid
+                    with m.If(fu_requested):
+                        sync += wrl.eq(_write)
+                        comb += write.eq(_write)
+                    with m.Else():
+                        comb += write.eq(wrl)
                 else:
                     write = fu.wr_latches[rname]
 
-                pi += ppoffs[i]
-
                 # write-request comes from dest.ok
                 dest = fu.get_out(idx)
                 fu_dest_latch = fu.get_fu_out(idx)  # latched output
-                name = "fu_wrok_%s_%s_%d" % (funame, regname, idx)
-                fu_wrok = Signal(name=name, reset_less=True)
+                name = "%s_%s_%d" % (funame, regname, idx)
+                fu_wrok = Signal(name="fu_wrok_"+name, reset_less=True)
                 comb += fu_wrok.eq(dest.ok & fu.busy_o)
 
                 # connect request-write to picker input, and output to go-wr
-                fu_active = fu_bitdict[funame]
+                fu_active = fu_selected[funame]
                 pick = fu.wr.rel_o[idx] & fu_active
                 comb += wrpick.i[pi].eq(pick)
                 # create a single-pulse go write from the picker output
@@ -783,7 +873,44 @@ class NonProductionCore(ControlBase):
                 wvaddr_en, wv_issue_en = res
                 wvclren.append(wvaddr_en)   # set only: no data => clear bit
                 wvseten.append(wv_issue_en) # set data same as enable
-                wvsets.append(wv_issue_en)  # because enable needs a 1
+
+                # read the write-hazard bitvector (wv) for any bit that is
+                fu_requested = fu_bitdict[funame]
+                wvchk_en = Signal(len(wvchk), name="waw_chk_addr_en_"+name)
+                issue_active = Signal(name="waw_iactive_"+name)
+                whazard = Signal(name="whaz_"+name)
+                if wf is None:
+                    # XXX EEK! STATE regfile (branch) does not have an
+                    # write-active indicator in regspec_decode_write()
+                    print ("XXX FIXME waw_iactive", issue_active,
+                                                    fu_requested, wf)
+                else:
+                    # check bits from the incoming instruction.  note (back
+                    # in connect_instruction) that the decoder is held for
+                    # us to be able to do this, here... *without* issue being
+                    # held HI.  we MUST NOT gate this with fu.issue_i or
+                    # with fu_bitdict "enable": it would create a loop
+                    comb += issue_active.eq(wf)
+                with m.If(issue_active):
+                    if rfile.unary:
+                        comb += wvchk_en.eq(write)
+                    else:
+                        comb += wvchk_en.eq(1<<write)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~whazard):
+                        comb += wvchk_en.eq(0)
+
+                # write-hazard is ANDed with (filtered by) what is actually
+                # being requested.  the wvchk data is on a one-clock delay,
+                # and wvchk_en comes directly from the main decoder
+                comb += whazard.eq((wvchk_qint & wvchk_en).bool())
+                with m.If(whazard):
+                    comb += fu._waw_hazard.eq(1)
+
+                #wvens.append(wvchk_en)
 
         # here is where we create the Write Broadcast Bus. simple, eh?
         comb += wport.i_data.eq(ortreereduce_sig(wsigs))
@@ -796,14 +923,21 @@ class NonProductionCore(ControlBase):
             comb += wport.wen.eq(ortreereduce_sig(wens))
 
         if not self.make_hazard_vecs:
-            return
-
-        # for write-vectors
-        comb += wvclr.wen.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
-        comb += wvset.wen.eq(ortreereduce_sig(wvseten)) # set (issue time)
-        comb += wvset.i_data.eq(ortreereduce_sig(wvsets))
-
-    def connect_wrports(self, m, fu_bitdict):
+            return [], []
+
+        # return these here rather than set wvclr/wvset directly,
+        # because there may be more than one write-port to a given
+        # regfile.  example: XER has a write-port for SO, CA, and OV
+        # and the *last one added* of those would overwrite the other
+        # two.  solution: have connect_wrports collate all the
+        # or-tree-reduced bitvector set/clear requests and drop them
+        # in as a single "thing".  this can only be done because the
+        # set/get is an unary bitvector.
+        print ("make write-vecs", regfile, regname, wvset, wvclr)
+        return (ortreereduce_sig(wvclren), # clear (regfile write)
+                ortreereduce_sig(wvseten)) # set (issue time)
+
+    def connect_wrports(self, m, fu_bitdict, fu_selected):
         """connect write ports
 
         orders the write regspecs into a dict-of-dicts, by regfile,
@@ -823,6 +957,8 @@ class NonProductionCore(ControlBase):
         # same for write ports.
         # BLECH!  complex code-duplication! BLECH!
         wrpickers = {}
+        wvclrers = defaultdict(list)
+        wvseters = defaultdict(list)
         for regfile, spec in byregfiles_wr.items():
             fuspecs = byregfiles_wrspec[regfile]
             wrpickers[regfile] = {}
@@ -839,9 +975,33 @@ class NonProductionCore(ControlBase):
                     if 'fast3' in fuspecs:
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
+            # collate these and record them by regfile because there
+            # are sometimes more write-ports per regfile
             for (regname, fspec) in sort_fuspecs(fuspecs):
-                self.connect_wrport(m, fu_bitdict, wrpickers,
+                wvclren, wvseten = self.connect_wrport(m,
+                                        fu_bitdict, fu_selected,
+                                        wrpickers,
                                         regfile, regname, fspec)
+                wvclrers[regfile.lower()].append(wvclren)
+                wvseters[regfile.lower()].append(wvseten)
+
+        if not self.make_hazard_vecs:
+            return
+
+        # for write-vectors: reduce the clr-ers and set-ers down to
+        # a single set of bits.  otherwise if there are two write
+        # ports (on some regfiles), the last one doing comb += on
+        # the reg.wv[regfile] instance "wins" (and all others are ignored,
+        # whoops).  if there was only one write-port per wv regfile this would
+        # not be an issue.
+        for regfile in wvclrers.keys():
+            wv = regs.wv[regfile]
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvclren = wvclrers[regfile]
+            wvseten = wvseters[regfile]
+            comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+            comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
 
     def get_byregfiles(self, readmode):
 
@@ -850,9 +1010,9 @@ class NonProductionCore(ControlBase):
         fus = self.fus.fus
         e = self.ireg.e # decoded instruction to execute
 
-        # dictionary of dictionaries of lists of regfile ports.
+        # dictionary of dictionaries of lists/tuples of regfile ports.
         # first key: regfile.  second key: regfile port name
-        byregfiles = defaultdict(dict)
+        byregfiles = defaultdict(lambda: defaultdict(list))
         byregfiles_spec = defaultdict(dict)
 
         for (funame, fu) in fus.items():
@@ -878,22 +1038,20 @@ class NonProductionCore(ControlBase):
                 # the PowerDecoder2 (main one, not the satellites) contains
                 # the decoded regfile numbers. obtain these now
                 if readmode:
-                    rdflag, read = regspec_decode_read(e, regfile, regname)
+                    rdport, read = regspec_decode_read(e, regfile, regname)
                     wrport, write = None, None
                 else:
-                    rdflag, read = None, None
+                    rdport, read = None, None
                     wrport, write = regspec_decode_write(e, regfile, regname)
 
                 # construct the dictionary of regspec information by regfile
                 if regname not in byregfiles_spec[regfile]:
                     byregfiles_spec[regfile][regname] = \
-                        (rdflag, wrport, read, write, wid, [])
+                        ByRegSpec(rdport, wrport, read, write, wid, [])
                 # here we start to create "lanes"
-                if idx not in byregfiles[regfile]:
-                    byregfiles[regfile][idx] = []
-                fuspec = (funame, fu, idx)
+                fuspec = FUSpec(funame, fu, idx)
                 byregfiles[regfile][idx].append(fuspec)
-                byregfiles_spec[regfile][regname][5].append(fuspec)
+                byregfiles_spec[regfile][regname].specs.append(fuspec)
 
                 continue
                 # append a latch Signal to the FU's list of latches
@@ -912,10 +1070,10 @@ class NonProductionCore(ControlBase):
             print("regfile %s ports:" % mode, regfile)
             fuspecs = byregfiles_spec[regfile]
             for regname, fspec in fuspecs.items():
-                [rdflag, wrflag, read, write, wid, fuspec] = fspec
+                [rdport, wrport, read, write, wid, fuspecs] = fspec
                 print("  rf %s port %s lane: %s" % (mode, regfile, regname))
-                print("  %s" % regname, wid, read, write, rdflag, wrflag)
-                for (funame, fu, idx) in fuspec:
+                print("  %s" % regname, wid, read, write, rdport, wrport)
+                for (funame, fu, idx) in fuspecs:
                     fusig = fu.src_i[idx] if readmode else fu.dest[idx]
                     print("    ", funame, fu.__class__.__name__, idx, fusig)
                     print()
@@ -936,6 +1094,7 @@ if __name__ == '__main__':
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
                          addr_wid=48,
+                         allow_overlap=True,
                          mask_wid=8,
                          reg_wid=64)
     dut = NonProductionCore(pspec)