local variable rename in FetchFSM
[soc.git] / src / soc / simple / core.py
index 3b541541de3e6c2568f9c5e3f0b1fb790143411e..9f3a114136b74284c503e1858b1d34e06800ea93 100644 (file)
@@ -19,7 +19,8 @@ and consequently it is safer to wait for the Function Unit to complete
 before allowing a new instruction to proceed.
 """
 
-from nmigen import Elaboratable, Module, Signal, ResetSignal, Cat, Mux
+from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
+                    Const)
 from nmigen.cli import rtlil
 
 from openpower.decoder.power_decoder2 import PowerDecodeSubset
@@ -31,7 +32,7 @@ from nmutil.picker import PriorityPicker
 from nmutil.util import treereduce
 from nmutil.singlepipe import ControlBase
 
-from soc.fu.compunits.compunits import AllFunctionUnits
+from soc.fu.compunits.compunits import AllFunctionUnits, LDSTFunctionUnit
 from soc.regfile.regfiles import RegFiles
 from openpower.decoder.power_decoder2 import get_rdflags
 from soc.experiment.l0_cache import TstL0CacheBuffer  # test only
@@ -80,6 +81,12 @@ class NonProductionCore(ControlBase):
         self.regreduce_en = (hasattr(pspec, "regreduce") and
                              (pspec.regreduce == True))
 
+        # test to see if overlapping of instructions is allowed
+        # (not normally enabled for TestIssuer FSM but useful for checking
+        # the bitvector hazard detection, before doing In-Order)
+        self.allow_overlap = (hasattr(pspec, "allow_overlap") and
+                             (pspec.allow_overlap == True))
+
         # test core type
         self.make_hazard_vecs = True
         self.core_type = "fsm"
@@ -110,9 +117,12 @@ class NonProductionCore(ControlBase):
         # set up input and output: unusual requirement to set data directly
         # (due to the way that the core is set up in a different domain,
         # see TestIssuer.setup_peripherals
-        self.i, self.o = self.new_specs(None)
+        self.p.i_data, self.n.o_data = self.new_specs(None)
         self.i, self.o = self.p.i_data, self.n.o_data
 
+        # actual internal input data used (captured)
+        self.ireg = self.ispec()
+
         # create per-FU instruction decoders (subsetted).  these "satellite"
         # decoders reduce wire fan-out from the one (main) PowerDecoder2
         # (used directly by the trap unit) to the *twelve* (or more)
@@ -132,7 +142,7 @@ class NonProductionCore(ControlBase):
                 continue
             self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
                                                       final=True,
-                                                      state=self.i.state,
+                                                      state=self.ireg.state,
                                             svp64_en=self.svp64_en,
                                             regreduce_en=self.regreduce_en)
             self.des[funame] = self.decoders[funame].do
@@ -172,12 +182,15 @@ class NonProductionCore(ControlBase):
         self.connect_satellite_decoders(m)
 
         # ssh, cheat: trap uses the main decoder because of the rewriting
-        self.des[self.trapunit] = self.i.e.do
+        self.des[self.trapunit] = self.ireg.e.do
 
-        # connect up Function Units, then read/write ports
+        # connect up Function Units, then read/write ports, and hazard conflict
+        self.issue_conflict = Signal()
         fu_bitdict, fu_selected = self.connect_instruction(m)
-        self.connect_rdports(m, fu_selected)
-        self.connect_wrports(m, fu_selected)
+        raw_hazard = self.connect_rdports(m, fu_bitdict, fu_selected)
+        self.connect_wrports(m, fu_bitdict, fu_selected)
+        if self.allow_overlap:
+            comb += self.issue_conflict.eq(raw_hazard)
 
         # note if an exception happened.  in a pipelined or OoO design
         # this needs to be accompanied by "shadowing" (or stalling)
@@ -196,22 +209,22 @@ class NonProductionCore(ControlBase):
             # as subset decoders this massively reduces wire fanout given
             # the large number of ALUs
             setattr(m.submodules, "dec_%s" % v.fn_name, v)
-            comb += v.dec.raw_opcode_in.eq(self.i.raw_insn_i)
-            comb += v.dec.bigendian.eq(self.i.bigendian_i)
+            comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
+            comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
             # sigh due to SVP64 RA_OR_ZERO detection connect these too
-            comb += v.sv_a_nz.eq(self.i.sv_a_nz)
+            comb += v.sv_a_nz.eq(self.ireg.sv_a_nz)
             if self.svp64_en:
-                comb += v.pred_sm.eq(self.i.sv_pred_sm)
-                comb += v.pred_dm.eq(self.i.sv_pred_dm)
+                comb += v.pred_sm.eq(self.ireg.sv_pred_sm)
+                comb += v.pred_dm.eq(self.ireg.sv_pred_dm)
                 if k != self.trapunit:
-                    comb += v.sv_rm.eq(self.i.sv_rm) # pass through SVP64 ReMap
-                    comb += v.is_svp64_mode.eq(self.i.is_svp64_mode)
+                    comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM
+                    comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode)
                     # only the LDST PowerDecodeSubset *actually* needs to
                     # know to use the alternative decoder.  this is all
                     # a terrible hack
                     if k.lower().startswith("ldst"):
                         comb += v.use_svp64_ldst_dec.eq(
-                                        self.i.use_svp64_ldst_dec)
+                                        self.ireg.use_svp64_ldst_dec)
 
     def connect_instruction(self, m):
         """connect_instruction
@@ -230,6 +243,14 @@ class NonProductionCore(ControlBase):
 
         # indicate if core is busy
         busy_o = self.o.busy_o
+        any_busy_o = self.o.any_busy_o
+
+        # connect up temporary copy of incoming instruction. the FSM will
+        # either blat the incoming instruction (if valid) into self.ireg
+        # or if the instruction could not be delivered, keep dropping the
+        # latched copy into ireg
+        ilatch = self.ispec()
+        self.instr_active = Signal()
 
         # enable/busy-signals for each FU, get one bit for each FU (by name)
         fu_enable = Signal(len(fus), reset_less=True)
@@ -271,8 +292,9 @@ class NonProductionCore(ControlBase):
                 # instruction.
                 fnunit = fu.fnunit.value
                 en_req = Signal(name="issue_en_%s" % funame, reset_less=True)
-                fnmatch = (self.i.e.do.fn_unit & fnunit).bool()
-                comb += en_req.eq(fnmatch & ~fu.busy_o & self.p.i_valid)
+                fnmatch = (self.ireg.e.do.fn_unit & fnunit).bool()
+                comb += en_req.eq(fnmatch & ~fu.busy_o &
+                                    self.instr_active)
                 i_l.append(en_req) # store in list for doing the Cat-trick
                 # picker output, gated by enable: store in fu_bitdict
                 po = Signal(name="o_issue_pick_"+funame) # picker output
@@ -282,63 +304,108 @@ class NonProductionCore(ControlBase):
                 # if we don't do this, then when there are no FUs available,
                 # the "p.o_ready" signal will go back "ok we accepted this
                 # instruction" which of course isn't true.
-                comb += fu_found.eq(~fnmatch | i_pp.en_o)
+                with m.If(i_pp.en_o):
+                    comb += fu_found.eq(1)
             # for each input, Cat them together and drop them into the picker
             comb += i_pp.i.eq(Cat(*i_l))
 
+        # rdmask, which is for registers needs to come from the *main* decoder
+        for funame, fu in fus.items():
+            rdmask = get_rdflags(self.ireg.e, fu)
+            comb += fu.rdmaskn.eq(~rdmask)
+
         # sigh - need a NOP counter
         counter = Signal(2)
         with m.If(counter != 0):
             sync += counter.eq(counter - 1)
             comb += busy_o.eq(1)
 
-        with m.If(self.p.i_valid): # run only when valid
-            with m.Switch(self.i.e.do.insn_type):
-                # check for ATTN: halt if true
-                with m.Case(MicrOp.OP_ATTN):
-                    m.d.sync += self.o.core_terminate_o.eq(1)
-
-                # fake NOP - this isn't really used (Issuer detects NOP)
-                with m.Case(MicrOp.OP_NOP):
-                    sync += counter.eq(2)
-                    comb += busy_o.eq(1)
-
-                with m.Default():
-                    # connect up instructions.  only one enabled at a time
+        # default to reading from incoming instruction: may be overridden
+        # by copy from latch when "waiting"
+        comb += self.ireg.eq(self.i)
+        # always say "ready" except if overridden
+        comb += self.p.o_ready.eq(1)
+
+        with m.FSM():
+            with m.State("READY"):
+                with m.If(self.p.i_valid): # run only when valid
+                    with m.Switch(self.ireg.e.do.insn_type):
+                        # check for ATTN: halt if true
+                        with m.Case(MicrOp.OP_ATTN):
+                            m.d.sync += self.o.core_terminate_o.eq(1)
+
+                        # fake NOP - this isn't really used (Issuer detects NOP)
+                        with m.Case(MicrOp.OP_NOP):
+                            sync += counter.eq(2)
+                            comb += busy_o.eq(1)
+
+                        with m.Default():
+                            comb += self.instr_active.eq(1)
+                            comb += self.p.o_ready.eq(0)
+                            # connect instructions. only one enabled at a time
+                            for funame, fu in fus.items():
+                                do = self.des[funame]
+                                enable = fu_bitdict[funame]
+
+                                # run this FunctionUnit if enabled route op,
+                                # issue, busy, read flags and mask to FU
+                                with m.If(enable):
+                                    # operand comes from the *local*  decoder
+                                    comb += fu.oper_i.eq_from(do)
+                                    comb += fu.issue_i.eq(1) # issue when valid
+                                    # instruction ok, indicate ready
+                                    comb += self.p.o_ready.eq(1)
+
+                            if self.allow_overlap:
+                                with m.If(~fu_found):
+                                    # latch copy of instruction
+                                    sync += ilatch.eq(self.i)
+                                    comb += self.p.o_ready.eq(1) # accept
+                                    comb += busy_o.eq(1)
+                                    m.next = "WAITING"
+
+            with m.State("WAITING"):
+                comb += self.instr_active.eq(1)
+                comb += self.p.o_ready.eq(0)
+                comb += busy_o.eq(1)
+                # using copy of instruction, keep waiting until an FU is free
+                comb += self.ireg.eq(ilatch)
+                with m.If(fu_found): # wait for conflict to clear
+                    # connect instructions. only one enabled at a time
                     for funame, fu in fus.items():
                         do = self.des[funame]
                         enable = fu_bitdict[funame]
 
-                        # run this FunctionUnit if enabled
-                        # route op, issue, busy, read flags and mask to FU
+                        # run this FunctionUnit if enabled route op,
+                        # issue, busy, read flags and mask to FU
                         with m.If(enable):
                             # operand comes from the *local*  decoder
                             comb += fu.oper_i.eq_from(do)
-                            comb += fu.issue_i.eq(1) # issue when input valid
-                            # rdmask, which is for registers, needs to come
-                            # from the *main* decoder
-                            rdmask = get_rdflags(self.i.e, fu)
-                            comb += fu.rdmaskn.eq(~rdmask)
+                            comb += fu.issue_i.eq(1) # issue when valid
+                            comb += self.p.o_ready.eq(1)
+                            comb += busy_o.eq(0)
+                            m.next = "READY"
 
-        # if instruction is busy, set busy output for core.
+        print ("core: overlap allowed", self.allow_overlap)
         busys = map(lambda fu: fu.busy_o, fus.values())
-        comb += busy_o.eq(Cat(*busys).bool())
-
-        # ready/valid signalling.  if busy, means refuse incoming issue.
-        # (this is a global signal, TODO, change to one which allows
-        # overlapping instructions)
-        # also, if there was no fu found we must not send back a valid
-        # indicator.  BUT, of course, when there is no instruction
-        # we must ignore the fu_found flag, otherwise o_ready will never
-        # be set when everything is idle
-        comb += self.p.o_ready.eq(fu_found | ~self.p.i_valid)
+        comb += any_busy_o.eq(Cat(*busys).bool())
+        if not self.allow_overlap:
+            # for simple non-overlap, if any instruction is busy, set
+            # busy output for core.
+            comb += busy_o.eq(any_busy_o)
+        else:
+            # sigh deal with a fun situation that needs to be investigated
+            # and resolved
+            with m.If(self.issue_conflict):
+                comb += busy_o.eq(1)
 
         # return both the function unit "enable" dict as well as the "busy".
         # the "busy-or-issued" can be passed in to the Read/Write port
         # connecters to give them permission to request access to regfiles
         return fu_bitdict, fu_selected
 
-    def connect_rdport(self, m, fu_bitdict, rdpickers, regfile, regname, fspec):
+    def connect_rdport(self, m, fu_bitdict, fu_selected,
+                                rdpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
@@ -351,13 +418,22 @@ class NonProductionCore(ControlBase):
         print("read regfile", rpidx, regfile, regs.rf.keys(),
                               rfile, rfile.unary)
 
+        # for checking if the read port has an outstanding write
+        if self.make_hazard_vecs:
+            wv = regs.wv[regfile.lower()]
+            wvchk = wv.r_ports["issue"] # write-vec bit-level hazard check
+
+        # if a hazard is detected on this read port, simply blithely block
+        # every FU from reading on it.  this is complete overkill but very
+        # simple for now.
+        hazard_detected = Signal(name="raw_%s_%s" % (regfile, rpidx))
+
         fspecs = fspec
         if not isinstance(fspecs, list):
             fspecs = [fspecs]
 
         rdflags = []
         pplen = 0
-        reads = []
         ppoffs = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
@@ -369,7 +445,6 @@ class NonProductionCore(ControlBase):
             rdflag = Signal(name=name, reset_less=True)
             comb += rdflag.eq(rf)
             rdflags.append(rdflag)
-            reads.append(read)
 
         print ("pplen", pplen)
 
@@ -382,30 +457,50 @@ class NonProductionCore(ControlBase):
         wvens = []
 
         for i, fspec in enumerate(fspecs):
-            (rf, wf, read, write, wid, fuspec) = fspec
+            (rf, wf, _read, _write, wid, fuspec) = fspec
             # connect up the FU req/go signals, and the reg-read to the FU
             # and create a Read Broadcast Bus
             for pi, (funame, fu, idx) in enumerate(fuspec):
                 pi += ppoffs[i]
+                name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
+                fu_active = fu_selected[funame]
+                fu_issued = fu_bitdict[funame]
+
+                # get (or set up) a latched copy of read register number
+                rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
+                read = Signal.like(_read, name="read_"+name)
+                if rname not in fu.rd_latches:
+                    rdl = Signal.like(_read, name="rdlatch_"+rname)
+                    fu.rd_latches[rname] = rdl
+                    with m.If(fu.issue_i):
+                        sync += rdl.eq(_read)
+                else:
+                    rdl = fu.rd_latches[rname]
+                # latch to make the read immediately available on issue cycle
+                # after the read cycle, use the latched copy
+                with m.If(fu.issue_i):
+                    comb += read.eq(_read)
+                with m.Else():
+                    comb += read.eq(rdl)
 
                 # connect request-read to picker input, and output to go-rd
-                fu_active = fu_bitdict[funame]
-                name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
-                addr_en = Signal.like(reads[i], name="addr_en_"+name)
+                addr_en = Signal.like(read, name="addr_en_"+name)
                 pick = Signal(name="pick_"+name)     # picker input
                 rp = Signal(name="rp_"+name)         # picker output
                 delay_pick = Signal(name="dp_"+name) # read-enable "underway"
+                rhazard = Signal(name="rhaz_"+name)
 
                 # exclude any currently-enabled read-request (mask out active)
                 comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
-                                ~delay_pick)
+                                ~delay_pick & ~rhazard)
+                # entirely block anything hazarded from being picked
                 comb += rdpick.i[pi].eq(pick)
                 comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
 
                 # if picked, select read-port "reg select" number to port
                 comb += rp.eq(rdpick.o[pi] & rdpick.en_o)
                 sync += delay_pick.eq(rp) # delayed "pick"
-                comb += addr_en.eq(Mux(rp, reads[i], 0))
+                comb += addr_en.eq(Mux(rp, read, 0))
 
                 # the read-enable happens combinatorially (see mux-bus below)
                 # but it results in the data coming out on a one-cycle delay.
@@ -425,6 +520,29 @@ class NonProductionCore(ControlBase):
                     # all FUs connect to same port
                     comb += src.eq(rport.o_data)
 
+                if not self.make_hazard_vecs:
+                    continue
+
+                # read the write-hazard bitvector (wv) for any bit that is
+                wvchk_en = Signal(len(wvchk.ren), name="wv_chk_addr_en_"+name)
+                issue_active = Signal(name="rd_iactive_"+name)
+                # XXX combinatorial loop here
+                comb += issue_active.eq(fu_active & rf)
+                with m.If(issue_active):
+                    if rfile.unary:
+                        comb += wvchk_en.eq(read)
+                    else:
+                        comb += wvchk_en.eq(1<<read)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards)
+                with m.If(fu.busy_o & ~rhazard):
+                        comb += wvchk_en.eq(0)
+
+                comb += rhazard.eq((wvchk.o_data & wvchk_en).bool())
+
+                wvens.append(wvchk_en)
+
         # or-reduce the muxed read signals
         if rfile.unary:
             # for unary-addressed
@@ -435,7 +553,16 @@ class NonProductionCore(ControlBase):
             comb += rport.ren.eq(Cat(*rens).bool())
             print ("binary", regfile, rpidx, rport, rport.ren, rens, addrs)
 
-    def connect_rdports(self, m, fu_bitdict):
+        if not self.make_hazard_vecs:
+            return Const(0) # declare "no hazards"
+
+        # enable the read bitvectors for this issued instruction
+        # and return whether any write-hazard bit is set
+        comb += wvchk.ren.eq(ortreereduce_sig(wvens))
+        comb += hazard_detected.eq(wvchk.o_data.bool())
+        return hazard_detected
+
+    def connect_rdports(self, m, fu_bitdict, fu_selected):
         """connect read ports
 
         orders the read regspecs into a dict-of-dicts, by regfile, by
@@ -445,6 +572,7 @@ class NonProductionCore(ControlBase):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
+        rd_hazard = []
 
         # dictionary of lists of regfile read ports
         byregfiles_rd, byregfiles_rdspec = self.get_byregfiles(True)
@@ -471,24 +599,98 @@ class NonProductionCore(ControlBase):
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
             # for each named regfile port, connect up all FUs to that port
+            # also return (and collate) hazard detection)
             for (regname, fspec) in sort_fuspecs(fuspecs):
                 print("connect rd", regname, fspec)
-                self.connect_rdport(m, fu_bitdict, rdpickers, regfile,
+                rh = self.connect_rdport(m, fu_bitdict, fu_selected,
+                                       rdpickers, regfile,
                                        regname, fspec)
+                rd_hazard.append(rh)
+
+        return Cat(*rd_hazard).bool()
 
-    def connect_wrport(self, m, fu_bitdict, wrpickers, regfile, regname, fspec):
+    def make_hazards(self, m, regfile, rfile, wvclr, wvset,
+                    funame, regname, idx,
+                    addr_en, wp, fu, fu_active, wrflag, write,
+                    fu_wrok):
+        """make_hazards: a setter and a clearer for the regfile write ports
+
+        setter is at issue time (using PowerDecoder2 regfile write numbers)
+        clearer is at regfile write time (when FU has said what to write to)
+
+        there is *one* unusual case here which has to be dealt with:
+        when the Function Unit does *NOT* request a write to the regfile
+        (has its data.ok bit CLEARED).  this is perfectly legitimate.
+        and a royal pain.
+        """
+        comb, sync = m.d.comb, m.d.sync
+        name = "%s_%s_%d" % (funame, regname, idx)
+
+        # connect up the bitvector write hazard.  unlike the
+        # regfile writeports, a ONE must be written to the corresponding
+        # bit of the hazard bitvector (to indicate the existence of
+        # the hazard)
+
+        # the detection of what shall be written to is based
+        # on *issue*
+        print ("write vector (for regread)", regfile, wvset)
+        wviaddr_en = Signal(len(wvset.wen), name="wv_issue_addr_en_"+name)
+        issue_active = Signal(name="iactive_"+name)
+        comb += issue_active.eq(fu.issue_i & fu_active & wrflag)
+        with m.If(issue_active):
+            if rfile.unary:
+                comb += wviaddr_en.eq(write)
+            else:
+                comb += wviaddr_en.eq(1<<write)
+
+        # deal with write vector clear: this kicks in when the regfile
+        # is written to, and clears the corresponding bitvector entry
+        print ("write vector", regfile, wvclr)
+        wvaddr_en = Signal(len(wvclr.wen), name="wvaddr_en_"+name)
+        if rfile.unary:
+            comb += wvaddr_en.eq(addr_en)
+        else:
+            with m.If(wp):
+                comb += wvaddr_en.eq(1<<addr_en)
+
+        # XXX ASSUME that LDSTFunctionUnit always sets the data it intends to
+        # this may NOT be the case when an exception occurs
+        if isinstance(fu, LDSTFunctionUnit):
+            return wvaddr_en, wviaddr_en
+
+        # okaaay, this is preparation for the awkward case.
+        # * latch a copy of wrflag when issue goes high.
+        # * when the fu_wrok (data.ok) flag is NOT set,
+        #   but the FU is done, the FU is NEVER going to write
+        #   so the bitvector has to be cleared.
+        latch_wrflag = Signal(name="latch_wrflag_"+name)
+        with m.If(~fu.busy_o):
+            sync += latch_wrflag.eq(0)
+        with m.If(fu.issue_i & fu_active):
+            sync += latch_wrflag.eq(wrflag)
+        with m.If(fu.alu_done_o & latch_wrflag & ~fu_wrok):
+            if rfile.unary:
+                comb += wvaddr_en.eq(write) # addr_en gated with wp, don't use
+            else:
+                comb += wvaddr_en.eq(1<<addr_en) # binary addr_en not gated
+
+        return wvaddr_en, wviaddr_en
+
+    def connect_wrport(self, m, fu_bitdict, fu_selected,
+                                wrpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
 
-        print("connect wr", regname, fspec)
         rpidx = regname
 
         # select the required write port.  these are pre-defined sizes
-        print(regfile, regs.rf.keys())
         rfile = regs.rf[regfile.lower()]
         wport = rfile.w_ports[rpidx]
 
+        print("connect wr", regname, "unary", rfile.unary, fspec)
+        print(regfile, regs.rf.keys())
+
         # select the write-protection hazard vector.  note that this still
         # requires to WRITE to the hazard bitvector!  read-requests need
         # to RAISE the bitvector (set it to 1), which, duh, requires a WRITE
@@ -540,20 +742,34 @@ class NonProductionCore(ControlBase):
         for i, fspec in enumerate(fspecs):
             # connect up the FU req/go signals and the reg-read to the FU
             # these are arbitrated by Data.ok signals
-            (rf, wf, read, write, wid, fuspec) = fspec
+            (rf, wf, read, _write, wid, fuspec) = fspec
             for pi, (funame, fu, idx) in enumerate(fuspec):
                 pi += ppoffs[i]
+                name = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
+                # get (or set up) a write-latched copy of write register number
+                write = Signal.like(_write, name="write_"+name)
+                rname = "%s_%s_%s" % (funame, regfile, regname)
+                if rname not in fu.wr_latches:
+                    wrl = Signal.like(_write, name="wrlatch_"+rname)
+                    fu.wr_latches[rname] = write
+                    with m.If(fu.issue_i):
+                        sync += wrl.eq(_write)
+                        comb += write.eq(_write)
+                    with m.Else():
+                        comb += write.eq(wrl)
+                else:
+                    write = fu.wr_latches[rname]
 
                 # write-request comes from dest.ok
                 dest = fu.get_out(idx)
                 fu_dest_latch = fu.get_fu_out(idx)  # latched output
-                name = "wrflag_%s_%s_%d" % (funame, regname, idx)
-                wrflag = Signal(name=name, reset_less=True)
-                comb += wrflag.eq(dest.ok & fu.busy_o)
+                name = "fu_wrok_%s_%s_%d" % (funame, regname, idx)
+                fu_wrok = Signal(name=name, reset_less=True)
+                comb += fu_wrok.eq(dest.ok & fu.busy_o)
 
                 # connect request-write to picker input, and output to go-wr
-                fu_active = fu_bitdict[funame]
-                pick = fu.wr.rel_o[idx] & fu_active  # & wrflag
+                fu_active = fu_selected[funame]
+                pick = fu.wr.rel_o[idx] & fu_active
                 comb += wrpick.i[pi].eq(pick)
                 # create a single-pulse go write from the picker output
                 wr_pick = Signal(name="wpick_%s_%s_%d" % (funame, regname, idx))
@@ -563,7 +779,8 @@ class NonProductionCore(ControlBase):
                 # connect the regspec write "reg select" number to this port
                 # only if one FU actually requests (and is granted) the port
                 # will the write-enable be activated
-                addr_en = Signal.like(write)
+                wname = "waddr_en_%s_%s_%d" % (funame, regname, idx)
+                addr_en = Signal.like(write, name=wname)
                 wp = Signal()
                 comb += wp.eq(wr_pick & wrpick.en_o)
                 comb += addr_en.eq(Mux(wp, write, 0))
@@ -582,35 +799,14 @@ class NonProductionCore(ControlBase):
                 # now connect up the bitvector write hazard
                 if not self.make_hazard_vecs:
                     continue
-                print ("write vector", regfile, wvclr)
-                wname = "wvaddr_en_%s_%s_%d" % (funame, regname, idx)
-                wvaddr_en = Signal(len(wvclr.wen), name=wname)
-                if rfile.unary:
-                    comb += wvaddr_en.eq(addr_en)
-                else:
-                    with m.If(wp):
-                        comb += wvaddr_en.eq(1<<addr_en)
-                wvclren.append(wvaddr_en)
-
-                continue
-                # now connect up the bitvector write hazard.  unlike the
-                # regfile writeports, a ONE must be written to the corresponding
-                # bit of the hazard bitvector (to indicate the existence of
-                # the hazard)
-
-                # the detection of what shall be written to is based
-                # on *issue*
-                print ("write vector (for regread)", regfile, wvset)
-                wname = "wv_issue_addr_en_%s_%s_%d" % (funame, regname, idx)
-                wvaddr_en = Signal(len(wvset.wen), name=wname)
-                issue_active = Signal(name="iactive_"+name)
-                comb += issue_active.eq(fu.issue_i & fu_active & wrflags[i])
-                with m.If(issue_active):
-                    if rfile.unary:
-                        comb += wvaddr_en.eq(addr_en)
-                    else:
-                        comb += wvaddr_en.eq(1<<addr_en)
-                    wvsetens.append(wvaddr_en)
+                res = self.make_hazards(m, regfile, rfile, wvclr, wvset,
+                                        funame, regname, idx,
+                                        addr_en, wp, fu, fu_active,
+                                        wrflags[i], write, fu_wrok)
+                wvaddr_en, wv_issue_en = res
+                wvclren.append(wvaddr_en)   # set only: no data => clear bit
+                wvseten.append(wv_issue_en) # set data same as enable
+                wvsets.append(wv_issue_en)  # because enable needs a 1
 
         # here is where we create the Write Broadcast Bus. simple, eh?
         comb += wport.i_data.eq(ortreereduce_sig(wsigs))
@@ -622,12 +818,15 @@ class NonProductionCore(ControlBase):
             comb += wport.addr.eq(ortreereduce_sig(addrs))
             comb += wport.wen.eq(ortreereduce_sig(wens))
 
+        if not self.make_hazard_vecs:
+            return
+
         # for write-vectors
-        comb += wvclr.wen.eq(ortreereduce_sig(wvclren))
-        #comb += wvset.wen.eq(ortreereduce_sig(wvens))
-        #comb += wvset.i_data.eq(ortreereduce_sig(wvsets))
+        comb += wvclr.wen.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+        comb += wvset.wen.eq(ortreereduce_sig(wvseten)) # set (issue time)
+        comb += wvset.i_data.eq(ortreereduce_sig(wvsets))
 
-    def connect_wrports(self, m, fu_bitdict):
+    def connect_wrports(self, m, fu_bitdict, fu_selected):
         """connect write ports
 
         orders the write regspecs into a dict-of-dicts, by regfile,
@@ -664,7 +863,7 @@ class NonProductionCore(ControlBase):
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
             for (regname, fspec) in sort_fuspecs(fuspecs):
-                self.connect_wrport(m, fu_bitdict, wrpickers,
+                self.connect_wrport(m, fu_bitdict, fu_selected, wrpickers,
                                         regfile, regname, fspec)
 
     def get_byregfiles(self, readmode):
@@ -672,28 +871,43 @@ class NonProductionCore(ControlBase):
         mode = "read" if readmode else "write"
         regs = self.regs
         fus = self.fus.fus
-        e = self.i.e # decoded instruction to execute
+        e = self.ireg.e # decoded instruction to execute
+
+        # dictionary of dictionaries of lists of regfile ports.
+        # first key: regfile.  second key: regfile port name
+        byregfiles = defaultdict(dict)
+        byregfiles_spec = defaultdict(dict)
 
-        # dictionary of lists of regfile ports
-        byregfiles = {}
-        byregfiles_spec = {}
         for (funame, fu) in fus.items():
+            # create in each FU a receptacle for the read/write register
+            # hazard numbers.  to be latched in connect_rd/write_ports
+            # XXX better that this is moved into the actual FUs, but
+            # the issue there is that this function is actually better
+            # suited at the moment
+            if readmode:
+                fu.rd_latches = {}
+            else:
+                fu.wr_latches = {}
+
             print("%s ports for %s" % (mode, funame))
             for idx in range(fu.n_src if readmode else fu.n_dst):
+                # construct regfile specs: read uses inspec, write outspec
                 if readmode:
                     (regfile, regname, wid) = fu.get_in_spec(idx)
                 else:
                     (regfile, regname, wid) = fu.get_out_spec(idx)
                 print("    %d %s %s %s" % (idx, regfile, regname, str(wid)))
+
+                # the PowerDecoder2 (main one, not the satellites) contains
+                # the decoded regfile numbers. obtain these now
                 if readmode:
                     rdflag, read = regspec_decode_read(e, regfile, regname)
                     wrport, write = None, None
                 else:
                     rdflag, read = None, None
                     wrport, write = regspec_decode_write(e, regfile, regname)
-                if regfile not in byregfiles:
-                    byregfiles[regfile] = {}
-                    byregfiles_spec[regfile] = {}
+
+                # construct the dictionary of regspec information by regfile
                 if regname not in byregfiles_spec[regfile]:
                     byregfiles_spec[regfile][regname] = \
                         (rdflag, wrport, read, write, wid, [])
@@ -704,7 +918,19 @@ class NonProductionCore(ControlBase):
                 byregfiles[regfile][idx].append(fuspec)
                 byregfiles_spec[regfile][regname][5].append(fuspec)
 
-        # ok just print that out, for convenience
+                continue
+                # append a latch Signal to the FU's list of latches
+                rname = "%s_%s" % (regfile, regname)
+                if readmode:
+                    if rname not in fu.rd_latches:
+                        rdl = Signal.like(read, name="rdlatch_"+rname)
+                        fu.rd_latches[rname] = rdl
+                else:
+                    if rname not in fu.wr_latches:
+                        wrl = Signal.like(write, name="wrlatch_"+rname)
+                        fu.wr_latches[rname] = wrl
+
+        # ok just print that all out, for convenience
         for regfile, spec in byregfiles.items():
             print("regfile %s ports:" % mode, regfile)
             fuspecs = byregfiles_spec[regfile]