"""
from nmigen import (Elaboratable, Module, Signal, ClockSignal, ResetSignal,
- ClockDomain, DomainRenamer, Mux, Const)
+ ClockDomain, DomainRenamer, Mux, Const, Repl, Cat)
from nmigen.cli import rtlil
from nmigen.cli import main
import sys
+from nmigen.lib.coding import PriorityEncoder
+
from soc.decoder.power_decoder import create_pdecode
from soc.decoder.power_decoder2 import PowerDecode2, SVP64PrefixDecoder
from soc.decoder.decode2execute1 import IssuerDecode2ToOperand
from soc.simple.core import NonProductionCore
from soc.config.test.test_loadstore import TestMemPspec
from soc.config.ifetch import ConfigFetchUnit
-from soc.decoder.power_enums import MicrOp
+from soc.decoder.power_enums import (MicrOp, SVP64PredInt, SVP64PredCR,
+ SVP64PredMode)
+from soc.consts import (CR, SVP64CROffs)
from soc.debug.dmi import CoreDebug, DMIInterface
from soc.debug.jtag import JTAG
from soc.config.pinouts import get_pinspecs
return f_instr_o.word_select(pc[2], 32)
# gets state input or reads from state regfile
-def state_get(m, state_i, name, regfile, regnum):
+def state_get(m, core_rst, state_i, name, regfile, regnum):
comb = m.d.comb
sync = m.d.sync
# read the PC
res = Signal(64, reset_less=True, name=name)
res_ok_delay = Signal(name="%s_ok_delay" % name)
- sync += res_ok_delay.eq(~state_i.ok)
- with m.If(state_i.ok):
- # incoming override (start from pc_i)
- comb += res.eq(state_i.data)
- with m.Else():
- # otherwise read StateRegs regfile for PC...
- comb += regfile.ren.eq(1<<regnum)
- # ... but on a 1-clock delay
- with m.If(res_ok_delay):
- comb += res.eq(regfile.data_o)
+ with m.If(~core_rst):
+ sync += res_ok_delay.eq(~state_i.ok)
+ with m.If(state_i.ok):
+ # incoming override (start from pc_i)
+ comb += res.eq(state_i.data)
+ with m.Else():
+ # otherwise read StateRegs regfile for PC...
+ comb += regfile.ren.eq(1<<regnum)
+ # ... but on a 1-clock delay
+ with m.If(res_ok_delay):
+ comb += res.eq(regfile.data_o)
return res
-def get_predint(m, mask):
+def get_predint(m, mask, name):
"""decode SVP64 predicate integer mask field to reg number and invert
this is identical to the equivalent function in ISACaller except that
it doesn't read the INT directly, it just decodes "what needs to be done"
i.e. which INT reg, whether it is shifted and whether it is bit-inverted.
+
+ * all1s is set to indicate that no mask is to be applied.
+ * regread indicates the GPR register number to be read
+ * invert is set to indicate that the register value is to be inverted
+    * unary indicates that the mask is to be 1<<r3 (a single bit set,
+      selected by the register contents)
"""
- regread = Signal(5)
- invert = Signal()
- unary = Signal()
+ comb = m.d.comb
+ regread = Signal(5, name=name+"regread")
+ invert = Signal(name=name+"invert")
+ unary = Signal(name=name+"unary")
+ all1s = Signal(name=name+"all1s")
with m.Switch(mask):
with m.Case(SVP64PredInt.ALWAYS.value):
- comb += regread.eq(0)
- comb += invert.eq(1)
+ comb += all1s.eq(1) # use 0b1111 (all ones)
with m.Case(SVP64PredInt.R3_UNARY.value):
comb += regread.eq(3)
- comb += unary.eq(1)
+ comb += unary.eq(1) # 1<<r3 - shift r3 (single bit)
with m.Case(SVP64PredInt.R3.value):
comb += regread.eq(3)
with m.Case(SVP64PredInt.R3_N.value):
with m.Case(SVP64PredInt.R30_N.value):
comb += regread.eq(30)
comb += invert.eq(1)
- return regread, invert, unary
+ return regread, invert, unary, all1s
-def get_predcr(m, mask):
+def get_predcr(m, mask, name):
"""decode SVP64 predicate CR to reg number field and invert status
this is identical to _get_predcr in ISACaller
"""
- idx = Signal(2)
- invert = Signal()
+ comb = m.d.comb
+ idx = Signal(2, name=name+"idx")
+ invert = Signal(name=name+"crinvert")
with m.Switch(mask):
with m.Case(SVP64PredCR.LT.value):
- comb += idx.eq(0)
- comb += invert.eq(1)
- with m.Case(SVP64PredCR.GE.value):
- comb += idx.eq(0)
+ comb += idx.eq(CR.LT)
comb += invert.eq(0)
- with m.Case(SVP64PredCR.GT.value):
- comb += idx.eq(1)
+ with m.Case(SVP64PredCR.GE.value):
+ comb += idx.eq(CR.LT)
comb += invert.eq(1)
- with m.Case(SVP64PredCR.LE.value):
- comb += idx.eq(1)
+ with m.Case(SVP64PredCR.GT.value):
+ comb += idx.eq(CR.GT)
comb += invert.eq(0)
- with m.Case(SVP64PredCR.EQ.value):
- comb += idx.eq(2)
+ with m.Case(SVP64PredCR.LE.value):
+ comb += idx.eq(CR.GT)
comb += invert.eq(1)
- with m.Case(SVP64PredCR.NE.value):
- comb += idx.eq(1)
+ with m.Case(SVP64PredCR.EQ.value):
+ comb += idx.eq(CR.EQ)
comb += invert.eq(0)
- with m.Case(SVP64PredCR.SO.value):
- comb += idx.eq(3)
+ with m.Case(SVP64PredCR.NE.value):
+ comb += idx.eq(CR.EQ)
comb += invert.eq(1)
- with m.Case(SVP64PredCR.NS.value):
- comb += idx.eq(3)
+ with m.Case(SVP64PredCR.SO.value):
+ comb += idx.eq(CR.SO)
comb += invert.eq(0)
+ with m.Case(SVP64PredCR.NS.value):
+ comb += idx.eq(CR.SO)
+ comb += invert.eq(1)
return idx, invert
class TestIssuerInternal(Elaboratable):
"""TestIssuer - reads instructions from TestMemory and issues them
- efficiency and speed is not the main goal here: functional correctness is.
+ efficiency and speed is not the main goal here: functional correctness
+    and code clarity are. optimisations (which almost 100% interfere with
+ easy understanding) come later.
"""
def __init__(self, pspec):
# test is SVP64 is to be enabled
self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
+ # and if regfiles are reduced
+ self.regreduce_en = (hasattr(pspec, "regreduce") and
+ (pspec.regreduce == True))
+
# JTAG interface. add this right at the start because if it's
# added it *modifies* the pspec, by adding enable/disable signals
# for parts of the rest of the core
self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag'
if self.jtag_en:
- subset = {'uart', 'mtwi', 'eint', 'gpio', 'mspi0', 'mspi1',
- 'pwm', 'sd0', 'sdr'}
+ # XXX MUST keep this up-to-date with litex, and
+ # soc-cocotb-sim, and err.. all needs sorting out, argh
+ subset = ['uart',
+ 'mtwi',
+ 'eint', 'gpio', 'mspi0',
+ # 'mspi1', - disabled for now
+ # 'pwm', 'sd0', - disabled for now
+ 'sdr']
self.jtag = JTAG(get_pinspecs(subset=subset))
# add signals to pspec to enable/disable icache and dcache
# (or data and intstruction wishbone if icache/dcache not included)
self.cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
self.pdecode2 = PowerDecode2(pdecode, state=self.cur_state,
opkls=IssuerDecode2ToOperand,
- svp64_en=self.svp64_en)
+ svp64_en=self.svp64_en,
+ regreduce_en=self.regreduce_en)
if self.svp64_en:
self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix
self.cr_r = crrf.r_ports['full_cr_dbg'] # CR read
self.xer_r = xerrf.r_ports['full_xer'] # XER read
- # for predication
- self.int_pred = intrf.r_ports['pred'] # INT predicate read
- self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
+ if self.svp64_en:
+ # for predication
+ self.int_pred = intrf.r_ports['pred'] # INT predicate read
+ self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
# hack method of keeping an eye on whether branch/trap set the PC
self.state_nia = self.core.regs.rf['state'].w_ports['nia']
fetch_pc_ready_o, fetch_pc_valid_i,
fetch_insn_valid_o, fetch_insn_ready_i):
"""fetch FSM
+
this FSM performs fetch of raw instruction data, partial-decodes
it 32-bit at a time to detect SVP64 prefixes, and will optionally
read a 2nd 32-bit quantity if that occurs.
sync += dec_opcode_i.eq(insn)
m.next = "INSN_READY"
# TODO: probably can start looking at pdecode2.rm_dec
- # here (or maybe even in INSN_READ state, if svp64_mode
+ # here or maybe even in INSN_READ state, if svp64_mode
# detected, in order to trigger - and wait for - the
# predicate reading.
+ if self.svp64_en:
+ pmode = pdecode2.rm_dec.predmode
+ """
+ if pmode != SVP64PredMode.ALWAYS.value:
+ fire predicate loading FSM and wait before
+ moving to INSN_READY
+ else:
+ sync += self.srcmask.eq(-1) # set to all 1s
+ sync += self.dstmask.eq(-1) # set to all 1s
+ m.next = "INSN_READY"
+ """
with m.State("INSN_READY"):
# hand over the instruction, to be decoded
with m.If(fetch_insn_ready_i):
m.next = "IDLE"
- def fetch_predicate_fsm(self, m, core, TODO):
+ def fetch_predicate_fsm(self, m,
+ pred_insn_valid_i, pred_insn_ready_o,
+ pred_mask_valid_o, pred_mask_ready_i):
"""fetch_predicate_fsm - obtains (constructs in the case of CR)
src/dest predicate masks
later, a faster way would be to use the 32-bit-wide CR port but
this is more complex decoding, here. equivalent code used in
ISACaller is "from soc.decoder.isa.caller import get_predcr"
+
+ note: this ENTIRE FSM is not to be called when svp64 is disabled
"""
comb = m.d.comb
sync = m.d.sync
predmode = rm_dec.predmode
srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred
cr_pred, int_pred = self.cr_pred, self.int_pred # read regfiles
- # if predmode == INT:
- # INT-src sregread, sinvert, sunary = get_predint(m, srcpred)
- # INT-dst dregread, dinvert, dunary = get_predint(m, dstpred)
- # TODO read INT-src and INT-dst into self.srcmask+dstmask
- # elif predmode == CR:
- # CR-src sidx, sinvert = get_predcr(m, srcpred)
- # CR-dst didx, dinvert = get_predcr(m, dstpred)
- # TODO read CR-src and CR-dst into self.srcmask+dstmask with loop
- # else
- # sync += self.srcmask.eq(-1) # set to all 1s
- # sync += self.dstmask.eq(-1) # set to all 1s
+ # get src/dst step, so we can skip already used mask bits
+ cur_state = self.cur_state
+ srcstep = cur_state.svstate.srcstep
+ dststep = cur_state.svstate.dststep
+ cur_vl = cur_state.svstate.vl
+
+ # decode predicates
+ sregread, sinvert, sunary, sall1s = get_predint(m, srcpred, 's')
+ dregread, dinvert, dunary, dall1s = get_predint(m, dstpred, 'd')
+ sidx, scrinvert = get_predcr(m, srcpred, 's')
+ didx, dcrinvert = get_predcr(m, dstpred, 'd')
+
+ with m.FSM(name="fetch_predicate"):
+
+ with m.State("FETCH_PRED_IDLE"):
+ comb += pred_insn_ready_o.eq(1)
+ with m.If(pred_insn_valid_i):
+ with m.If(predmode == SVP64PredMode.INT):
+ # skip fetching destination mask register, when zero
+ with m.If(dall1s):
+ sync += self.dstmask.eq(-1)
+ # directly go to fetch source mask register
+ # guaranteed not to be zero (otherwise predmode
+ # would be SVP64PredMode.ALWAYS, not INT)
+ comb += int_pred.addr.eq(sregread)
+ comb += int_pred.ren.eq(1)
+ m.next = "INT_SRC_READ"
+ # fetch destination predicate register
+ with m.Else():
+ comb += int_pred.addr.eq(dregread)
+ comb += int_pred.ren.eq(1)
+ m.next = "INT_DST_READ"
+ with m.Elif(predmode == SVP64PredMode.CR):
+ # go fetch masks from the CR register file
+ sync += self.srcmask.eq(0)
+ sync += self.dstmask.eq(0)
+ m.next = "CR_READ"
+ with m.Else():
+ sync += self.srcmask.eq(-1)
+ sync += self.dstmask.eq(-1)
+ m.next = "FETCH_PRED_DONE"
+
+ with m.State("INT_DST_READ"):
+ # store destination mask
+ inv = Repl(dinvert, 64)
+ new_dstmask = Signal(64)
+ with m.If(dunary):
+ # set selected mask bit for 1<<r3 mode
+ dst_shift = Signal(range(64))
+ comb += dst_shift.eq(self.int_pred.data_o & 0b111111)
+ comb += new_dstmask.eq(1 << dst_shift)
+ with m.Else():
+ # invert mask if requested
+ comb += new_dstmask.eq(self.int_pred.data_o ^ inv)
+ # shift-out already used mask bits
+ sync += self.dstmask.eq(new_dstmask >> dststep)
+ # skip fetching source mask register, when zero
+ with m.If(sall1s):
+ sync += self.srcmask.eq(-1)
+ m.next = "FETCH_PRED_DONE"
+ # fetch source predicate register
+ with m.Else():
+ comb += int_pred.addr.eq(sregread)
+ comb += int_pred.ren.eq(1)
+ m.next = "INT_SRC_READ"
+
+ with m.State("INT_SRC_READ"):
+ # store source mask
+ inv = Repl(sinvert, 64)
+ new_srcmask = Signal(64)
+ with m.If(sunary):
+ # set selected mask bit for 1<<r3 mode
+ src_shift = Signal(range(64))
+ comb += src_shift.eq(self.int_pred.data_o & 0b111111)
+ comb += new_srcmask.eq(1 << src_shift)
+ with m.Else():
+ # invert mask if requested
+ comb += new_srcmask.eq(self.int_pred.data_o ^ inv)
+ # shift-out already used mask bits
+ sync += self.srcmask.eq(new_srcmask >> srcstep)
+ m.next = "FETCH_PRED_DONE"
+
+ # fetch masks from the CR register file
+ # implements the following loop:
+ # idx, inv = get_predcr(mask)
+ # mask = 0
+ # for cr_idx in range(vl):
+ # cr = crl[cr_idx + SVP64CROffs.CRPred] # takes one cycle to complete
+ # if cr[idx] ^ inv:
+ # mask |= 1 << cr_idx
+ # return mask
+ with m.State("CR_READ"):
+ # the CR index to be read, which will be ready by the next cycle
+ cr_idx = Signal.like(cur_vl, reset_less=True)
+ # submit the read operation to the regfile
+ with m.If(cr_idx != cur_vl):
+ # the CR read port is unary ...
+ # ren = 1 << cr_idx
+ # ... in MSB0 convention ...
+ # ren = 1 << (7 - cr_idx)
+ # ... and with an offset:
+ # ren = 1 << (7 - off - cr_idx)
+ comb += cr_pred.ren.eq(1 << (7 - SVP64CROffs.CRPred - cr_idx))
+ # signal data valid in the next cycle
+ cr_read = Signal(reset_less=True)
+ sync += cr_read.eq(1)
+ # load the next index
+ sync += cr_idx.eq(cr_idx + 1)
+ with m.Else():
+ # exit on loop end
+ sync += cr_read.eq(0)
+ sync += cr_idx.eq(0)
+ m.next = "FETCH_PRED_DONE"
+ with m.If(cr_read):
+ # compensate for the one cycle delay on the regfile
+ cur_cr_idx = Signal.like(cur_vl)
+ comb += cur_cr_idx.eq(cr_idx - 1)
+ # read the CR field, select the appropriate bit
+ cr_field = Signal(4)
+ scr_bit = Signal()
+ dcr_bit = Signal()
+ comb += cr_field.eq(cr_pred.data_o)
+ comb += scr_bit.eq(cr_field.bit_select(sidx, 1) ^ scrinvert)
+ comb += dcr_bit.eq(cr_field.bit_select(didx, 1) ^ dcrinvert)
+ # set the corresponding mask bit
+ bit_to_set = Signal.like(self.srcmask)
+ comb += bit_to_set.eq(1 << cur_cr_idx)
+ with m.If(scr_bit):
+ sync += self.srcmask.eq(self.srcmask | bit_to_set)
+ with m.If(dcr_bit):
+ sync += self.dstmask.eq(self.dstmask | bit_to_set)
+
+ with m.State("FETCH_PRED_DONE"):
+ comb += pred_mask_valid_o.eq(1)
+ with m.If(pred_mask_ready_i):
+ m.next = "FETCH_PRED_IDLE"
def issue_fsm(self, m, core, pc_changed, sv_changed, nia,
dbg, core_rst, is_svp64_mode,
fetch_pc_ready_o, fetch_pc_valid_i,
fetch_insn_valid_o, fetch_insn_ready_i,
+ pred_insn_valid_i, pred_insn_ready_o,
+ pred_mask_valid_o, pred_mask_ready_i,
exec_insn_valid_i, exec_insn_ready_o,
exec_pc_valid_o, exec_pc_ready_i):
"""issue FSM
new_svstate = SVSTATERec("new_svstate")
comb += new_svstate.eq(cur_state.svstate)
+ # precalculate srcstep+1 and dststep+1
+ cur_srcstep = cur_state.svstate.srcstep
+ cur_dststep = cur_state.svstate.dststep
+ next_srcstep = Signal.like(cur_srcstep)
+ next_dststep = Signal.like(cur_dststep)
+ comb += next_srcstep.eq(cur_state.svstate.srcstep+1)
+ comb += next_dststep.eq(cur_state.svstate.dststep+1)
+
with m.FSM(name="issue_fsm"):
# sync with the "fetch" phase which is reading the instruction
m.next = "INSN_WAIT"
with m.Else():
# tell core it's stopped, and acknowledge debug handshake
- comb += core.core_stopped_i.eq(1)
comb += dbg.core_stopped_i.eq(1)
# while stopped, allow updating the PC and SVSTATE
with m.If(self.pc_i.ok):
comb += update_svstate.eq(1)
sync += sv_changed.eq(1)
- # decode the instruction when it arrives
+ # wait for an instruction to arrive from Fetch
with m.State("INSN_WAIT"):
comb += fetch_insn_ready_i.eq(1)
with m.If(fetch_insn_valid_o):
- # decode the instruction
- sync += core.e.eq(pdecode2.e)
- sync += core.state.eq(cur_state)
- sync += core.raw_insn_i.eq(dec_opcode_i)
- sync += core.bigendian_i.eq(self.core_bigendian_i)
- # set RA_OR_ZERO detection in satellite decoders
- sync += core.sv_a_nz.eq(pdecode2.sv_a_nz)
# loop into ISSUE_START if it's a SVP64 instruction
# and VL == 0. this because VL==0 is a for-loop
# from 0 to 0 i.e. always, always a NOP.
comb += self.insn_done.eq(1)
m.next = "ISSUE_START"
with m.Else():
- m.next = "INSN_EXECUTE" # move to "execute"
+ if self.svp64_en:
+ m.next = "PRED_START" # start fetching predicate
+ else:
+ m.next = "DECODE_SV" # skip predication
+
+ with m.State("PRED_START"):
+ comb += pred_insn_valid_i.eq(1) # tell fetch_pred to start
+ with m.If(pred_insn_ready_o): # fetch_pred acknowledged us
+ m.next = "MASK_WAIT"
+
+ with m.State("MASK_WAIT"):
+ comb += pred_mask_ready_i.eq(1) # ready to receive the masks
+ with m.If(pred_mask_valid_o): # predication masks are ready
+ m.next = "PRED_SKIP"
+
+ # skip zeros in predicate
+ with m.State("PRED_SKIP"):
+ with m.If(~is_svp64_mode):
+ m.next = "DECODE_SV" # nothing to do
+ with m.Else():
+ if self.svp64_en:
+ pred_src_zero = pdecode2.rm_dec.pred_sz
+ pred_dst_zero = pdecode2.rm_dec.pred_dz
+
+ # new srcstep, after skipping zeros
+ skip_srcstep = Signal.like(cur_srcstep)
+ # value to be added to the current srcstep
+ src_delta = Signal.like(cur_srcstep)
+ # add leading zeros to srcstep, if not in zero mode
+ with m.If(~pred_src_zero):
+ # priority encoder (count leading zeros)
+ # append guard bit, in case the mask is all zeros
+ pri_enc_src = PriorityEncoder(65)
+ m.submodules.pri_enc_src = pri_enc_src
+ comb += pri_enc_src.i.eq(Cat(self.srcmask,
+ Const(1, 1)))
+ comb += src_delta.eq(pri_enc_src.o)
+ # apply delta to srcstep
+ comb += skip_srcstep.eq(cur_srcstep + src_delta)
+ # shift-out all leading zeros from the mask
+ # plus the leading "one" bit
+ # TODO count leading zeros and shift-out the zero
+ # bits, in the same step, in hardware
+ sync += self.srcmask.eq(self.srcmask >> (src_delta+1))
+
+ # same as above, but for dststep
+ skip_dststep = Signal.like(cur_dststep)
+ dst_delta = Signal.like(cur_dststep)
+ with m.If(~pred_dst_zero):
+ pri_enc_dst = PriorityEncoder(65)
+ m.submodules.pri_enc_dst = pri_enc_dst
+ comb += pri_enc_dst.i.eq(Cat(self.dstmask,
+ Const(1, 1)))
+ comb += dst_delta.eq(pri_enc_dst.o)
+ comb += skip_dststep.eq(cur_dststep + dst_delta)
+ sync += self.dstmask.eq(self.dstmask >> (dst_delta+1))
+
+ # TODO: initialize mask[VL]=1 to avoid passing past VL
+ with m.If((skip_srcstep >= cur_vl) |
+ (skip_dststep >= cur_vl)):
+ # end of VL loop. Update PC and reset src/dst step
+ comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+ comb += self.state_w_pc.data_i.eq(nia)
+ comb += new_svstate.srcstep.eq(0)
+ comb += new_svstate.dststep.eq(0)
+ comb += update_svstate.eq(1)
+ # synchronize with the simulator
+ comb += self.insn_done.eq(1)
+ # go back to Issue
+ m.next = "ISSUE_START"
+ with m.Else():
+ # update new src/dst step
+ comb += new_svstate.srcstep.eq(skip_srcstep)
+ comb += new_svstate.dststep.eq(skip_dststep)
+ comb += update_svstate.eq(1)
+ # proceed to Decode
+ m.next = "DECODE_SV"
+
+ # after src/dst step have been updated, we are ready
+ # to decode the instruction
+ with m.State("DECODE_SV"):
+ # decode the instruction
+ sync += core.e.eq(pdecode2.e)
+ sync += core.state.eq(cur_state)
+ sync += core.raw_insn_i.eq(dec_opcode_i)
+ sync += core.bigendian_i.eq(self.core_bigendian_i)
+ # set RA_OR_ZERO detection in satellite decoders
+ sync += core.sv_a_nz.eq(pdecode2.sv_a_nz)
+ m.next = "INSN_EXECUTE" # move to "execute"
# handshake with execution FSM, move to "wait" once acknowledged
with m.State("INSN_EXECUTE"):
- # with m.If(is_svp64_mode):
- # TODO advance src/dst step to "skip" over predicated-out
- # from self.srcmask and self.dstmask
- # https://bugs.libre-soc.org/show_bug.cgi?id=617#c3
- # but still without exceeding VL in either case
comb += exec_insn_valid_i.eq(1) # trigger execute
with m.If(exec_insn_ready_o): # execute acknowledged us
m.next = "EXECUTE_WAIT"
with m.If(~dbg.core_stop_o & ~core_rst):
comb += exec_pc_ready_i.eq(1)
with m.If(exec_pc_valid_o):
- # precalculate srcstep+1 and dststep+1
- next_srcstep = Signal.like(cur_state.svstate.srcstep)
- next_dststep = Signal.like(cur_state.svstate.dststep)
- comb += next_srcstep.eq(cur_state.svstate.srcstep+1)
- comb += next_dststep.eq(cur_state.svstate.dststep+1)
# was this the last loop iteration?
is_last = Signal()
comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
comb += self.state_w_pc.data_i.eq(nia)
# reset SRCSTEP before returning to Fetch
- with m.If(pdecode2.loop_continue):
+ if self.svp64_en:
+ with m.If(pdecode2.loop_continue):
+ comb += new_svstate.srcstep.eq(0)
+ comb += new_svstate.dststep.eq(0)
+ comb += update_svstate.eq(1)
+ else:
comb += new_svstate.srcstep.eq(0)
comb += new_svstate.dststep.eq(0)
comb += update_svstate.eq(1)
comb += new_svstate.srcstep.eq(next_srcstep)
comb += new_svstate.dststep.eq(next_dststep)
comb += update_svstate.eq(1)
- m.next = "DECODE_SV"
+ # return to mask skip loop
+ m.next = "PRED_SKIP"
with m.Else():
- comb += core.core_stopped_i.eq(1)
comb += dbg.core_stopped_i.eq(1)
# while stopped, allow updating the PC and SVSTATE
with m.If(self.pc_i.ok):
comb += update_svstate.eq(1)
sync += sv_changed.eq(1)
- # need to decode the instruction again, after updating SRCSTEP
- # in the previous state.
- # mostly a copy of INSN_WAIT, but without the actual wait
- with m.State("DECODE_SV"):
- # decode the instruction
- sync += core.e.eq(pdecode2.e)
- sync += core.state.eq(cur_state)
- sync += core.bigendian_i.eq(self.core_bigendian_i)
- sync += core.sv_a_nz.eq(pdecode2.sv_a_nz)
- m.next = "INSN_EXECUTE" # move to "execute"
-
# check if svstate needs updating: if so, write it to State Regfile
with m.If(update_svstate):
comb += self.state_w_sv.wen.eq(1<<StateRegs.SVSTATE)
# set up peripherals and core
core_rst = self.setup_peripherals(m)
+ # reset current state if core reset requested
+ with m.If(core_rst):
+ m.d.sync += self.cur_state.eq(0)
+
# PC and instruction from I-Memory
comb += self.pc_o.eq(cur_state.pc)
pc_changed = Signal() # note write to PC
# read state either from incoming override or from regfile
# TODO: really should be doing MSR in the same way
- pc = state_get(m, self.pc_i, "pc", # read PC
+ pc = state_get(m, core_rst, self.pc_i,
+ "pc", # read PC
self.state_r_pc, StateRegs.PC)
- svstate = state_get(m, self.svstate_i, "svstate", # read SVSTATE
+ svstate = state_get(m, core_rst, self.svstate_i,
+ "svstate", # read SVSTATE
self.state_r_sv, StateRegs.SVSTATE)
# don't write pc every cycle
# address of the next instruction, in the absence of a branch
# depends on the instruction size
- nia = Signal(64, reset_less=True)
+ nia = Signal(64)
# connect up debug signals
# TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o)
fetch_insn_valid_o = Signal()
fetch_insn_ready_i = Signal()
+ # predicate fetch FSM decodes and fetches the predicate
+ pred_insn_valid_i = Signal()
+ pred_insn_ready_o = Signal()
+
+ # predicate fetch FSM delivers the masks
+ pred_mask_valid_o = Signal()
+ pred_mask_ready_i = Signal()
+
# issue FSM delivers the instruction to the be executed
exec_insn_valid_i = Signal()
exec_insn_ready_o = Signal()
# (as opposed to using sync - which would be on a clock's delay)
# this includes the actual opcode, valid flags and so on.
- # Fetch, then Issue, then Execute. Issue is where the VL for-loop
- # lives. the ready/valid signalling is used to communicate between
- # the three.
+ # Fetch, then predicate fetch, then Issue, then Execute.
+        # Issue is where the VL for-loop lives. the ready/valid
+ # signalling is used to communicate between the four.
self.fetch_fsm(m, core, pc, svstate, nia, is_svp64_mode,
fetch_pc_ready_o, fetch_pc_valid_i,
dbg, core_rst, is_svp64_mode,
fetch_pc_ready_o, fetch_pc_valid_i,
fetch_insn_valid_o, fetch_insn_ready_i,
+ pred_insn_valid_i, pred_insn_ready_o,
+ pred_mask_valid_o, pred_mask_ready_i,
exec_insn_valid_i, exec_insn_ready_o,
exec_pc_valid_o, exec_pc_ready_i)
+ if self.svp64_en:
+ self.fetch_predicate_fsm(m,
+ pred_insn_valid_i, pred_insn_ready_o,
+ pred_mask_valid_o, pred_mask_ready_i)
+
self.execute_fsm(m, core, pc_changed, sv_changed,
exec_insn_valid_i, exec_insn_ready_o,
exec_pc_valid_o, exec_pc_ready_i)
+ # whatever was done above, over-ride it if core reset is held
+ with m.If(core_rst):
+ sync += nia.eq(0)
+
# this bit doesn't have to be in the FSM: connect up to read
# regfiles on demand from DMI
self.do_dmi(m, dbg)
return m
def do_dmi(self, m, dbg):
+ """deals with DMI debug requests
+
+ currently only provides read requests for the INT regfile, CR and XER
+ it will later also deal with *writing* to these regfiles.
+ """
comb = m.d.comb
sync = m.d.sync
dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll
if self.pll_en:
self.pll_18_o = Signal(reset_less=True)
+ self.clk_sel_i = Signal(reset_less=True)
def elaborate(self, platform):
m = Module()
# output 18 mhz PLL test signal
comb += self.pll_18_o.eq(pll.pll_18_o)
+ # input to pll clock selection
+ comb += Cat(pll.sel_a0_i, pll.sel_a1_i).eq(self.clk_sel_i)
+
# now wire up ResetSignals. don't mind them being in this domain
pll_rst = ResetSignal("pllclk")
comb += pll_rst.eq(ResetSignal())
ports.append(ClockSignal())
ports.append(ResetSignal())
if self.pll_en:
- ports.append(self.pll.clk_sel_i)
+ ports.append(self.clk_sel_i)
ports.append(self.pll_18_o)
- ports.append(self.pll.pll_lck_o)
+ ports.append(self.pll.pll_ana_o)
return ports