minor cleanup in ISACaller on result handling
[openpower-isa.git] / src / openpower / decoder / isa / caller.py
index 39e195ca7906f1f44dd602523e2d5a35b6cc38f3..94ac7c008366d5edc9a7fadd34e8033e135084df 100644 (file)
@@ -14,7 +14,7 @@ related bugs:
 """
 
 import re
 """
 
 import re
-from nmigen.sim import Settle
+from nmigen.sim import Settle, Delay
 from functools import wraps
 from copy import copy, deepcopy
 from openpower.decoder.orderedset import OrderedSet
 from functools import wraps
 from copy import copy, deepcopy
 from openpower.decoder.orderedset import OrderedSet
@@ -30,14 +30,14 @@ from openpower.decoder.power_enums import (spr_dict, spr_byname, XER_bits,
                                            OutSel, CRInSel, CROutSel, LDSTMode,
                                            SVP64RMMode, SVP64PredMode,
                                            SVP64PredInt, SVP64PredCR,
                                            OutSel, CRInSel, CROutSel, LDSTMode,
                                            SVP64RMMode, SVP64PredMode,
                                            SVP64PredInt, SVP64PredCR,
-                                           SVP64LDSTmode)
+                                           SVP64LDSTmode, FPTRANS_INSNS)
 
 from openpower.decoder.power_enums import SVPtype
 
 from openpower.decoder.helpers import (exts, gtu, ltu, undefined,
                                        ISACallerHelper, ISAFPHelpers)
 from openpower.consts import PIb, MSRb  # big-endian (PowerISA versions)
 
 from openpower.decoder.power_enums import SVPtype
 
 from openpower.decoder.helpers import (exts, gtu, ltu, undefined,
                                        ISACallerHelper, ISAFPHelpers)
 from openpower.consts import PIb, MSRb  # big-endian (PowerISA versions)
-from openpower.consts import (SVP64MODE,
+from openpower.consts import (SVP64MODE, SVP64MODEb,
                               SVP64CROffs,
                               )
 from openpower.decoder.power_svp64 import SVP64RM, decode_extra
                               SVP64CROffs,
                               )
 from openpower.decoder.power_svp64 import SVP64RM, decode_extra
@@ -66,6 +66,9 @@ special_sprs = {
     'VRSAVE': 256}
 
 
     'VRSAVE': 256}
 
 
+# This table exists because the compiler's pywriter returns results in a
+# specific priority order.  To make sure the registers match up with that
+# order, they need partial sorting.
 REG_SORT_ORDER = {
     # TODO (lkcl): adjust other registers that should be in a particular order
     # probably CA, CA32, and CR
 REG_SORT_ORDER = {
     # TODO (lkcl): adjust other registers that should be in a particular order
     # probably CA, CA32, and CR
@@ -95,6 +98,7 @@ REG_SORT_ORDER = {
     "CA32": 0,
 
     "overflow": 7,  # should definitely be last
     "CA32": 0,
 
     "overflow": 7,  # should definitely be last
+    "CR0": 8,       # likewise
 }
 
 fregs = ['FRA', 'FRB', 'FRC', 'FRS', 'FRT']
 }
 
 fregs = ['FRA', 'FRB', 'FRC', 'FRS', 'FRT']
@@ -278,33 +282,32 @@ class CRFields:
             _cr = FieldSelectableInt(self.cr, bits)
             self.crl.append(_cr)
 
             _cr = FieldSelectableInt(self.cr, bits)
             self.crl.append(_cr)
 
-# decode SVP64 predicate integer to reg number and invert
-
 
 
+# decode SVP64 predicate integer to reg number and invert
 def get_predint(gpr, mask):
 def get_predint(gpr, mask):
+    r3 = gpr(3)
     r10 = gpr(10)
     r30 = gpr(30)
     log("get_predint", mask, SVP64PredInt.ALWAYS.value)
     if mask == SVP64PredInt.ALWAYS.value:
         return 0xffff_ffff_ffff_ffff  # 64 bits of 1
     if mask == SVP64PredInt.R3_UNARY.value:
     r10 = gpr(10)
     r30 = gpr(30)
     log("get_predint", mask, SVP64PredInt.ALWAYS.value)
     if mask == SVP64PredInt.ALWAYS.value:
         return 0xffff_ffff_ffff_ffff  # 64 bits of 1
     if mask == SVP64PredInt.R3_UNARY.value:
-        return 1 << (gpr(3).value & 0b111111)
+        return 1 << (r3.value & 0b111111)
     if mask == SVP64PredInt.R3.value:
     if mask == SVP64PredInt.R3.value:
-        return gpr(3).value
+        return r3.value
     if mask == SVP64PredInt.R3_N.value:
     if mask == SVP64PredInt.R3_N.value:
-        return ~gpr(3).value
+        return ~r3.value
     if mask == SVP64PredInt.R10.value:
     if mask == SVP64PredInt.R10.value:
-        return gpr(10).value
+        return r10.value
     if mask == SVP64PredInt.R10_N.value:
     if mask == SVP64PredInt.R10_N.value:
-        return ~gpr(10).value
+        return ~r10.value
     if mask == SVP64PredInt.R30.value:
     if mask == SVP64PredInt.R30.value:
-        return gpr(30).value
+        return r30.value
     if mask == SVP64PredInt.R30_N.value:
     if mask == SVP64PredInt.R30_N.value:
-        return ~gpr(30).value
-
-# decode SVP64 predicate CR to reg number and invert status
+        return ~r30.value
 
 
 
 
+# decode SVP64 predicate CR to reg number and invert status
 def _get_predcr(mask):
     if mask == SVP64PredCR.LT.value:
         return 0, 1
 def _get_predcr(mask):
     if mask == SVP64PredCR.LT.value:
         return 0, 1
@@ -323,10 +326,9 @@ def _get_predcr(mask):
     if mask == SVP64PredCR.NS.value:
         return 3, 0
 
     if mask == SVP64PredCR.NS.value:
         return 3, 0
 
+
 # read individual CR fields (0..VL-1), extract the required bit
 # and construct the mask
 # read individual CR fields (0..VL-1), extract the required bit
 # and construct the mask
-
-
 def get_predcr(crl, mask, vl):
     idx, noninv = _get_predcr(mask)
     mask = 0
 def get_predcr(crl, mask, vl):
     idx, noninv = _get_predcr(mask)
     mask = 0
@@ -363,7 +365,7 @@ def get_pdecode_idx_in(dec2, name):
     log("get_pdecode_idx_in FRC in3", name, in3_sel, In3Sel.FRC.value,
         in3, in3_isvec)
     # identify which regnames map to in1/2/3
     log("get_pdecode_idx_in FRC in3", name, in3_sel, In3Sel.FRC.value,
         in3, in3_isvec)
     # identify which regnames map to in1/2/3
-    if name == 'RA':
+    if name == 'RA' or name == 'RA_OR_ZERO':
         if (in1_sel == In1Sel.RA.value or
                 (in1_sel == In1Sel.RA_OR_ZERO.value and in1 != 0)):
             return in1, in1_isvec
         if (in1_sel == In1Sel.RA.value or
                 (in1_sel == In1Sel.RA_OR_ZERO.value and in1 != 0)):
             return in1, in1_isvec
@@ -376,6 +378,8 @@ def get_pdecode_idx_in(dec2, name):
             return in3, in3_isvec
     # XXX TODO, RC doesn't exist yet!
     elif name == 'RC':
             return in3, in3_isvec
     # XXX TODO, RC doesn't exist yet!
     elif name == 'RC':
+        if in3_sel == In3Sel.RC.value:
+            return in3, in3_isvec
         assert False, "RC does not exist yet"
     elif name == 'RS':
         if in1_sel == In1Sel.RS.value:
         assert False, "RC does not exist yet"
     elif name == 'RS':
         if in1_sel == In1Sel.RS.value:
@@ -445,6 +449,9 @@ def get_pdecode_cr_out(dec2, name):
     if name == 'CR0':
         if out_sel == CROutSel.CR0.value:
             return out, o_isvec
     if name == 'CR0':
         if out_sel == CROutSel.CR0.value:
             return out, o_isvec
+    if name == 'CR1':  # these are not actually calculated correctly
+        if out_sel == CROutSel.CR1.value:
+            return out, o_isvec
     log("get_pdecode_cr_out not found", name)
     return None, False
 
     log("get_pdecode_cr_out not found", name)
     return None, False
 
@@ -467,6 +474,8 @@ def get_pdecode_idx_out(dec2, name):
             dec2.dec.RT)
         if out_sel == OutSel.RT.value:
             return out, o_isvec
             dec2.dec.RT)
         if out_sel == OutSel.RT.value:
             return out, o_isvec
+        if out_sel == OutSel.RT_OR_ZERO.value and out != 0:
+            return out, o_isvec
     elif name == 'RT_OR_ZERO':
         log("get_pdecode_idx_out", out_sel, OutSel.RT.value,
             OutSel.RT_OR_ZERO.value, out, o_isvec,
     elif name == 'RT_OR_ZERO':
         log("get_pdecode_idx_out", out_sel, OutSel.RT.value,
             OutSel.RT_OR_ZERO.value, out, o_isvec,
@@ -507,10 +516,14 @@ def get_pdecode_idx_out2(dec2, name):
                 out, o_isvec)
             if upd == LDSTMode.update.value:
                 return out, o_isvec
                 out, o_isvec)
             if upd == LDSTMode.update.value:
                 return out, o_isvec
+    if name == 'RS':
+        fft_en = yield dec2.implicit_rs
+        if fft_en:
+            log("get_pdecode_idx_out2", out_sel, OutSel.RS.value,
+                out, o_isvec)
+            return out, o_isvec
     if name == 'FRS':
     if name == 'FRS':
-        int_op = yield dec2.dec.op.internal_op
-        fft_en = yield dec2.use_svp64_fft
-        # if int_op == MicrOp.OP_FP_MADD.value and fft_en:
+        fft_en = yield dec2.implicit_rs
         if fft_en:
             log("get_pdecode_idx_out2", out_sel, OutSel.FRS.value,
                 out, o_isvec)
         if fft_en:
             log("get_pdecode_idx_out2", out_sel, OutSel.FRS.value,
                 out, o_isvec)
@@ -518,7 +531,459 @@ def get_pdecode_idx_out2(dec2, name):
     return None, False
 
 
     return None, False
 
 
-class ISACaller(ISACallerHelper, ISAFPHelpers):
+class StepLoop:
+    """deals with svstate looping.
+    """
+
+    def __init__(self, svstate):
+        self.svstate = svstate
+        self.new_iterators()
+
+    def new_iterators(self):
+        self.src_it = self.src_iterator()
+        self.dst_it = self.dst_iterator()
+        self.loopend = False
+        self.new_srcstep = 0
+        self.new_dststep = 0
+        self.new_ssubstep = 0
+        self.new_dsubstep = 0
+        self.pred_dst_zero = 0
+        self.pred_src_zero = 0
+    def src_iterator(self):
+        """source-stepping iterator
+        """
+        pack = self.svstate.pack
+
+        # source step
+        if pack:
+            # pack advances subvl in *outer* loop
+            while True:  # outer subvl loop
+                while True:  # inner vl loop
+                    vl = self.svstate.vl
+                    subvl = self.subvl
+                    srcmask = self.srcmask
+                    srcstep = self.svstate.srcstep
+                    pred_src_zero = ((1 << srcstep) & srcmask) != 0
+                    if self.pred_sz or pred_src_zero:
+                        self.pred_src_zero = not pred_src_zero
+                        log("    advance src", srcstep, vl,
+                            self.svstate.ssubstep, subvl)
+                        # yield actual substep/srcstep
+                        yield (self.svstate.ssubstep, srcstep)
+                    # the way yield works these could have been modified.
+                    vl = self.svstate.vl
+                    subvl = self.subvl
+                    srcstep = self.svstate.srcstep
+                    log("    advance src check", srcstep, vl,
+                        self.svstate.ssubstep, subvl, srcstep == vl-1,
+                        self.svstate.ssubstep == subvl)
+                    if srcstep == vl-1:  # end-point
+                        self.svstate.srcstep = SelectableInt(0, 7)  # reset
+                        if self.svstate.ssubstep == subvl:  # end-point
+                            log("    advance pack stop")
+                            return
+                        break # exit inner loop
+                    self.svstate.srcstep += SelectableInt(1, 7)  # advance ss
+                subvl = self.subvl
+                if self.svstate.ssubstep == subvl:  # end-point
+                    self.svstate.ssubstep = SelectableInt(0, 2)  # reset
+                    log("    advance pack stop")
+                    return
+                self.svstate.ssubstep += SelectableInt(1, 2)
+
+        else:
+            # these cannot be done as for-loops because SVSTATE may change
+            # (srcstep/substep may be modified, interrupted, subvl/vl change)
+            # but they *can* be done as while-loops as long as every SVSTATE
+            # "thing" is re-read every single time a yield gives indices
+            while True:  # outer vl loop
+                while True:  # inner subvl loop
+                    vl = self.svstate.vl
+                    subvl = self.subvl
+                    srcmask = self.srcmask
+                    srcstep = self.svstate.srcstep
+                    pred_src_zero = ((1 << srcstep) & srcmask) != 0
+                    if self.pred_sz or pred_src_zero:
+                        self.pred_src_zero = not pred_src_zero
+                        log("    advance src", srcstep, vl,
+                            self.svstate.ssubstep, subvl)
+                        # yield actual substep/srcstep
+                        yield (self.svstate.ssubstep, srcstep)
+                    if self.svstate.ssubstep == subvl:  # end-point
+                        self.svstate.ssubstep = SelectableInt(0, 2)  # reset
+                        break # exit inner loop
+                    self.svstate.ssubstep += SelectableInt(1, 2)
+                vl = self.svstate.vl
+                if srcstep == vl-1:  # end-point
+                    self.svstate.srcstep = SelectableInt(0, 7)  # reset
+                    self.loopend = True
+                    return
+                self.svstate.srcstep += SelectableInt(1, 7)  # advance srcstep
+
+    def dst_iterator(self):
+        """dest-stepping iterator
+        """
+        unpack = self.svstate.unpack
+
+        # dest step
+        if unpack:
+            # unpack advances subvl in *outer* loop
+            while True:  # outer subvl loop
+                while True:  # inner vl loop
+                    vl = self.svstate.vl
+                    subvl = self.subvl
+                    dstmask = self.dstmask
+                    dststep = self.svstate.dststep
+                    pred_dst_zero = ((1 << dststep) & dstmask) != 0
+                    if self.pred_dz or pred_dst_zero:
+                        self.pred_dst_zero = not pred_dst_zero
+                        log("    advance dst", dststep, vl,
+                            self.svstate.dsubstep, subvl)
+                        # yield actual substep/dststep
+                        yield (self.svstate.dsubstep, dststep)
+                    # the way yield works these could have been modified.
+                    vl = self.svstate.vl
+                    dststep = self.svstate.dststep
+                    log("    advance dst check", dststep, vl,
+                        self.svstate.ssubstep, subvl)
+                    if dststep == vl-1:  # end-point
+                        self.svstate.dststep = SelectableInt(0, 7)  # reset
+                        if self.svstate.dsubstep == subvl:  # end-point
+                            log("    advance unpack stop")
+                            return
+                        break
+                    self.svstate.dststep += SelectableInt(1, 7)  # advance ds
+                subvl = self.subvl
+                if self.svstate.dsubstep == subvl:  # end-point
+                    self.svstate.dsubstep = SelectableInt(0, 2)  # reset
+                    log("    advance unpack stop")
+                    return
+                self.svstate.dsubstep += SelectableInt(1, 2)
+        else:
+            # these cannot be done as for-loops because SVSTATE may change
+            # (dststep/substep may be modified, interrupted, subvl/vl change)
+            # but they *can* be done as while-loops as long as every SVSTATE
+            # "thing" is re-read every single time a yield gives indices
+            while True:  # outer vl loop
+                while True:  # inner subvl loop
+                    subvl = self.subvl
+                    dstmask = self.dstmask
+                    dststep = self.svstate.dststep
+                    pred_dst_zero = ((1 << dststep) & dstmask) != 0
+                    if self.pred_dz or pred_dst_zero:
+                        self.pred_dst_zero = not pred_dst_zero
+                        log("    advance dst", dststep, self.svstate.vl,
+                            self.svstate.dsubstep, subvl)
+                        # yield actual substep/dststep
+                        yield (self.svstate.dsubstep, dststep)
+                    if self.svstate.dsubstep == subvl:  # end-point
+                        self.svstate.dsubstep = SelectableInt(0, 2)  # reset
+                        break
+                    self.svstate.dsubstep += SelectableInt(1, 2)
+                subvl = self.subvl
+                vl = self.svstate.vl
+                if dststep == vl-1:  # end-point
+                    self.svstate.dststep = SelectableInt(0, 7)  # reset
+                    return
+                self.svstate.dststep += SelectableInt(1, 7)  # advance dststep
+
+    def src_iterate(self):
+        """advance svstate srcstep/ssubstep (sets loopend when finished)
+        """
+        subvl = self.subvl
+        vl = self.svstate.vl
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
+        ssubstep = self.svstate.ssubstep
+        end_ssub = ssubstep == subvl
+        end_src = self.svstate.srcstep == vl-1
+        log("    pack/unpack/subvl", pack, unpack, subvl,
+            "end", end_src,
+            "sub", end_ssub)
+        # first source step
+        srcstep = self.svstate.srcstep
+        srcmask = self.srcmask
+        if pack:
+            # pack advances subvl in *outer* loop
+            while True:
+                assert srcstep <= vl-1
+                end_src = srcstep == vl-1
+                if end_src:
+                    if end_ssub:
+                        self.loopend = True
+                    else:
+                        self.svstate.ssubstep += SelectableInt(1, 2)
+                    srcstep = 0  # reset
+                    break
+                else:
+                    srcstep += 1  # advance srcstep
+                    if not self.srcstep_skip:
+                        break
+                    if ((1 << srcstep) & srcmask) != 0:
+                        break
+                    else:
+                        log("      sskip", bin(srcmask), bin(1 << srcstep))
+        else:
+            # advance subvl in *inner* loop
+            if end_ssub:
+                while True:
+                    assert srcstep <= vl-1
+                    end_src = srcstep == vl-1
+                    if end_src:  # end-point
+                        self.loopend = True
+                        srcstep = 0
+                        break
+                    else:
+                        srcstep += 1
+                    if not self.srcstep_skip:
+                        break
+                    if ((1 << srcstep) & srcmask) != 0:
+                        break
+                    else:
+                        log("      sskip", bin(srcmask), bin(1 << srcstep))
+                self.svstate.ssubstep = SelectableInt(0, 2)  # reset
+            else:
+                # advance ssubstep
+                self.svstate.ssubstep += SelectableInt(1, 2)
+
+        self.svstate.srcstep = SelectableInt(srcstep, 7)
+        log("    advance src", self.svstate.srcstep, self.svstate.ssubstep,
+                               self.loopend)
+
+    def dst_iterate(self):
+        """advance svstate dststep/dsubstep (sets loopend when finished)
+        """
+        vl = self.svstate.vl
+        subvl = self.subvl
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
+        dsubstep = self.svstate.dsubstep
+        end_dsub = dsubstep == subvl
+        dststep = self.svstate.dststep
+        end_dst = dststep == vl-1
+        dstmask = self.dstmask
+        log("    pack/unpack/subvl", pack, unpack, subvl,
+            "end", end_dst,
+            "sub", end_dsub)
+        # now dest step
+        if unpack:
+            # unpack advances subvl in *outer* loop
+            while True:
+                assert dststep <= vl-1
+                end_dst = dststep == vl-1
+                if end_dst:
+                    if end_dsub:
+                        self.loopend = True
+                    else:
+                        self.svstate.dsubstep += SelectableInt(1, 2)
+                    dststep = 0  # reset
+                    break
+                else:
+                    dststep += 1  # advance dststep
+                    if not self.dststep_skip:
+                        break
+                    if ((1 << dststep) & dstmask) != 0:
+                        break
+                    else:
+                        log("      dskip", bin(dstmask), bin(1 << dststep))
+        else:
+            # advance subvl in *inner* loop
+            if end_dsub:
+                while True:
+                    assert dststep <= vl-1
+                    end_dst = dststep == vl-1
+                    if end_dst:  # end-point
+                        self.loopend = True
+                        dststep = 0
+                        break
+                    else:
+                        dststep += 1
+                    if not self.dststep_skip:
+                        break
+                    if ((1 << dststep) & dstmask) != 0:
+                        break
+                    else:
+                        log("      dskip", bin(dstmask), bin(1 << dststep))
+                self.svstate.dsubstep = SelectableInt(0, 2)  # reset
+            else:
+                # advance dsubstep
+                self.svstate.dsubstep += SelectableInt(1, 2)
+
+        self.svstate.dststep = SelectableInt(dststep, 7)
+        log("    advance dst", self.svstate.dststep, self.svstate.dsubstep,
+                               self.loopend)
+
+    def at_loopend(self):
+        """tells if this is the last possible element.  uses the cached values
+        for src/dst-step and sub-steps
+        """
+        subvl = self.subvl
+        vl = self.svstate.vl
+        srcstep, dststep = self.new_srcstep, self.new_dststep
+        ssubstep, dsubstep = self.new_ssubstep, self.new_dsubstep
+        end_ssub = ssubstep == subvl
+        end_dsub = dsubstep == subvl
+        if srcstep == vl-1 and end_ssub:
+            return True
+        if dststep == vl-1 and end_dsub:
+            return True
+        return False
+
+    def advance_svstate_steps(self):
+        """ advance sub/steps. note that Pack/Unpack *INVERTS* the order.
+        TODO when Pack/Unpack is set, substep becomes the *outer* loop
+        """
+        self.subvl = yield self.dec2.rm_dec.rm_in.subvl
+        if self.loopend: # huhn??
+            return
+        self.src_iterate()
+        self.dst_iterate()
+
+    def read_src_mask(self):
+        """read/update pred_sz and src mask
+        """
+        # get SVSTATE VL (oh and print out some debug stuff)
+        vl = self.svstate.vl
+        srcstep = self.svstate.srcstep
+        ssubstep = self.svstate.ssubstep
+
+        # get predicate mask (all 64 bits)
+        srcmask = 0xffff_ffff_ffff_ffff
+
+        pmode = yield self.dec2.rm_dec.predmode
+        sv_ptype = yield self.dec2.dec.op.SV_Ptype
+        srcpred = yield self.dec2.rm_dec.srcpred
+        dstpred = yield self.dec2.rm_dec.dstpred
+        pred_sz = yield self.dec2.rm_dec.pred_sz
+        if pmode == SVP64PredMode.INT.value:
+            srcmask = dstmask = get_predint(self.gpr, dstpred)
+            if sv_ptype == SVPtype.P2.value:
+                srcmask = get_predint(self.gpr, srcpred)
+        elif pmode == SVP64PredMode.CR.value:
+            srcmask = dstmask = get_predcr(self.crl, dstpred, vl)
+            if sv_ptype == SVPtype.P2.value:
+                srcmask = get_predcr(self.crl, srcpred, vl)
+        # work out if ssubstep is at its start (zero)
+        ssubstart = ssubstep == 0
+        log("    pmode", pmode)
+        log("    ptype", sv_ptype)
+        log("    srcpred", bin(srcpred))
+        log("    srcmask", bin(srcmask))
+        log("    pred_sz", bin(pred_sz))
+        log("    ssubstart", ssubstart)
+
+        # store all that above
+        self.srcstep_skip = False
+        self.srcmask = srcmask
+        self.pred_sz = pred_sz
+        self.new_ssubstep = ssubstep
+        log("    new ssubstep", ssubstep)
+        # until the predicate mask has a "1" bit... or we run out of VL
+        # let srcstep==VL be the indicator to move to next instruction
+        if not pred_sz:
+            self.srcstep_skip = True
+
+    def read_dst_mask(self):
+        """same as read_src_mask - check and record everything needed
+        """
+        # get SVSTATE VL (oh and print out some debug stuff)
+        # yield Delay(1e-10)  # make changes visible
+        vl = self.svstate.vl
+        dststep = self.svstate.dststep
+        dsubstep = self.svstate.dsubstep
+
+        # get predicate mask (all 64 bits)
+        dstmask = 0xffff_ffff_ffff_ffff
+
+        pmode = yield self.dec2.rm_dec.predmode
+        reverse_gear = yield self.dec2.rm_dec.reverse_gear
+        sv_ptype = yield self.dec2.dec.op.SV_Ptype
+        dstpred = yield self.dec2.rm_dec.dstpred
+        pred_dz = yield self.dec2.rm_dec.pred_dz
+        if pmode == SVP64PredMode.INT.value:
+            dstmask = get_predint(self.gpr, dstpred)
+        elif pmode == SVP64PredMode.CR.value:
+            dstmask = get_predcr(self.crl, dstpred, vl)
+        # work out if dsubstep is at its start (zero)
+        dsubstart = dsubstep == 0
+        log("    pmode", pmode)
+        log("    ptype", sv_ptype)
+        log("    dstpred", bin(dstpred))
+        log("    dstmask", bin(dstmask))
+        log("    pred_dz", bin(pred_dz))
+        log("    dsubstart", dsubstart)
+
+        self.dststep_skip = False
+        self.dstmask = dstmask
+        self.pred_dz = pred_dz
+        self.new_dsubstep = dsubstep
+        log("    new dsubstep", dsubstep)
+        if not pred_dz:
+            self.dststep_skip = True
+
+    def svstate_pre_inc(self):
+        """check if srcstep/dststep need to skip over masked-out predicate bits
+        note that this is not supposed to do anything to substep,
+        it is purely for skipping masked-out bits
+        """
+
+        self.subvl = yield self.dec2.rm_dec.rm_in.subvl
+        yield from self.read_src_mask()
+        yield from self.read_dst_mask()
+
+        self.skip_src()
+        self.skip_dst()
+
+    def skip_src(self):
+
+        srcstep = self.svstate.srcstep
+        srcmask = self.srcmask
+        pred_src_zero = self.pred_sz
+        vl = self.svstate.vl
+        # srcstep-skipping opportunity identified
+        if self.srcstep_skip:
+            # cannot do this with sv.bc - XXX TODO
+            if srcmask == 0:
+                self.loopend = True
+            while (((1 << srcstep) & srcmask) == 0) and (srcstep != vl):
+                log("      sskip", bin(1 << srcstep))
+                srcstep += 1
+
+        # now work out if the relevant mask bits require zeroing
+        if pred_src_zero:
+            pred_src_zero = ((1 << srcstep) & srcmask) == 0
+
+        # store new srcstep and pred_src_zero
+        self.new_srcstep = srcstep
+        self.pred_src_zero = pred_src_zero
+        log("    new srcstep", srcstep)
+
+    def skip_dst(self):
+        # dststep-skipping opportunity identified
+        dststep = self.svstate.dststep
+        dstmask = self.dstmask
+        pred_dst_zero = self.pred_dz
+        vl = self.svstate.vl
+        if self.dststep_skip:
+            # cannot do this with sv.bc - XXX TODO
+            if dstmask == 0:
+                self.loopend = True
+            while (((1 << dststep) & dstmask) == 0) and (dststep != vl):
+                log("      dskip", bin(1 << dststep))
+                dststep += 1
+
+        # now work out if the relevant mask bits require zeroing
+        if pred_dst_zero:
+            pred_dst_zero = ((1 << dststep) & dstmask) == 0
+
+        # store new dststep and pred_dst_zero
+        self.new_dststep = dststep
+        self.pred_dst_zero = pred_dst_zero
+        log("    new dststep", dststep)
+
+
+class ISACaller(ISACallerHelper, ISAFPHelpers, StepLoop):
     # decoder2 - an instance of power_decoder2
     # regfile - a list of initial values for the registers
     # initial_{etc} - initial values for SPRs, Condition Register, Mem, MSR
     # decoder2 - an instance of power_decoder2
     # regfile - a list of initial values for the registers
     # initial_{etc} - initial values for SPRs, Condition Register, Mem, MSR
@@ -575,7 +1040,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if isinstance(initial_svstate, int):
             initial_svstate = SVP64State(initial_svstate)
         # SVSTATE, MSR and PC
         if isinstance(initial_svstate, int):
             initial_svstate = SVP64State(initial_svstate)
         # SVSTATE, MSR and PC
-        self.svstate = initial_svstate
+        StepLoop.__init__(self, initial_svstate)
         self.msr = SelectableInt(initial_msr, 64)  # underlying reg
         self.pc = PC()
         # GPR FPR SPR registers
         self.msr = SelectableInt(initial_msr, 64)  # underlying reg
         self.pc = PC()
         # GPR FPR SPR registers
@@ -587,10 +1052,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         # set up 4 dummy SVSHAPEs if they aren't already set up
         for i in range(4):
             sname = 'SVSHAPE%d' % i
         # set up 4 dummy SVSHAPEs if they aren't already set up
         for i in range(4):
             sname = 'SVSHAPE%d' % i
-            if sname not in self.spr:
-                val = 0
-            else:
-                val = self.spr[sname].value
+            val = self.spr.get(sname, 0)
             # make sure it's an SVSHAPE
             self.spr[sname] = SVSHAPE(val, self.gpr)
         self.last_op_svshape = False
             # make sure it's an SVSHAPE
             self.spr[sname] = SVSHAPE(val, self.gpr)
         self.last_op_svshape = False
@@ -690,7 +1152,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         self.spr['SRR1'].value = msr
         if self.is_svp64_mode:
             self.spr['SVSRR0'] = self.namespace['SVSTATE'].value
         self.spr['SRR1'].value = msr
         if self.is_svp64_mode:
             self.spr['SVSRR0'] = self.namespace['SVSTATE'].value
-        self.trap_nia = SelectableInt(trap_addr | (kaivb&~0x1fff), 64)
+        self.trap_nia = SelectableInt(trap_addr | (kaivb & ~0x1fff), 64)
         self.spr['SRR1'][trap_bit] = 1  # change *copy* of MSR in SRR1
 
         # set exception bits.  TODO: this should, based on the address
         self.spr['SRR1'][trap_bit] = 1  # change *copy* of MSR in SRR1
 
         # set exception bits.  TODO: this should, based on the address
@@ -725,7 +1187,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         # then "yield" fields only from op_fields rather than hard-coded
         # list, here.
         fields = self.decoder.sigforms[formname]
         # then "yield" fields only from op_fields rather than hard-coded
         # list, here.
         fields = self.decoder.sigforms[formname]
-        log("prep_namespace", formname, op_fields)
+        log("prep_namespace", formname, op_fields, insn_name)
         for name in op_fields:
             # CR immediates. deal with separately.  needs modifying
             # pseudocode
         for name in op_fields:
             # CR immediates. deal with separately.  needs modifying
             # pseudocode
@@ -738,10 +1200,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
                 assert regnum <= 7, "sigh, TODO, 128 CR fields"
                 val = (val & 0b11) | (regnum << 2)
             else:
                 assert regnum <= 7, "sigh, TODO, 128 CR fields"
                 val = (val & 0b11) | (regnum << 2)
             else:
-                if name == 'spr':
-                    sig = getattr(fields, name.upper())
-                else:
-                    sig = getattr(fields, name)
+                sig = getattr(fields, name)
                 val = yield sig
             # these are all opcode fields involved in index-selection of CR,
             # and need to do "standard" arithmetic.  CR[BA+32] for example
                 val = yield sig
             # these are all opcode fields involved in index-selection of CR,
             # and need to do "standard" arithmetic.  CR[BA+32] for example
@@ -765,13 +1224,15 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if self.is_svp64_mode and insn_name.startswith("sv.bc"):
             # blegh grab bits manually
             mode = yield self.dec2.rm_dec.rm_in.mode
         if self.is_svp64_mode and insn_name.startswith("sv.bc"):
             # blegh grab bits manually
             mode = yield self.dec2.rm_dec.rm_in.mode
-            bc_vlset = (mode & SVP64MODE.BC_VLSET) != 0
-            bc_vli = (mode & SVP64MODE.BC_VLI) != 0
-            bc_snz = (mode & SVP64MODE.BC_SNZ) != 0
+            mode = SelectableInt(mode, 5) # convert to SelectableInt before test
+            bc_vlset = mode[SVP64MODEb.BC_VLSET] != 0
+            bc_vli = mode[SVP64MODEb.BC_VLI] != 0
+            bc_snz = mode[SVP64MODEb.BC_SNZ] != 0
             bc_vsb = yield self.dec2.rm_dec.bc_vsb
             bc_lru = yield self.dec2.rm_dec.bc_lru
             bc_gate = yield self.dec2.rm_dec.bc_gate
             sz = yield self.dec2.rm_dec.pred_sz
             bc_vsb = yield self.dec2.rm_dec.bc_vsb
             bc_lru = yield self.dec2.rm_dec.bc_lru
             bc_gate = yield self.dec2.rm_dec.bc_gate
             sz = yield self.dec2.rm_dec.pred_sz
+            self.namespace['mode'] = SelectableInt(mode, 5)
             self.namespace['ALL'] = SelectableInt(bc_gate, 1)
             self.namespace['VSb'] = SelectableInt(bc_vsb, 1)
             self.namespace['LRu'] = SelectableInt(bc_lru, 1)
             self.namespace['ALL'] = SelectableInt(bc_gate, 1)
             self.namespace['VSb'] = SelectableInt(bc_vsb, 1)
             self.namespace['LRu'] = SelectableInt(bc_lru, 1)
@@ -780,7 +1241,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             self.namespace['sz'] = SelectableInt(sz, 1)
             self.namespace['SNZ'] = SelectableInt(bc_snz, 1)
 
             self.namespace['sz'] = SelectableInt(sz, 1)
             self.namespace['SNZ'] = SelectableInt(bc_snz, 1)
 
-    def handle_carry_(self, inputs, outputs, already_done):
+    def handle_carry_(self, inputs, output, ca, ca32):
         inv_a = yield self.dec2.e.do.invert_in
         if inv_a:
             inputs[0] = ~inputs[0]
         inv_a = yield self.dec2.e.do.invert_in
         if inv_a:
             inputs[0] = ~inputs[0]
@@ -789,12 +1250,6 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if imm_ok:
             imm = yield self.dec2.e.do.imm_data.data
             inputs.append(SelectableInt(imm, 64))
         if imm_ok:
             imm = yield self.dec2.e.do.imm_data.data
             inputs.append(SelectableInt(imm, 64))
-        assert len(outputs) >= 1
-        log("outputs", repr(outputs))
-        if isinstance(outputs, list) or isinstance(outputs, tuple):
-            output = outputs[0]
-        else:
-            output = outputs
         gts = []
         for x in inputs:
             log("gt input", x, output)
         gts = []
         for x in inputs:
             log("gt input", x, output)
@@ -803,10 +1258,9 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         log(gts)
         cy = 1 if any(gts) else 0
         log("CA", cy, gts)
         log(gts)
         cy = 1 if any(gts) else 0
         log("CA", cy, gts)
-        if not (1 & already_done):
+        if ca is None: # already written
             self.spr['XER'][XER_bits['CA']] = cy
 
             self.spr['XER'][XER_bits['CA']] = cy
 
-        log("inputs", already_done, inputs)
         # 32 bit carry
         # ARGH... different for OP_ADD... *sigh*...
         op = yield self.dec2.e.do.insn_type
         # 32 bit carry
         # ARGH... different for OP_ADD... *sigh*...
         op = yield self.dec2.e.do.insn_type
@@ -829,10 +1283,10 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
                 gts.append(gt)
             cy32 = 1 if any(gts) else 0
             log("CA32", cy32, gts)
                 gts.append(gt)
             cy32 = 1 if any(gts) else 0
             log("CA32", cy32, gts)
-        if not (2 & already_done):
+        if ca32 is None: # already written
             self.spr['XER'][XER_bits['CA32']] = cy32
 
             self.spr['XER'][XER_bits['CA32']] = cy32
 
-    def handle_overflow(self, inputs, outputs, div_overflow):
+    def handle_overflow(self, inputs, output, div_overflow):
         if hasattr(self.dec2.e.do, "invert_in"):
             inv_a = yield self.dec2.e.do.invert_in
             if inv_a:
         if hasattr(self.dec2.e.do, "invert_in"):
             inv_a = yield self.dec2.e.do.invert_in
             if inv_a:
@@ -842,8 +1296,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if imm_ok:
             imm = yield self.dec2.e.do.imm_data.data
             inputs.append(SelectableInt(imm, 64))
         if imm_ok:
             imm = yield self.dec2.e.do.imm_data.data
             inputs.append(SelectableInt(imm, 64))
-        assert len(outputs) >= 1
-        log("handle_overflow", inputs, outputs, div_overflow)
+        log("handle_overflow", inputs, output, div_overflow)
         if len(inputs) < 2 and div_overflow is None:
             return
 
         if len(inputs) < 2 and div_overflow is None:
             return
 
@@ -853,8 +1306,6 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             ov, ov32 = div_overflow, div_overflow
         # arithmetic overflow can be done by analysing the input and output
         elif len(inputs) >= 2:
             ov, ov32 = div_overflow, div_overflow
         # arithmetic overflow can be done by analysing the input and output
         elif len(inputs) >= 2:
-            output = outputs[0]
-
             # OV (64-bit)
             input_sgn = [exts(x.value, x.bits) < 0 for x in inputs]
             output_sgn = exts(output.value, output.bits) < 0
             # OV (64-bit)
             input_sgn = [exts(x.value, x.bits) < 0 for x in inputs]
             output_sgn = exts(output.value, output.bits) < 0
@@ -869,14 +1320,13 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
 
         # now update XER OV/OV32/SO
         so = self.spr['XER'][XER_bits['SO']]
 
         # now update XER OV/OV32/SO
         so = self.spr['XER'][XER_bits['SO']]
-        new_so = so | ov # sticky overflow ORs in old with new
+        new_so = so | ov  # sticky overflow ORs in old with new
         self.spr['XER'][XER_bits['OV']] = ov
         self.spr['XER'][XER_bits['OV32']] = ov32
         self.spr['XER'][XER_bits['SO']] = new_so
         log("    set overflow", ov, ov32, so, new_so)
 
         self.spr['XER'][XER_bits['OV']] = ov
         self.spr['XER'][XER_bits['OV32']] = ov32
         self.spr['XER'][XER_bits['SO']] = new_so
         log("    set overflow", ov, ov32, so, new_so)
 
-    def handle_comparison(self, outputs, cr_idx=0, overflow=None, no_so=False):
-        out = outputs[0]
+    def handle_comparison(self, out, cr_idx=0, overflow=None, no_so=False):
         assert isinstance(out, SelectableInt), \
             "out zero not a SelectableInt %s" % repr(outputs)
         log("handle_comparison", out.bits, hex(out.value))
         assert isinstance(out, SelectableInt), \
             "out zero not a SelectableInt %s" % repr(outputs)
         log("handle_comparison", out.bits, hex(out.value))
@@ -947,8 +1397,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         opcode = yield self.dec2.dec.opcode_in
         opcode = SelectableInt(value=opcode, bits=32)
         pfx = SVP64Instruction.Prefix(opcode)
         opcode = yield self.dec2.dec.opcode_in
         opcode = SelectableInt(value=opcode, bits=32)
         pfx = SVP64Instruction.Prefix(opcode)
-        log("prefix test: opcode:", pfx.PO, bin(pfx.PO), pfx.id)
-        self.is_svp64_mode = bool((pfx.PO == 0b000001) and (pfx.id == 0b11))
+        log("prefix test: opcode:", pfx.po, bin(pfx.po), pfx.id)
+        self.is_svp64_mode = bool((pfx.po == 0b000001) and (pfx.id == 0b11))
         self.pc.update_nia(self.is_svp64_mode)
         # set SVP64 decode
         yield self.dec2.is_svp64_mode.eq(self.is_svp64_mode)
         self.pc.update_nia(self.is_svp64_mode)
         # set SVP64 decode
         yield self.dec2.is_svp64_mode.eq(self.is_svp64_mode)
@@ -971,18 +1421,14 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         """execute one instruction
         """
         # get the disassembly code for this instruction
         """execute one instruction
         """
         # get the disassembly code for this instruction
-        if self.is_svp64_mode:
-            if not self.disassembly:
-                code = yield from self.get_assembly_name()
-            else:
-                code = self.disassembly[self._pc+4]
-            log("    svp64 sim-execute", hex(self._pc), code)
+        if not self.disassembly:
+            code = yield from self.get_assembly_name()
         else:
         else:
-            if not self.disassembly:
-                code = yield from self.get_assembly_name()
-            else:
-                code = self.disassembly[self._pc]
-            log("sim-execute", hex(self._pc), code)
+            offs, dbg = 0, ""
+            if self.is_svp64_mode:
+               offs, dbg = 4, "svp64 "
+            code = self.disassembly[self._pc+offs]
+            log("    %s sim-execute" % dbg, hex(self._pc), code)
         opname = code.split(' ')[0]
         try:
             yield from self.call(opname)         # execute the instruction
         opname = code.split(' ')[0]
         try:
             yield from self.call(opname)         # execute the instruction
@@ -1045,6 +1491,11 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         else:
             rc_en = False
             rc_ok = False
         else:
             rc_en = False
             rc_ok = False
+        # annoying: ignore rc_ok if RC1 is set (for creating *assembly name*)
+        RC1 = yield self.dec2.rm_dec.RC1
+        if RC1:
+            rc_en = False
+            rc_ok = False
         # grrrr have to special-case MUL op (see DecodeOE)
         log("ov %d en %d rc %d en %d op %d" %
             (ov_ok, ov_en, rc_ok, rc_en, int_op))
         # grrrr have to special-case MUL op (see DecodeOE)
         log("ov %d en %d rc %d en %d op %d" %
             (ov_ok, ov_en, rc_ok, rc_en, int_op))
@@ -1081,6 +1532,10 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
                 asmop = 'mtcrf'
         return asmop
 
                 asmop = 'mtcrf'
         return asmop
 
+    def reset_remaps(self):
+        self.remap_loopends = [0] * 4
+        self.remap_idxs = [0, 1, 2, 3]
+
     def get_remap_indices(self):
         """WARNING, this function stores remap_idxs and remap_loopends
         in the class for later use.  this to avoid problems with yield
     def get_remap_indices(self):
         """WARNING, this function stores remap_idxs and remap_loopends
         in the class for later use.  this to avoid problems with yield
@@ -1088,6 +1543,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         # go through all iterators in lock-step, advance to next remap_idx
         srcstep, dststep, ssubstep, dsubstep = self.get_src_dststeps()
         # get four SVSHAPEs. here we are hard-coding
         # go through all iterators in lock-step, advance to next remap_idx
         srcstep, dststep, ssubstep, dsubstep = self.get_src_dststeps()
         # get four SVSHAPEs. here we are hard-coding
+        self.reset_remaps()
         SVSHAPE0 = self.spr['SVSHAPE0']
         SVSHAPE1 = self.spr['SVSHAPE1']
         SVSHAPE2 = self.spr['SVSHAPE2']
         SVSHAPE0 = self.spr['SVSHAPE0']
         SVSHAPE1 = self.spr['SVSHAPE1']
         SVSHAPE2 = self.spr['SVSHAPE2']
@@ -1099,8 +1555,6 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
                   (SVSHAPE3, SVSHAPE3.get_iterator()),
                   ]
 
                   (SVSHAPE3, SVSHAPE3.get_iterator()),
                   ]
 
-        self.remap_loopends = [0] * 4
-        self.remap_idxs = [0, 1, 2, 3]
         dbg = []
         for i, (shape, remap) in enumerate(remaps):
             # zero is "disabled"
         dbg = []
         for i, (shape, remap) in enumerate(remaps):
             # zero is "disabled"
@@ -1139,6 +1593,9 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         asmop = yield from self.get_assembly_name()
         log("call", ins_name, asmop)
 
         asmop = yield from self.get_assembly_name()
         log("call", ins_name, asmop)
 
+        # sv.setvl is *not* a loop-function. sigh
+        log("is_svp64_mode", self.is_svp64_mode, asmop)
+
         # check privileged
         int_op = yield self.dec2.dec.op.internal_op
         spr_msb = yield from self.get_spr_msb()
         # check privileged
         int_op = yield self.dec2.dec.op.internal_op
         spr_msb = yield from self.get_spr_msb()
@@ -1174,14 +1631,16 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
 
         # list of instructions not being supported by binutils (.long)
         dotstrp = asmop[:-1] if asmop[-1] == '.' else asmop
 
         # list of instructions not being supported by binutils (.long)
         dotstrp = asmop[:-1] if asmop[-1] == '.' else asmop
-        if dotstrp in [ 'fsins', 'fcoss',
-                    'ffmadds', 'fdmadds', 'ffadds',
-                     'mins', 'maxs', 'minu', 'maxu',
-                    'setvl', 'svindex', 'svremap', 'svstep', 'svshape',
-                    'grev', 'ternlogi', 'bmask', 'cprop',
-                    'absdu', 'absds', 'absdacs', 'absdacu', 'avgadd',
-                    'fmvis', 'fishmv',
-                    ]:
+        if dotstrp in [*FPTRANS_INSNS,
+                       'ffmadds', 'fdmadds', 'ffadds',
+                       'mins', 'maxs', 'minu', 'maxu',
+                       'setvl', 'svindex', 'svremap', 'svstep',
+                       'svshape', 'svshape2',
+                       'grev', 'ternlogi', 'bmask', 'cprop',
+                       'absdu', 'absds', 'absdacs', 'absdacu', 'avgadd',
+                       'fmvis', 'fishmv', 'pcdec', "maddedu", "divmod2du",
+                       "dsld", "dsrd",
+                       ]:
             illegal = False
             ins_name = dotstrp
 
             illegal = False
             ins_name = dotstrp
 
@@ -1212,7 +1671,10 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             return
 
         # look up instruction in ISA.instrs, prepare namespace
             return
 
         # look up instruction in ISA.instrs, prepare namespace
-        info = self.instrs[ins_name]
+        if ins_name == 'pcdec': # grrrr yes there are others ("stbcx." etc.)
+            info = self.instrs[ins_name+"."]
+        else:
+            info = self.instrs[ins_name]
         yield from self.prep_namespace(ins_name, info.form, info.op_fields)
 
         # preserve order of register names
         yield from self.prep_namespace(ins_name, info.form, info.op_fields)
 
         # preserve order of register names
@@ -1229,8 +1691,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         log("sv rm", sv_rm, dest_cr, src_cr, src_byname, dest_byname)
 
         # see if srcstep/dststep need skipping over masked-out predicate bits
         log("sv rm", sv_rm, dest_cr, src_cr, src_byname, dest_byname)
 
         # see if srcstep/dststep need skipping over masked-out predicate bits
-        if (self.is_svp64_mode or ins_name == 'setvl' or
-                ins_name in ['svremap', 'svstate']):
+        self.reset_remaps()
+        if (self.is_svp64_mode or ins_name in ['setvl', 'svremap', 'svstate']):
             yield from self.svstate_pre_inc()
         if self.is_svp64_mode:
             pre = yield from self.update_new_svstate_steps()
             yield from self.svstate_pre_inc()
         if self.is_svp64_mode:
             pre = yield from self.update_new_svstate_steps()
@@ -1263,7 +1725,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if persist or self.last_op_svshape:
             remaps = self.get_remap_indices()
         if self.is_svp64_mode and (persist or self.last_op_svshape):
         if persist or self.last_op_svshape:
             remaps = self.get_remap_indices()
         if self.is_svp64_mode and (persist or self.last_op_svshape):
-            yield from self.remap_debug(remaps)
+            yield from self.remap_set_steps(remaps)
         # after that, settle down (combinatorial) to let Vector reg numbers
         # work themselves out
         yield Settle()
         # after that, settle down (combinatorial) to let Vector reg numbers
         # work themselves out
         yield Settle()
@@ -1318,7 +1780,11 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         # execute actual instruction here (finally)
         log("inputs", inputs)
         results = info.func(self, *inputs)
         # execute actual instruction here (finally)
         log("inputs", inputs)
         results = info.func(self, *inputs)
-        log("results", results)
+        output_names = create_args(info.write_regs)
+        outs = {}
+        for out, n in zip(results or [], output_names):
+            outs[n] = out
+        log("results", outs)
 
         # "inject" decorator takes namespace from function locals: we need to
         # overwrite NIA being overwritten (sigh)
 
         # "inject" decorator takes namespace from function locals: we need to
         # overwrite NIA being overwritten (sigh)
@@ -1339,65 +1805,106 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             self.last_st_addr, self.last_ld_addr)
 
         # detect if CA/CA32 already in outputs (sra*, basically)
             self.last_st_addr, self.last_ld_addr)
 
         # detect if CA/CA32 already in outputs (sra*, basically)
-        already_done = 0
-        if info.write_regs:
-            output_names = create_args(info.write_regs)
-            for name in output_names:
-                if name == 'CA':
-                    already_done |= 1
-                if name == 'CA32':
-                    already_done |= 2
-
-        log("carry already done?", bin(already_done))
-        if hasattr(self.dec2.e.do, "output_carry"):
-            carry_en = yield self.dec2.e.do.output_carry
-        else:
-            carry_en = False
+        ca = outs.get("CA")
+        ca32 = outs.get("CA32 ")
+
+        log("carry already done?", ca, ca32, output_names)
+        carry_en = yield self.dec2.e.do.output_carry
         if carry_en:
         if carry_en:
-            yield from self.handle_carry_(inputs, results, already_done)
+            yield from self.handle_carry_(inputs, results[0], ca, ca32)
 
         # check if one of the regs was named "overflow"
 
         # check if one of the regs was named "overflow"
-        overflow = None
-        if info.write_regs:
-            for name, output in zip(output_names, results):
-                if name == 'overflow':
-                    overflow = output
+        overflow = outs.get('overflow')
+        # and one called CR0
+        cr0 = outs.get('CR0')
 
         if not self.is_svp64_mode:  # yeah just no. not in parallel processing
             # detect if overflow was in return result
 
         if not self.is_svp64_mode:  # yeah just no. not in parallel processing
             # detect if overflow was in return result
-            if hasattr(self.dec2.e.do, "oe"):
-                ov_en = yield self.dec2.e.do.oe.oe
-                ov_ok = yield self.dec2.e.do.oe.ok
-            else:
-                ov_en = False
-                ov_ok = False
+            ov_en = yield self.dec2.e.do.oe.oe
+            ov_ok = yield self.dec2.e.do.oe.ok
             log("internal overflow", ins_name, overflow, "en?", ov_en, ov_ok)
             if ov_en & ov_ok:
             log("internal overflow", ins_name, overflow, "en?", ov_en, ov_ok)
             if ov_en & ov_ok:
-                yield from self.handle_overflow(inputs, results, overflow)
+                yield from self.handle_overflow(inputs, results[0], overflow)
 
         # only do SVP64 dest predicated Rc=1 if dest-pred is not enabled
         rc_en = False
         if not self.is_svp64_mode or not pred_dst_zero:
             if hasattr(self.dec2.e.do, "rc"):
                 rc_en = yield self.dec2.e.do.rc.rc
 
         # only do SVP64 dest predicated Rc=1 if dest-pred is not enabled
         rc_en = False
         if not self.is_svp64_mode or not pred_dst_zero:
             if hasattr(self.dec2.e.do, "rc"):
                 rc_en = yield self.dec2.e.do.rc.rc
+        # don't do Rc=1 for svstep: it is handled explicitly.
+        # XXX TODO: now that CR0 is supported, sort out svstep's pseudocode
+        # to write directly to CR0 instead of in ISACaller. hooyahh.
         if rc_en and ins_name not in ['svstep']:
         if rc_en and ins_name not in ['svstep']:
-            regnum, is_vec = yield from get_pdecode_cr_out(self.dec2, "CR0")
-            cmps = results
-            # hang on... for `setvl` actually you want to test SVSTATE.VL
-            is_setvl = ins_name == 'setvl'
-            if is_setvl:
-                vl = results[0].vl
-                cmps = (SelectableInt(vl, 64), overflow,)
-            else:
-                overflow = None # do not override overflow except in setvl
-            self.handle_comparison(cmps, regnum, overflow, no_so=is_setvl)
+            yield from self.do_rc_ov(ins_name, results[0], overflow, cr0)
+
+        # check failfirst
+        ffirst_hit = False
+        if self.is_svp64_mode:
+            ffirst_hit = (yield from self.check_ffirst(rc_en, srcstep))
 
         # any modified return results?
 
         # any modified return results?
-        if info.write_regs:
-            for name, output in zip(output_names, results):
-                yield from self.check_write(info, name, output, carry_en)
+        yield from self.do_outregs_nia(asmop, ins_name, info, outs,
+                                       carry_en, rc_en, ffirst_hit)
 
 
-        nia_update = (yield from self.check_step_increment(results, rc_en,
+    def check_ffirst(self, rc_en, srcstep):
+        rm_mode = yield self.dec2.rm_dec.mode
+        ff_inv = yield self.dec2.rm_dec.inv
+        cr_bit = yield self.dec2.rm_dec.cr_sel
+        RC1 = yield self.dec2.rm_dec.RC1
+        vli = yield self.dec2.rm_dec.vli # VL inclusive if truncated
+        log(" ff rm_mode", rc_en, rm_mode, SVP64RMMode.FFIRST.value)
+        log("        inv", ff_inv)
+        log("        RC1", RC1)
+        log("        vli", vli)
+        log("     cr_bit", cr_bit)
+        if not rc_en or rm_mode != SVP64RMMode.FFIRST.value:
+            return False
+        regnum, is_vec = yield from get_pdecode_cr_out(self.dec2, "CR0")
+        crtest = self.crl[regnum]
+        ffirst_hit = crtest[cr_bit] != ff_inv
+        log("cr test", regnum, int(crtest), crtest, cr_bit, ff_inv)
+        log("cr test?", ffirst_hit)
+        if not ffirst_hit:
+            return False
+        vli = SelectableInt(int(vli), 7)
+        self.svstate.vl = srcstep + vli
+        yield self.dec2.state.svstate.eq(self.svstate.value)
+        yield Settle()  # let decoder update
+        return True
+
+    def do_rc_ov(self, ins_name, result, overflow, cr0):
+        if ins_name.startswith("f"):
+            rc_reg = "CR1"  # not calculated correctly yet (not FP compares)
+        else:
+            rc_reg = "CR0"
+        regnum, is_vec = yield from get_pdecode_cr_out(self.dec2, rc_reg)
+        # hang on... for `setvl` actually you want to test SVSTATE.VL
+        is_setvl = ins_name == 'setvl'
+        if is_setvl:
+            result = SelectableInt(result.vl, 64)
+        else:
+            overflow = None  # do not override overflow except in setvl
+
+        # if there was not an explicit CR0 in the pseudocode, do implicit Rc=1
+        if cr0 is None:
+            self.handle_comparison(result, regnum, overflow, no_so=is_setvl)
+        else:
+            # otherwise we just blat CR0 into the required regnum
+            log("explicit rc0", cr0)
+            self.crl[regnum].eq(cr0)
+
+    def do_outregs_nia(self, asmop, ins_name, info, outs,
+                       carry_en, rc_en, ffirst_hit):
+        # write out any regs for this instruction
+        for name, output in outs.items():
+            yield from self.check_write(info, name, output, carry_en)
+
+        if ffirst_hit:
+            self.svp64_reset_loop()
+            nia_update = True
+        else:
+            # check advancement of src/dst/sub-steps and if PC needs updating
+            nia_update = (yield from self.check_step_increment(rc_en,
                                                            asmop, ins_name))
         if nia_update:
             self.update_pc_next()
                                                            asmop, ins_name))
         if nia_update:
             self.update_pc_next()
@@ -1471,18 +1978,23 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             log('reading reg %s %s' % (name, str(regnum)), is_vec)
             if name in fregs:
                 reg_val = SelectableInt(self.fpr(regnum))
             log('reading reg %s %s' % (name, str(regnum)), is_vec)
             if name in fregs:
                 reg_val = SelectableInt(self.fpr(regnum))
-                log(f"read reg f{regnum}: 0x{reg_val.value:X}",
-                    kind=LogKind.InstrInOuts)
+                log("read reg %d: 0x%x" % (regnum, reg_val.value))
             elif name is not None:
                 reg_val = SelectableInt(self.gpr(regnum))
             elif name is not None:
                 reg_val = SelectableInt(self.gpr(regnum))
-                log(f"read reg r{regnum}: 0x{reg_val.value:X}",
-                    kind=LogKind.InstrInOuts)
+                log("read reg %d: 0x%x" % (regnum, reg_val.value))
         else:
             log('zero input reg %s %s' % (name, str(regnum)), is_vec)
             reg_val = 0
         return reg_val
 
         else:
             log('zero input reg %s %s' % (name, str(regnum)), is_vec)
             reg_val = 0
         return reg_val
 
-    def remap_debug(self, remaps):
+    def remap_set_steps(self, remaps):
+        """remap_set_steps sets up the in1/2/3 and out1/2 steps.
+        they work in concert with PowerDecoder2 at the moment,
+        there is no HDL implementation of REMAP.  therefore this
+        function, because ISACaller still uses PowerDecoder2,
+        will *explicitly* write the dec2.XX_step values. this has
+        to get sorted out.
+        """
         # just some convenient debug info
         for i in range(4):
             sname = 'SVSHAPE%d' % i
         # just some convenient debug info
         for i in range(4):
             sname = 'SVSHAPE%d' % i
@@ -1525,129 +2037,124 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
     def check_write(self, info, name, output, carry_en):
         if name == 'overflow':  # ignore, done already (above)
             return
     def check_write(self, info, name, output, carry_en):
         if name == 'overflow':  # ignore, done already (above)
             return
+        if name == 'CR0':  # ignore, done already (above)
+            return
         if isinstance(output, int):
             output = SelectableInt(output, 256)
         if isinstance(output, int):
             output = SelectableInt(output, 256)
+        # write carry flags
         if name in ['CA', 'CA32']:
             if carry_en:
                 log("writing %s to XER" % name, output)
         if name in ['CA', 'CA32']:
             if carry_en:
                 log("writing %s to XER" % name, output)
-                log(f"write XER field {name}: "
-                    f"0x{output.value % (1 << 64):X}",
-                    kind=LogKind.InstrInOuts)
+                log("write XER %s 0x%x" % (name, output.value))
                 self.spr['XER'][XER_bits[name]] = output.value
             else:
                 log("NOT writing %s to XER" % name, output)
                 self.spr['XER'][XER_bits[name]] = output.value
             else:
                 log("NOT writing %s to XER" % name, output)
-        elif name in info.special_regs:
+            return
+        # write special SPRs
+        if name in info.special_regs:
             log('writing special %s' % name, output, special_sprs)
             log('writing special %s' % name, output, special_sprs)
-            log(f"write reg {name}: "
-                f"0x{output.value % (1 << 64):X}",
-                kind=LogKind.InstrInOuts)
+            log("write reg %s 0x%x" % (name, output.value))
             if name in special_sprs:
                 self.spr[name] = output
             else:
                 self.namespace[name].eq(output)
             if name == 'MSR':
                 log('msr written', hex(self.msr.value))
             if name in special_sprs:
                 self.spr[name] = output
             else:
                 self.namespace[name].eq(output)
             if name == 'MSR':
                 log('msr written', hex(self.msr.value))
+            return
+        # find out1/out2 GPR/FPR
+        regnum, is_vec = yield from get_pdecode_idx_out(self.dec2, name)
+        if regnum is None:
+            regnum, is_vec = yield from get_pdecode_idx_out2(self.dec2, name)
+        if regnum is None:
+            # temporary hack for not having 2nd output
+            regnum = yield getattr(self.decoder, name)
+            is_vec = False
+        # convenient debug prefix
+        if name in fregs:
+            reg_prefix = 'f'
         else:
         else:
-            regnum, is_vec = yield from get_pdecode_idx_out(self.dec2, name)
-            if regnum is None:
-                regnum, is_vec = yield from get_pdecode_idx_out2(
-                    self.dec2, name)
-            if regnum is None:
-                # temporary hack for not having 2nd output
-                regnum = yield getattr(self.decoder, name)
-                is_vec = False
-            if self.is_svp64_mode and self.pred_dst_zero:
-                log('zeroing reg %d %s' % (regnum, str(output)),
-                    is_vec)
-                output = SelectableInt(0, 256)
-            else:
-                if name in fregs:
-                    reg_prefix = 'f'
-                else:
-                    reg_prefix = 'r'
-                log(f"write reg {reg_prefix}{regnum}: "
-                    f"0x{output.value % (1 << 64):X}",
-                    kind=LogKind.InstrInOuts)
-            if output.bits > 64:
-                output = SelectableInt(output.value, 64)
-            if name in fregs:
-                self.fpr[regnum] = output
-            else:
-                self.gpr[regnum] = output
+            reg_prefix = 'r'
+        # check zeroing due to predicate bit being zero
+        if self.is_svp64_mode and self.pred_dst_zero:
+            log('zeroing reg %d %s' % (regnum, str(output)), is_vec)
+            output = SelectableInt(0, 256)
+        log("write reg %s%d 0x%x" % (reg_prefix, regnum, output.value),
+            kind=LogKind.InstrInOuts)
+        # zero-extend to 64 bit before storing (should use EXT oh well)
+        if output.bits > 64:
+            output = SelectableInt(output.value, 64)
+        if name in fregs:
+            self.fpr[regnum] = output
+        else:
+            self.gpr[regnum] = output
 
 
-    def check_step_increment(self, results, rc_en, asmop, ins_name):
+    def check_step_increment(self, rc_en, asmop, ins_name):
         # check if it is the SVSTATE.src/dest step that needs incrementing
         # this is our Sub-Program-Counter loop from 0 to VL-1
         # check if it is the SVSTATE.src/dest step that needs incrementing
         # this is our Sub-Program-Counter loop from 0 to VL-1
+        if not self.allow_next_step_inc:
+            if self.is_svp64_mode:
+                return (yield from self.svstate_post_inc(ins_name))
+
+            # XXX only in non-SVP64 mode!
+            # record state of whether the current operation was an svshape,
+            # OR svindex!
+            # to be able to know if it should apply in the next instruction.
+            # also (if going to use this instruction) should disable ability
+            # to interrupt in between. sigh.
+            self.last_op_svshape = asmop in ['svremap', 'svindex',
+                                             'svshape2']
+            return True
+
         pre = False
         post = False
         nia_update = True
         pre = False
         post = False
         nia_update = True
-        if self.allow_next_step_inc:
-            log("SVSTATE_NEXT: inc requested, mode",
-                self.svstate_next_mode, self.allow_next_step_inc)
-            yield from self.svstate_pre_inc()
-            pre = yield from self.update_new_svstate_steps()
-            if pre:
-                # reset at end of loop including exit Vertical Mode
-                log("SVSTATE_NEXT: end of loop, reset")
-                self.svp64_reset_loop()
-                self.svstate.vfirst = 0
-                self.update_nia()
-                if not rc_en:
-                    return True
-                results = [SelectableInt(0, 64)]
-                self.handle_comparison(results)  # CR0
+        log("SVSTATE_NEXT: inc requested, mode",
+            self.svstate_next_mode, self.allow_next_step_inc)
+        yield from self.svstate_pre_inc()
+        pre = yield from self.update_new_svstate_steps()
+        if pre:
+            # reset at end of loop including exit Vertical Mode
+            log("SVSTATE_NEXT: end of loop, reset")
+            self.svp64_reset_loop()
+            self.svstate.vfirst = 0
+            self.update_nia()
+            if not rc_en:
                 return True
                 return True
-            if self.allow_next_step_inc == 2:
-                log("SVSTATE_NEXT: read")
-                nia_update = (yield from self.svstate_post_inc(ins_name))
-            else:
-                log("SVSTATE_NEXT: post-inc")
-            # use actual src/dst-step here to check end, do NOT
-            # use bit-reversed version
-            srcstep, dststep = self.new_srcstep, self.new_dststep
-            ssubstep, dsubstep = self.new_ssubstep, self.new_dsubstep
-            remaps = self.get_remap_indices()
-            remap_idxs = self.remap_idxs
-            vl = self.svstate.vl
-            subvl = yield self.dec2.rm_dec.rm_in.subvl
-            end_src = srcstep == vl-1
-            end_dst = dststep == vl-1
-            if self.allow_next_step_inc != 2:
-                yield from self.advance_svstate_steps(end_src, end_dst)
-            self.namespace['SVSTATE'] = self.svstate.spr
-            # set CR0 (if Rc=1) based on end
-            if rc_en:
-                endtest = 1 if (end_src or end_dst) else 0
-                #results = [SelectableInt(endtest, 64)]
-                # self.handle_comparison(results) # CR0
-
-                # see if svstep was requested, if so, which SVSTATE
-                endings = 0b111
-                if self.svstate_next_mode > 0:
-                    shape_idx = self.svstate_next_mode.value-1
-                    endings = self.remap_loopends[shape_idx]
-                cr_field = SelectableInt((~endings) << 1 | endtest, 4)
-                log("svstep Rc=1, CR0", cr_field)
-                self.crl[0].eq(cr_field)  # CR0
-            if end_src or end_dst:
-                # reset at end of loop including exit Vertical Mode
-                log("SVSTATE_NEXT: after increments, reset")
-                self.svp64_reset_loop()
-                self.svstate.vfirst = 0
-            return nia_update
-
-        if self.is_svp64_mode:
-            return (yield from self.svstate_post_inc(ins_name))
-
-        # XXX only in non-SVP64 mode!
-        # record state of whether the current operation was an svshape,
-        # OR svindex!
-        # to be able to know if it should apply in the next instruction.
-        # also (if going to use this instruction) should disable ability
-        # to interrupt in between. sigh.
-        self.last_op_svshape = asmop in ['svremap', 'svindex']
-
-        return True
+            self.handle_comparison(SelectableInt(0, 64))  # CR0
+            return True
+        if self.allow_next_step_inc == 2:
+            log("SVSTATE_NEXT: read")
+            nia_update = (yield from self.svstate_post_inc(ins_name))
+        else:
+            log("SVSTATE_NEXT: post-inc")
+        # use actual (cached) src/dst-step here to check end
+        remaps = self.get_remap_indices()
+        remap_idxs = self.remap_idxs
+        vl = self.svstate.vl
+        subvl = yield self.dec2.rm_dec.rm_in.subvl
+        if self.allow_next_step_inc != 2:
+            yield from self.advance_svstate_steps()
+        #self.namespace['SVSTATE'] = self.svstate.spr
+        # set CR0 (if Rc=1) based on end
+        endtest = 1 if self.at_loopend() else 0
+        if rc_en:
+            #results = [SelectableInt(endtest, 64)]
+            # self.handle_comparison(results) # CR0
+
+            # see if svstep was requested, if so, which SVSTATE
+            endings = 0b111
+            if self.svstate_next_mode > 0:
+                shape_idx = self.svstate_next_mode.value-1
+                endings = self.remap_loopends[shape_idx]
+            cr_field = SelectableInt((~endings) << 1 | endtest, 4)
+            log("svstep Rc=1, CR0", cr_field, endtest)
+            self.crl[0].eq(cr_field)  # CR0
+        if endtest:
+            # reset at end of loop including exit Vertical Mode
+            log("SVSTATE_NEXT: after increments, reset")
+            self.svp64_reset_loop()
+            self.svstate.vfirst = 0
+        return nia_update
 
     def SVSTATE_NEXT(self, mode, submode):
         """explicitly moves srcstep/dststep on to next element, for
 
     def SVSTATE_NEXT(self, mode, submode):
         """explicitly moves srcstep/dststep on to next element, for
@@ -1670,132 +2177,66 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if self.svstate_next_mode == 6:
             self.svstate_next_mode = 0
             return SelectableInt(self.svstate.dststep, 7)
         if self.svstate_next_mode == 6:
             self.svstate_next_mode = 0
             return SelectableInt(self.svstate.dststep, 7)
+        if self.svstate_next_mode == 7:
+            self.svstate_next_mode = 0
+            return SelectableInt(self.svstate.ssubstep, 7)
+        if self.svstate_next_mode == 8:
+            self.svstate_next_mode = 0
+            return SelectableInt(self.svstate.dsubstep, 7)
         return SelectableInt(0, 7)
 
         return SelectableInt(0, 7)
 
-    def svstate_pre_inc(self):
-        """check if srcstep/dststep need to skip over masked-out predicate bits
-        note that this is not supposed to do anything to substep,
-        it is purely for skipping masked-out bits
-        """
-        # get SVSTATE VL (oh and print out some debug stuff)
-        vl = self.svstate.vl
-        subvl = yield self.dec2.rm_dec.rm_in.subvl
-        srcstep = self.svstate.srcstep
-        dststep = self.svstate.dststep
-        ssubstep = self.svstate.ssubstep
-        dsubstep = self.svstate.dsubstep
-        sv_a_nz = yield self.dec2.sv_a_nz
-        fft_mode = yield self.dec2.use_svp64_fft
-        in1 = yield self.dec2.e.read_reg1.data
-        log("SVP64: VL, subvl, srcstep, dststep, ssubstep, dsybstep, sv_a_nz, "
-            "in1 fft, svp64",
-            vl, subvl, srcstep, dststep, ssubstep, dsubstep,
-            sv_a_nz, in1, fft_mode,
-            self.is_svp64_mode)
-
-        # get predicate mask (all 64 bits)
-        srcmask = dstmask = 0xffff_ffff_ffff_ffff
-
-        pmode = yield self.dec2.rm_dec.predmode
-        pack = yield self.dec2.rm_dec.pack
-        unpack = yield self.dec2.rm_dec.unpack
-        reverse_gear = yield self.dec2.rm_dec.reverse_gear
-        sv_ptype = yield self.dec2.dec.op.SV_Ptype
-        srcpred = yield self.dec2.rm_dec.srcpred
-        dstpred = yield self.dec2.rm_dec.dstpred
-        pred_src_zero = yield self.dec2.rm_dec.pred_sz
-        pred_dst_zero = yield self.dec2.rm_dec.pred_dz
-        if pmode == SVP64PredMode.INT.value:
-            srcmask = dstmask = get_predint(self.gpr, dstpred)
-            if sv_ptype == SVPtype.P2.value:
-                srcmask = get_predint(self.gpr, srcpred)
-        elif pmode == SVP64PredMode.CR.value:
-            srcmask = dstmask = get_predcr(self.crl, dstpred, vl)
-            if sv_ptype == SVPtype.P2.value:
-                srcmask = get_predcr(self.crl, srcpred, vl)
-        # work out if the ssubsteps are completed
-        ssubstart = ssubstep == 0
-        dsubstart = dsubstep == 0
-        log("    pmode", pmode)
-        log("    pack/unpack", pack, unpack)
-        log("    reverse", reverse_gear)
-        log("    ptype", sv_ptype)
-        log("    srcpred", bin(srcpred))
-        log("    dstpred", bin(dstpred))
-        log("    srcmask", bin(srcmask))
-        log("    dstmask", bin(dstmask))
-        log("    pred_sz", bin(pred_src_zero))
-        log("    pred_dz", bin(pred_dst_zero))
-        log("    ssubstart", ssubstart)
-        log("    dsubstart", dsubstart)
-
-        # okaaay, so here we simply advance srcstep (TODO dststep)
-        # this can ONLY be done at the beginning of the "for" loop
-        # (this is all actually a FSM so it's hell to keep track sigh)
-        if ssubstart:
-            # until the predicate mask has a "1" bit... or we run out of VL
-            # let srcstep==VL be the indicator to move to next instruction
-            if not pred_src_zero:
-                while (((1 << srcstep) & srcmask) == 0) and (srcstep != vl):
-                    log("      sskip", bin(1 << srcstep))
-                    srcstep += 1
-        if dsubstart:
-            # same for dststep
-            if not pred_dst_zero:
-                while (((1 << dststep) & dstmask) == 0) and (dststep != vl):
-                    log("      dskip", bin(1 << dststep))
-                    dststep += 1
-
-        # now work out if the relevant mask bits require zeroing
-        if pred_dst_zero:
-            pred_dst_zero = ((1 << dststep) & dstmask) == 0
-        if pred_src_zero:
-            pred_src_zero = ((1 << srcstep) & srcmask) == 0
-
-        # store new srcstep / dststep
-        self.new_srcstep, self.new_dststep = (srcstep, dststep)
-        self.new_ssubstep, self.new_dsubstep = (ssubstep, dsubstep)
-        self.pred_dst_zero, self.pred_src_zero = (pred_dst_zero, pred_src_zero)
-        log("    new srcstep", srcstep)
-        log("    new dststep", dststep)
-        log("    new ssubstep", ssubstep)
-        log("    new dsubstep", dsubstep)
-
     def get_src_dststeps(self):
         """gets srcstep, dststep, and ssubstep, dsubstep
         """
         return (self.new_srcstep, self.new_dststep,
                 self.new_ssubstep, self.new_dsubstep)
 
     def get_src_dststeps(self):
         """gets srcstep, dststep, and ssubstep, dsubstep
         """
         return (self.new_srcstep, self.new_dststep,
                 self.new_ssubstep, self.new_dsubstep)
 
-    def update_new_svstate_steps(self):
-        # note, do not get the bit-reversed srcstep here!
-        srcstep, dststep = self.new_srcstep, self.new_dststep
-        ssubstep, dsubstep = self.new_ssubstep, self.new_dsubstep
+    def update_svstate_namespace(self, overwrite_svstate=True):
+        if overwrite_svstate:
+            # note, do not get the bit-reversed srcstep here!
+            srcstep, dststep = self.new_srcstep, self.new_dststep
+            ssubstep, dsubstep = self.new_ssubstep, self.new_dsubstep
 
 
-        # update SVSTATE with new srcstep
-        self.svstate.srcstep = srcstep
-        self.svstate.dststep = dststep
-        self.svstate.ssubstep = ssubstep
-        self.svstate.dsubstep = dsubstep
+            # update SVSTATE with new srcstep
+            self.svstate.srcstep = srcstep
+            self.svstate.dststep = dststep
+            self.svstate.ssubstep = ssubstep
+            self.svstate.dsubstep = dsubstep
         self.namespace['SVSTATE'] = self.svstate
         yield self.dec2.state.svstate.eq(self.svstate.value)
         yield Settle()  # let decoder update
         self.namespace['SVSTATE'] = self.svstate
         yield self.dec2.state.svstate.eq(self.svstate.value)
         yield Settle()  # let decoder update
+
+    def update_new_svstate_steps(self, overwrite_svstate=True):
+        yield from self.update_svstate_namespace(overwrite_svstate)
         srcstep = self.svstate.srcstep
         dststep = self.svstate.dststep
         ssubstep = self.svstate.ssubstep
         dsubstep = self.svstate.dsubstep
         srcstep = self.svstate.srcstep
         dststep = self.svstate.dststep
         ssubstep = self.svstate.ssubstep
         dsubstep = self.svstate.dsubstep
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
         vl = self.svstate.vl
         subvl = yield self.dec2.rm_dec.rm_in.subvl
         vl = self.svstate.vl
         subvl = yield self.dec2.rm_dec.rm_in.subvl
+        rm_mode = yield self.dec2.rm_dec.mode
+        ff_inv = yield self.dec2.rm_dec.inv
+        cr_bit = yield self.dec2.rm_dec.cr_sel
         log("    srcstep", srcstep)
         log("    dststep", dststep)
         log("    srcstep", srcstep)
         log("    dststep", dststep)
+        log("        pack", pack)
+        log("      unpack", unpack)
         log("    ssubstep", ssubstep)
         log("    dsubstep", dsubstep)
         log("         vl", vl)
         log("      subvl", subvl)
         log("    ssubstep", ssubstep)
         log("    dsubstep", dsubstep)
         log("         vl", vl)
         log("      subvl", subvl)
+        log("    rm_mode", rm_mode)
+        log("        inv", ff_inv)
+        log("     cr_bit", cr_bit)
 
         # check if end reached (we let srcstep overrun, above)
         # nothing needs doing (TODO zeroing): just do next instruction
 
         # check if end reached (we let srcstep overrun, above)
         # nothing needs doing (TODO zeroing): just do next instruction
-        return srcstep == vl or dststep == vl
+        if self.loopend:
+            return True
+        return ((ssubstep == subvl and srcstep == vl) or
+                (dsubstep == subvl and dststep == vl))
 
     def svstate_post_inc(self, insn_name, vf=0):
         # check if SV "Vertical First" mode is enabled
 
     def svstate_post_inc(self, insn_name, vf=0):
         # check if SV "Vertical First" mode is enabled
@@ -1815,6 +2256,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         dststep = self.svstate.dststep
         ssubstep = self.svstate.ssubstep
         dsubstep = self.svstate.dsubstep
         dststep = self.svstate.dststep
         ssubstep = self.svstate.ssubstep
         dsubstep = self.svstate.dsubstep
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
         rm_mode = yield self.dec2.rm_dec.mode
         reverse_gear = yield self.dec2.rm_dec.reverse_gear
         sv_ptype = yield self.dec2.dec.op.SV_Ptype
         rm_mode = yield self.dec2.rm_dec.mode
         reverse_gear = yield self.dec2.rm_dec.reverse_gear
         sv_ptype = yield self.dec2.dec.op.SV_Ptype
@@ -1827,6 +2270,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         log("    svstate.dststep", dststep)
         log("    svstate.ssubstep", ssubstep)
         log("    svstate.dsubstep", dsubstep)
         log("    svstate.dststep", dststep)
         log("    svstate.ssubstep", ssubstep)
         log("    svstate.dsubstep", dsubstep)
+        log("    svstate.pack", pack)
+        log("    svstate.unpack", unpack)
         log("    mode", rm_mode)
         log("    reverse", reverse_gear)
         log("    out_vec", out_vec)
         log("    mode", rm_mode)
         log("    reverse", reverse_gear)
         log("    out_vec", out_vec)
@@ -1848,8 +2293,9 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         else:
             svp64_is_vector = out_vec
         # loops end at the first "hit" (source or dest)
         else:
             svp64_is_vector = out_vec
         # loops end at the first "hit" (source or dest)
-        loopend = ((srcstep == vl-1 and ssubstep == subvl) or
-                   (dststep == vl-1 and dsubstep == subvl))
+        yield from self.advance_svstate_steps()
+        loopend = self.loopend
+        log("loopend", svp64_is_vector, loopend)
         if not svp64_is_vector or loopend:
             # reset loop to zero and update NIA
             self.svp64_reset_loop()
         if not svp64_is_vector or loopend:
             # reset loop to zero and update NIA
             self.svp64_reset_loop()
@@ -1858,8 +2304,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             return True
 
         # still looping, advance and update NIA
             return True
 
         # still looping, advance and update NIA
-        yield from self.advance_svstate_steps()
         self.namespace['SVSTATE'] = self.svstate
         self.namespace['SVSTATE'] = self.svstate
+
         # not an SVP64 branch, so fix PC (NIA==CIA) for next loop
         # (by default, NIA is CIA+4 if v3.0B or CIA+8 if SVP64)
         # this way we keep repeating the same instruction (with new steps)
         # not an SVP64 branch, so fix PC (NIA==CIA) for next loop
         # (by default, NIA is CIA+4 if v3.0B or CIA+8 if SVP64)
         # this way we keep repeating the same instruction (with new steps)
@@ -1868,34 +2314,10 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         log("end of sub-pc call", self.namespace['CIA'], self.namespace['NIA'])
         return False  # DO NOT allow PC update whilst Sub-PC loop running
 
         log("end of sub-pc call", self.namespace['CIA'], self.namespace['NIA'])
         return False  # DO NOT allow PC update whilst Sub-PC loop running
 
-    def advance_svstate_steps(self, end_src=False, end_dst=False):
-        """ advance sub/steps. note that Pack/Unpack *INVERTS* the order.
-        TODO when Pack/Unpack is set, substep becomes the *outer* loop
-        """
-        subvl = yield self.dec2.rm_dec.rm_in.subvl
-        # first source step
-        ssubstep = self.svstate.ssubstep
-        end_sub = ssubstep == subvl
-        if end_sub:
-            if not end_src:
-                self.svstate.srcstep += SelectableInt(1, 7)
-            self.svstate.ssubstep = SelectableInt(0, 2)  # reset
-        else:
-            self.svstate.ssubstep += SelectableInt(1, 2) # advance ssubstep
-        # now dest step
-        dsubstep = self.svstate.dsubstep
-        end_sub = dsubstep == subvl
-        if end_sub:
-            if not end_dst:
-                self.svstate.dststep += SelectableInt(1, 7)
-            self.svstate.dsubstep = SelectableInt(0, 2)  # reset
-        else:
-            self.svstate.dsubstep += SelectableInt(1, 2) # advance ssubstep
-
     def update_pc_next(self):
         # UPDATE program counter
         self.pc.update(self.namespace, self.is_svp64_mode)
     def update_pc_next(self):
         # UPDATE program counter
         self.pc.update(self.namespace, self.is_svp64_mode)
-        self.svstate.spr = self.namespace['SVSTATE']
+        #self.svstate.spr = self.namespace['SVSTATE']
         log("end of call", self.namespace['CIA'],
             self.namespace['NIA'],
             self.namespace['SVSTATE'])
         log("end of call", self.namespace['CIA'],
             self.namespace['NIA'],
             self.namespace['SVSTATE'])
@@ -1905,6 +2327,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         self.svstate.dststep = 0
         self.svstate.ssubstep = 0
         self.svstate.dsubstep = 0
         self.svstate.dststep = 0
         self.svstate.ssubstep = 0
         self.svstate.dsubstep = 0
+        self.loopend = False
         log("    svstate.srcstep loop end (PC to update)")
         self.namespace['SVSTATE'] = self.svstate
 
         log("    svstate.srcstep loop end (PC to update)")
         self.namespace['SVSTATE'] = self.svstate