add data-dependent fail-first mode, Rc=1 variant not RC1 yet
[openpower-isa.git] / src / openpower / decoder / isa / caller.py
index 7eb265e430e22eace277718ecd45076fd77441a1..8a8a3f6ef0e3c693b68b5d9f7cbb9c22a50e9ec5 100644 (file)
@@ -30,7 +30,7 @@ from openpower.decoder.power_enums import (spr_dict, spr_byname, XER_bits,
                                            OutSel, CRInSel, CROutSel, LDSTMode,
                                            SVP64RMMode, SVP64PredMode,
                                            SVP64PredInt, SVP64PredCR,
-                                           SVP64LDSTmode)
+                                           SVP64LDSTmode, FPTRANS_INSNS)
 
 from openpower.decoder.power_enums import SVPtype
 
@@ -95,6 +95,7 @@ REG_SORT_ORDER = {
     "CA32": 0,
 
     "overflow": 7,  # should definitely be last
+    "CR0": 8,       # likewise
 }
 
 fregs = ['FRA', 'FRB', 'FRC', 'FRS', 'FRT']
@@ -363,7 +364,7 @@ def get_pdecode_idx_in(dec2, name):
     log("get_pdecode_idx_in FRC in3", name, in3_sel, In3Sel.FRC.value,
         in3, in3_isvec)
     # identify which regnames map to in1/2/3
-    if name == 'RA':
+    if name == 'RA' or name == 'RA_OR_ZERO':
         if (in1_sel == In1Sel.RA.value or
                 (in1_sel == In1Sel.RA_OR_ZERO.value and in1 != 0)):
             return in1, in1_isvec
@@ -376,6 +377,8 @@ def get_pdecode_idx_in(dec2, name):
             return in3, in3_isvec
     # XXX TODO, RC doesn't exist yet!
     elif name == 'RC':
+        if in3_sel == In3Sel.RC.value:
+            return in3, in3_isvec
         assert False, "RC does not exist yet"
     elif name == 'RS':
         if in1_sel == In1Sel.RS.value:
@@ -445,7 +448,7 @@ def get_pdecode_cr_out(dec2, name):
     if name == 'CR0':
         if out_sel == CROutSel.CR0.value:
             return out, o_isvec
-    if name == 'CR1': # these are not actually calculated correctly
+    if name == 'CR1':  # these are not actually calculated correctly
         if out_sel == CROutSel.CR1.value:
             return out, o_isvec
     log("get_pdecode_cr_out not found", name)
@@ -470,6 +473,8 @@ def get_pdecode_idx_out(dec2, name):
             dec2.dec.RT)
         if out_sel == OutSel.RT.value:
             return out, o_isvec
+        if out_sel == OutSel.RT_OR_ZERO.value and out != 0:
+            return out, o_isvec
     elif name == 'RT_OR_ZERO':
         log("get_pdecode_idx_out", out_sel, OutSel.RT.value,
             OutSel.RT_OR_ZERO.value, out, o_isvec,
@@ -510,10 +515,14 @@ def get_pdecode_idx_out2(dec2, name):
                 out, o_isvec)
             if upd == LDSTMode.update.value:
                 return out, o_isvec
+    if name == 'RS':
+        fft_en = yield dec2.implicit_rs
+        if fft_en:
+            log("get_pdecode_idx_out2", out_sel, OutSel.RS.value,
+                out, o_isvec)
+            return out, o_isvec
     if name == 'FRS':
-        int_op = yield dec2.dec.op.internal_op
-        fft_en = yield dec2.use_svp64_fft
-        # if int_op == MicrOp.OP_FP_MADD.value and fft_en:
+        fft_en = yield dec2.implicit_rs
         if fft_en:
             log("get_pdecode_idx_out2", out_sel, OutSel.FRS.value,
                 out, o_isvec)
@@ -521,7 +530,305 @@ def get_pdecode_idx_out2(dec2, name):
     return None, False
 
 
-class ISACaller(ISACallerHelper, ISAFPHelpers):
+class StepLoop:
+    """deals with svstate looping.
+
+    Holds the SVSTATE object and advances its srcstep/dststep and
+    ssubstep/dsubstep fields, including working out how far to skip over
+    masked-out predicate bits.  Several methods read attributes that this
+    class never sets itself (self.dec2, self.gpr, self.crl): they have to
+    be provided by the class that StepLoop is mixed into.
+    """
+
+    def __init__(self, svstate):
+        self.svstate = svstate
+
+    def get_iterators(self):
+        """create the source- and dest-stepping generators
+        """
+        self.src_it = self.src_iterator()
+        self.dst_it = self.dst_iterator()
+
+    def src_iterator(self):
+        """source-stepping iterator
+
+        generator yielding (ssubstep, srcstep) tuples.  reads self.subvl,
+        self.srcmask and self.pred_sz (plus self.end_src in pack mode),
+        which are stored by advance_svstate_steps and read_src_mask.
+        """
+        pack = self.svstate.pack
+
+        # source step
+        if pack:
+            # end-point conditions, bound the same way src_iterate() does.
+            # these two names were previously never assigned in this
+            # function, so pack mode raised NameError on the first next().
+            # NOTE(review): assumes the caller-supplied end_src is the
+            # intended end-point here too - confirm when pack is finished
+            end_src = self.end_src
+            end_ssub = self.svstate.ssubstep == self.subvl
+            # pack advances subvl in *outer* loop
+            # (this branch advances one step and yields nothing so far)
+            if end_src:
+                if not end_ssub:
+                    self.svstate.ssubstep += SelectableInt(1, 2)
+                self.svstate.srcstep = SelectableInt(0, 7)  # reset
+            else:
+                self.svstate.srcstep += SelectableInt(1, 7)  # advance srcstep
+        else:
+            # these cannot be done as for-loops because SVSTATE may change
+            # (srcstep/substep may be modified, interrupted, subvl/vl change)
+            # but they *can* be done as while-loops as long as every SVSTATE
+            # "thing" is re-read every single time a yield gives indices
+            while True:  # outer vl loop
+                while True:  # inner subvl loop
+                    subvl = self.subvl
+                    srcmask = self.srcmask
+                    srcstep = self.svstate.srcstep
+                    if self.pred_sz or ((1 << srcstep) & srcmask) != 0:
+                        log("    advance src", srcstep, self.svstate.vl,
+                            self.svstate.ssubstep, subvl)
+                        # yield actual substep/srcstep
+                        yield (self.svstate.ssubstep, srcstep)
+                    if self.svstate.ssubstep == subvl:  # end-point
+                        self.svstate.ssubstep = SelectableInt(0, 2)  # reset
+                        break
+                    self.svstate.ssubstep += SelectableInt(1, 2)
+                vl = self.svstate.vl
+                if srcstep == vl-1:  # end-point
+                    self.svstate.srcstep = SelectableInt(0, 7)  # reset
+                    break  # trigger StopIteration
+                self.svstate.srcstep += SelectableInt(1, 7)  # advance srcstep
+
+    def dst_iterator(self):
+        """dest-stepping iterator
+
+        generator yielding (dsubstep, dststep) tuples.  reads self.subvl,
+        self.dstmask and self.pred_dz, which are stored by
+        advance_svstate_steps and read_dst_mask.
+        """
+        unpack = self.svstate.unpack
+
+        # dest step
+        if unpack:
+            # unpack advances subvl in *outer* loop
+            pass  # TODO
+        else:
+            # these cannot be done as for-loops because SVSTATE may change
+            # (dststep/substep may be modified, interrupted, subvl/vl change)
+            # but they *can* be done as while-loops as long as every SVSTATE
+            # "thing" is re-read every single time a yield gives indices
+            while True:  # outer vl loop
+                while True:  # inner subvl loop
+                    subvl = self.subvl
+                    dstmask = self.dstmask
+                    dststep = self.svstate.dststep
+                    if self.pred_dz or ((1 << dststep) & dstmask) != 0:
+                        log("    advance dst", dststep, self.svstate.vl,
+                            self.svstate.dsubstep, subvl)
+                        # yield actual substep/dststep
+                        yield (self.svstate.dsubstep, dststep)
+                    if self.svstate.dsubstep == subvl:  # end-point
+                        self.svstate.dsubstep = SelectableInt(0, 2)  # reset
+                        break
+                    self.svstate.dsubstep += SelectableInt(1, 2)
+                vl = self.svstate.vl
+                if dststep == vl-1:  # end-point
+                    self.svstate.dststep = SelectableInt(0, 7)  # reset
+                    break  # trigger StopIteration
+                self.svstate.dststep += SelectableInt(1, 7)  # advance dststep
+
+    def src_iterate(self):
+        """advance the source step/substep once (plain method, no yield)
+
+        uses self.end_src and self.subvl, both stored by
+        advance_svstate_steps before it calls this.
+        """
+        end_src = self.end_src
+        subvl = self.subvl
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
+        ssubstep = self.svstate.ssubstep
+        end_ssub = ssubstep == subvl
+        log("    pack/unpack/subvl", pack, unpack, subvl,
+            "end", end_src,
+            "sub", end_ssub)
+        # first source step
+        srcstep = self.svstate.srcstep
+        if pack:
+            # pack advances subvl in *outer* loop
+            if end_src:
+                if not end_ssub:
+                    self.svstate.ssubstep += SelectableInt(1, 2)
+                self.svstate.srcstep = SelectableInt(0, 7)  # reset
+            else:
+                self.svstate.srcstep += SelectableInt(1, 7)  # advance srcstep
+        else:
+            # advance subvl in *inner* loop
+            if end_ssub:
+                if not end_src:
+                    self.svstate.srcstep += SelectableInt(1, 7)
+                self.svstate.ssubstep = SelectableInt(0, 2)  # reset
+            else:
+                # advance ssubstep
+                self.svstate.ssubstep += SelectableInt(1, 2)
+
+        log("    advance src", self.svstate.srcstep, self.svstate.ssubstep)
+
+    def dst_iterate(self):
+        """advance the dest step/substep once (plain method, no yield)
+
+        uses self.end_dst and self.subvl, both stored by
+        advance_svstate_steps before it calls this.
+        """
+        end_dst = self.end_dst
+        subvl = self.subvl
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
+        dsubstep = self.svstate.dsubstep
+        end_dsub = dsubstep == subvl
+        log("    pack/unpack/subvl", pack, unpack, subvl,
+            "end", end_dst,
+            "sub", end_dsub)
+        # now dest step
+        if unpack:
+            # unpack advances subvl in *outer* loop
+            if end_dst:
+                if not end_dsub:
+                    self.svstate.dsubstep += SelectableInt(1, 2)
+                self.svstate.dststep = SelectableInt(0, 7)  # reset
+            else:
+                self.svstate.dststep += SelectableInt(1, 7)  # advance dststep
+        else:
+            # advance subvl in *inner* loop
+            if end_dsub:
+                if not end_dst:
+                    self.svstate.dststep += SelectableInt(1, 7)
+                self.svstate.dsubstep = SelectableInt(0, 2)  # reset
+            else:
+                # advance dsubstep
+                self.svstate.dsubstep += SelectableInt(1, 2)
+        log("    advance dst", self.svstate.dststep, self.svstate.dsubstep)
+
+    def advance_svstate_steps(self, end_src=False, end_dst=False):
+        """ advance sub/steps. note that Pack/Unpack *INVERTS* the order.
+        TODO when Pack/Unpack is set, substep becomes the *outer* loop
+
+        generator: yields to read subvl from dec2.rm_dec.rm_in, stores
+        subvl/end_src/end_dst on self, then performs one src_iterate and
+        one dst_iterate.
+        """
+        self.subvl = yield self.dec2.rm_dec.rm_in.subvl
+        self.end_src = end_src
+        self.end_dst = end_dst
+        self.src_iterate()
+        self.dst_iterate()
+
+    def read_src_mask(self):
+        """read/update pred_sz and src mask
+
+        generator (yields decoder signals to read them).  stores
+        self.srcmask, self.pred_sz, self.new_ssubstep and
+        self.srcstep_skip for later use by skip_src.
+        """
+        # get SVSTATE VL (oh and print out some debug stuff)
+        vl = self.svstate.vl
+        srcstep = self.svstate.srcstep
+        ssubstep = self.svstate.ssubstep
+
+        # get predicate mask (all 64 bits)
+        srcmask = 0xffff_ffff_ffff_ffff
+
+        pmode = yield self.dec2.rm_dec.predmode
+        sv_ptype = yield self.dec2.dec.op.SV_Ptype
+        srcpred = yield self.dec2.rm_dec.srcpred
+        dstpred = yield self.dec2.rm_dec.dstpred
+        pred_sz = yield self.dec2.rm_dec.pred_sz
+        # the source mask is taken from dstpred, except for P2 (twin
+        # predicate) instructions, where srcpred is used instead
+        if pmode == SVP64PredMode.INT.value:
+            srcmask = dstmask = get_predint(self.gpr, dstpred)
+            if sv_ptype == SVPtype.P2.value:
+                srcmask = get_predint(self.gpr, srcpred)
+        elif pmode == SVP64PredMode.CR.value:
+            srcmask = dstmask = get_predcr(self.crl, dstpred, vl)
+            if sv_ptype == SVPtype.P2.value:
+                srcmask = get_predcr(self.crl, srcpred, vl)
+        # work out if the ssubsteps are completed
+        ssubstart = ssubstep == 0
+        log("    pmode", pmode)
+        log("    ptype", sv_ptype)
+        log("    srcpred", bin(srcpred))
+        log("    srcmask", bin(srcmask))
+        log("    pred_sz", bin(pred_sz))
+        log("    ssubstart", ssubstart)
+
+        # store all that above
+        self.srcstep_skip = False
+        self.srcmask = srcmask
+        self.pred_sz = pred_sz
+        self.new_ssubstep = ssubstep
+        log("    new ssubstep", ssubstep)
+        if ssubstart:
+            # until the predicate mask has a "1" bit... or we run out of VL
+            # let srcstep==VL be the indicator to move to next instruction
+            if not pred_sz:
+                self.srcstep_skip = True
+
+    def read_dst_mask(self):
+        """same as read_src_mask - check and record everything needed
+
+        generator (yields decoder signals to read them).  stores
+        self.dstmask, self.pred_dz, self.new_dsubstep and
+        self.dststep_skip for later use by skip_dst.
+        """
+        # get SVSTATE VL (oh and print out some debug stuff)
+        # yield Delay(1e-10)  # make changes visible
+        vl = self.svstate.vl
+        dststep = self.svstate.dststep
+        dsubstep = self.svstate.dsubstep
+
+        # get predicate mask (all 64 bits)
+        dstmask = 0xffff_ffff_ffff_ffff
+
+        pmode = yield self.dec2.rm_dec.predmode
+        reverse_gear = yield self.dec2.rm_dec.reverse_gear
+        sv_ptype = yield self.dec2.dec.op.SV_Ptype
+        dstpred = yield self.dec2.rm_dec.dstpred
+        pred_dz = yield self.dec2.rm_dec.pred_dz
+        if pmode == SVP64PredMode.INT.value:
+            dstmask = get_predint(self.gpr, dstpred)
+        elif pmode == SVP64PredMode.CR.value:
+            dstmask = get_predcr(self.crl, dstpred, vl)
+        # work out if the dsubsteps are completed
+        dsubstart = dsubstep == 0
+        log("    pmode", pmode)
+        log("    ptype", sv_ptype)
+        log("    dstpred", bin(dstpred))
+        log("    dstmask", bin(dstmask))
+        log("    pred_dz", bin(pred_dz))
+        log("    dsubstart", dsubstart)
+
+        self.dststep_skip = False
+        self.dstmask = dstmask
+        self.pred_dz = pred_dz
+        self.new_dsubstep = dsubstep
+        log("    new dsubstep", dsubstep)
+        if dsubstart:
+            if not pred_dz:
+                self.dststep_skip = True
+
+    def svstate_pre_inc(self):
+        """check if srcstep/dststep need to skip over masked-out predicate bits
+        note that this is not supposed to do anything to substep,
+        it is purely for skipping masked-out bits
+        """
+
+        yield from self.read_src_mask()
+        yield from self.read_dst_mask()
+
+        self.skip_src()
+        self.skip_dst()
+
+    def skip_src(self):
+        """work out the new srcstep, skipping masked-out predicate bits
+
+        uses the values recorded by read_src_mask.  results are stored
+        in self.new_srcstep and self.pred_src_zero.
+        """
+
+        srcstep = self.svstate.srcstep
+        srcmask = self.srcmask
+        pred_src_zero = self.pred_sz
+        vl = self.svstate.vl
+        # srcstep-skipping opportunity identified
+        if self.srcstep_skip:
+            while (((1 << srcstep) & srcmask) == 0) and (srcstep != vl):
+                log("      sskip", bin(1 << srcstep))
+                srcstep += 1
+
+        # now work out if the relevant mask bits require zeroing
+        if pred_src_zero:
+            pred_src_zero = ((1 << srcstep) & srcmask) == 0
+
+        # store new srcstep and whether the source is to be zeroed
+        self.new_srcstep = srcstep
+        self.pred_src_zero = pred_src_zero
+        log("    new srcstep", srcstep)
+
+    def skip_dst(self):
+        """work out the new dststep, skipping masked-out predicate bits
+
+        uses the values recorded by read_dst_mask.  results are stored
+        in self.new_dststep and self.pred_dst_zero.
+        """
+        # dststep-skipping opportunity identified
+        dststep = self.svstate.dststep
+        dstmask = self.dstmask
+        pred_dst_zero = self.pred_dz
+        vl = self.svstate.vl
+        if self.dststep_skip:
+            while (((1 << dststep) & dstmask) == 0) and (dststep != vl):
+                log("      dskip", bin(1 << dststep))
+                dststep += 1
+
+        # now work out if the relevant mask bits require zeroing
+        if pred_dst_zero:
+            pred_dst_zero = ((1 << dststep) & dstmask) == 0
+
+        # store new dststep and whether the destination is to be zeroed
+        self.new_dststep = dststep
+        self.pred_dst_zero = pred_dst_zero
+        log("    new dststep", dststep)
+
+class ISACaller(ISACallerHelper, ISAFPHelpers, StepLoop):
     # decoder2 - an instance of power_decoder2
     # regfile - a list of initial values for the registers
     # initial_{etc} - initial values for SPRs, Condition Register, Mem, MSR
@@ -578,7 +885,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if isinstance(initial_svstate, int):
             initial_svstate = SVP64State(initial_svstate)
         # SVSTATE, MSR and PC
-        self.svstate = initial_svstate
+        StepLoop.__init__(self, initial_svstate)
         self.msr = SelectableInt(initial_msr, 64)  # underlying reg
         self.pc = PC()
         # GPR FPR SPR registers
@@ -693,7 +1000,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         self.spr['SRR1'].value = msr
         if self.is_svp64_mode:
             self.spr['SVSRR0'] = self.namespace['SVSTATE'].value
-        self.trap_nia = SelectableInt(trap_addr | (kaivb&~0x1fff), 64)
+        self.trap_nia = SelectableInt(trap_addr | (kaivb & ~0x1fff), 64)
         self.spr['SRR1'][trap_bit] = 1  # change *copy* of MSR in SRR1
 
         # set exception bits.  TODO: this should, based on the address
@@ -728,7 +1035,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         # then "yield" fields only from op_fields rather than hard-coded
         # list, here.
         fields = self.decoder.sigforms[formname]
-        log("prep_namespace", formname, op_fields)
+        log("prep_namespace", formname, op_fields, insn_name)
         for name in op_fields:
             # CR immediates. deal with separately.  needs modifying
             # pseudocode
@@ -741,10 +1048,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
                 assert regnum <= 7, "sigh, TODO, 128 CR fields"
                 val = (val & 0b11) | (regnum << 2)
             else:
-                if name == 'spr':
-                    sig = getattr(fields, name.upper())
-                else:
-                    sig = getattr(fields, name)
+                sig = getattr(fields, name)
                 val = yield sig
             # these are all opcode fields involved in index-selection of CR,
             # and need to do "standard" arithmetic.  CR[BA+32] for example
@@ -872,7 +1176,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
 
         # now update XER OV/OV32/SO
         so = self.spr['XER'][XER_bits['SO']]
-        new_so = so | ov # sticky overflow ORs in old with new
+        new_so = so | ov  # sticky overflow ORs in old with new
         self.spr['XER'][XER_bits['OV']] = ov
         self.spr['XER'][XER_bits['OV32']] = ov32
         self.spr['XER'][XER_bits['SO']] = new_so
@@ -1142,6 +1446,9 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         asmop = yield from self.get_assembly_name()
         log("call", ins_name, asmop)
 
+        # sv.setvl is *not* a loop-function. sigh
+        log("is_svp64_mode", self.is_svp64_mode, asmop)
+
         # check privileged
         int_op = yield self.dec2.dec.op.internal_op
         spr_msb = yield from self.get_spr_msb()
@@ -1177,15 +1484,15 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
 
         # list of instructions not being supported by binutils (.long)
         dotstrp = asmop[:-1] if asmop[-1] == '.' else asmop
-        if dotstrp in [ 'fsins', 'fcoss',
-                    'ffmadds', 'fdmadds', 'ffadds',
-                     'mins', 'maxs', 'minu', 'maxu',
-                    'setvl', 'svindex', 'svremap', 'svstep',
-                    'svshape', 'svshape2',
-                    'grev', 'ternlogi', 'bmask', 'cprop',
-                    'absdu', 'absds', 'absdacs', 'absdacu', 'avgadd',
-                    'fmvis', 'fishmv',
-                    ]:
+        if dotstrp in [*FPTRANS_INSNS,
+                       'ffmadds', 'fdmadds', 'ffadds',
+                       'mins', 'maxs', 'minu', 'maxu',
+                       'setvl', 'svindex', 'svremap', 'svstep',
+                       'svshape', 'svshape2',
+                       'grev', 'ternlogi', 'bmask', 'cprop',
+                       'absdu', 'absds', 'absdacs', 'absdacu', 'avgadd',
+                       'fmvis', 'fishmv', 'pcdec'
+                       ]:
             illegal = False
             ins_name = dotstrp
 
@@ -1216,7 +1523,10 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             return
 
         # look up instruction in ISA.instrs, prepare namespace
-        info = self.instrs[ins_name]
+        if ins_name == 'pcdec': # grrrr yes there are others ("stbcx." etc.)
+            info = self.instrs[ins_name+"."]
+        else:
+            info = self.instrs[ins_name]
         yield from self.prep_namespace(ins_name, info.form, info.op_fields)
 
         # preserve order of register names
@@ -1233,8 +1543,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         log("sv rm", sv_rm, dest_cr, src_cr, src_byname, dest_byname)
 
         # see if srcstep/dststep need skipping over masked-out predicate bits
-        if (self.is_svp64_mode or ins_name == 'setvl' or
-                ins_name in ['svremap', 'svstate']):
+        if (self.is_svp64_mode or ins_name in ['setvl', 'svremap', 'svstate']):
             yield from self.svstate_pre_inc()
         if self.is_svp64_mode:
             pre = yield from self.update_new_svstate_steps()
@@ -1267,7 +1576,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if persist or self.last_op_svshape:
             remaps = self.get_remap_indices()
         if self.is_svp64_mode and (persist or self.last_op_svshape):
-            yield from self.remap_debug(remaps)
+            yield from self.remap_set_steps(remaps)
         # after that, settle down (combinatorial) to let Vector reg numbers
         # work themselves out
         yield Settle()
@@ -1316,6 +1625,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         # the ALL/ANY mode we can early-exit
         if self.is_svp64_mode and ins_name.startswith("sv.bc"):
             no_in_vec = yield self.dec2.no_in_vec  # BI is scalar
+            # XXX TODO - pack/unpack here
             end_loop = no_in_vec or srcstep == vl-1 or dststep == vl-1
             self.namespace['end_loop'] = SelectableInt(end_loop, 1)
 
@@ -1344,6 +1654,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
 
         # detect if CA/CA32 already in outputs (sra*, basically)
         already_done = 0
+        output_names = []
         if info.write_regs:
             output_names = create_args(info.write_regs)
             for name in output_names:
@@ -1352,11 +1663,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
                 if name == 'CA32':
                     already_done |= 2
 
-        log("carry already done?", bin(already_done))
-        if hasattr(self.dec2.e.do, "output_carry"):
-            carry_en = yield self.dec2.e.do.output_carry
-        else:
-            carry_en = False
+        log("carry already done?", bin(already_done), output_names)
+        carry_en = yield self.dec2.e.do.output_carry
         if carry_en:
             yield from self.handle_carry_(inputs, results, already_done)
 
@@ -1367,14 +1675,17 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
                 if name == 'overflow':
                     overflow = output
 
+        # and one called CR0
+        cr0 = None
+        if info.write_regs:
+            for name, output in zip(output_names, results):
+                if name == 'CR0':
+                    cr0 = output
+
         if not self.is_svp64_mode:  # yeah just no. not in parallel processing
             # detect if overflow was in return result
-            if hasattr(self.dec2.e.do, "oe"):
-                ov_en = yield self.dec2.e.do.oe.oe
-                ov_ok = yield self.dec2.e.do.oe.ok
-            else:
-                ov_en = False
-                ov_ok = False
+            ov_en = yield self.dec2.e.do.oe.oe
+            ov_ok = yield self.dec2.e.do.oe.ok
             log("internal overflow", ins_name, overflow, "en?", ov_en, ov_ok)
             if ov_en & ov_ok:
                 yield from self.handle_overflow(inputs, results, overflow)
@@ -1384,28 +1695,72 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         if not self.is_svp64_mode or not pred_dst_zero:
             if hasattr(self.dec2.e.do, "rc"):
                 rc_en = yield self.dec2.e.do.rc.rc
+        # don't do Rc=1 for svstep it is handled explicitly.
+        # XXX TODO: now that CR0 is supported, sort out svstep's pseudocode
+        # to write directly to CR0 instead of in ISACaller. hooyahh.
         if rc_en and ins_name not in ['svstep']:
-            if ins_name.startswith("f"):
-                rc_reg = "CR1" # not calculated correctly yet (not FP compares)
-            else:
-                rc_reg = "CR0"
-            regnum, is_vec = yield from get_pdecode_cr_out(self.dec2, rc_reg)
-            cmps = results
-            # hang on... for `setvl` actually you want to test SVSTATE.VL
-            is_setvl = ins_name == 'setvl'
-            if is_setvl:
-                vl = results[0].vl
-                cmps = (SelectableInt(vl, 64), overflow,)
-            else:
-                overflow = None # do not override overflow except in setvl
-            self.handle_comparison(cmps, regnum, overflow, no_so=is_setvl)
+            yield from self.do_rc_ov(ins_name, results, overflow, cr0)
+
+        # check failfirst
+        rm_mode = yield self.dec2.rm_dec.mode
+        ff_inv = yield self.dec2.rm_dec.inv
+        cr_bit = yield self.dec2.rm_dec.cr_sel
+        log(" ff rm_mode", rc_en, rm_mode, SVP64RMMode.FFIRST.value)
+        log("        inv", ff_inv)
+        log("     cr_bit", cr_bit)
+        ffirst_hit = False
+        if rc_en and rm_mode == SVP64RMMode.FFIRST.value:
+            regnum, is_vec = yield from get_pdecode_cr_out(self.dec2, "CR0")
+            crtest = self.crl[regnum]
+            ffirst_hit = crtest[cr_bit] != ff_inv
+            log("cr test", regnum, int(crtest), crtest, cr_bit, ff_inv)
+            log("cr test?", ffirst_hit)
+            if ffirst_hit:
+                self.svstate.vl = srcstep
+                yield self.dec2.state.svstate.eq(self.svstate.value)
+                yield Settle()  # let decoder update
 
         # any modified return results?
+        yield from self.do_outregs_nia(asmop, ins_name, info,
+                                       output_names, results,
+                                       carry_en, rc_en, ffirst_hit)
+
+    def do_rc_ov(self, ins_name, results, overflow, cr0):
+        """write the Rc=1 condition-register result for one instruction
+
+        generator (uses "yield from" to obtain the CR output register
+        number from the decoder).
+
+        * ins_name: instruction name.  names starting with "f" target
+          CR1, everything else CR0.  'setvl' is special-cased below.
+        * results: the instruction's results, used for the comparison
+        * overflow: only passed on to handle_comparison for setvl,
+          otherwise replaced with None
+        * cr0: a CR0 value taken from the instruction's outputs, or None
+          if there was none.  when given it is copied into the target CR
+          field instead of performing a comparison.
+        """
+        if ins_name.startswith("f"):
+            rc_reg = "CR1"  # not calculated correctly yet (not FP compares)
+        else:
+            rc_reg = "CR0"
+        regnum, is_vec = yield from get_pdecode_cr_out(self.dec2, rc_reg)
+        cmps = results
+        # hang on... for `setvl` actually you want to test SVSTATE.VL
+        is_setvl = ins_name == 'setvl'
+        if is_setvl:
+            vl = results[0].vl
+            cmps = (SelectableInt(vl, 64), overflow,)
+        else:
+            overflow = None  # do not override overflow except in setvl
+
+        # if there was not an explicit CR0 in the pseudocode, do implicit Rc=1
+        if cr0 is None:
+            self.handle_comparison(cmps, regnum, overflow, no_so=is_setvl)
+        else:
+            # otherwise we just blat CR0 into the required regnum
+            log("explicit rc0", cr0)
+            self.crl[regnum].eq(cr0)
+
+    def do_outregs_nia(self, asmop, ins_name, info, output_names, results,
+                       carry_en, rc_en, ffirst_hit):
+        # write out any regs for this instruction
         if info.write_regs:
             for name, output in zip(output_names, results):
                 yield from self.check_write(info, name, output, carry_en)
 
-        nia_update = (yield from self.check_step_increment(results, rc_en,
+        if ffirst_hit:
+            self.svp64_reset_loop()
+            nia_update = True
+        else:
+            # check advancement of src/dst/sub-steps and if PC needs updating
+            nia_update = (yield from self.check_step_increment(results, rc_en,
                                                            asmop, ins_name))
         if nia_update:
             self.update_pc_next()
@@ -1479,18 +1834,23 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             log('reading reg %s %s' % (name, str(regnum)), is_vec)
             if name in fregs:
                 reg_val = SelectableInt(self.fpr(regnum))
-                log(f"read reg f{regnum}: 0x{reg_val.value:X}",
-                    kind=LogKind.InstrInOuts)
+                log("read reg %d: 0x%x" % (regnum, reg_val.value))
             elif name is not None:
                 reg_val = SelectableInt(self.gpr(regnum))
-                log(f"read reg r{regnum}: 0x{reg_val.value:X}",
-                    kind=LogKind.InstrInOuts)
+                log("read reg %d: 0x%x" % (regnum, reg_val.value))
         else:
             log('zero input reg %s %s' % (name, str(regnum)), is_vec)
             reg_val = 0
         return reg_val
 
-    def remap_debug(self, remaps):
+    def remap_set_steps(self, remaps):
+        """remap_set_steps sets up the in1/2/3 and out1/2 steps.
+        they work in concert with PowerDecoder2 at the moment,
+        there is no HDL implementation of REMAP.  therefore this
+        function, because ISACaller still uses PowerDecoder2,
+        will *explicitly* write the dec2.XX_step values. this has
+        to get sorted out.
+        """
         # just some convenient debug info
         for i in range(4):
             sname = 'SVSHAPE%d' % i
@@ -1533,129 +1893,130 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
     def check_write(self, info, name, output, carry_en):
         if name == 'overflow':  # ignore, done already (above)
             return
+        if name == 'CR0':  # ignore, done already (above)
+            return
         if isinstance(output, int):
             output = SelectableInt(output, 256)
+        # write carry flags
         if name in ['CA', 'CA32']:
             if carry_en:
                 log("writing %s to XER" % name, output)
-                log(f"write XER field {name}: "
-                    f"0x{output.value % (1 << 64):X}",
-                    kind=LogKind.InstrInOuts)
+                log("write XER %s 0x%x" % (name, output.value))
                 self.spr['XER'][XER_bits[name]] = output.value
             else:
                 log("NOT writing %s to XER" % name, output)
-        elif name in info.special_regs:
+            return
+        # write special SPRs
+        if name in info.special_regs:
             log('writing special %s' % name, output, special_sprs)
-            log(f"write reg {name}: "
-                f"0x{output.value % (1 << 64):X}",
-                kind=LogKind.InstrInOuts)
+            log("write reg %s 0x%x" % (name, output.value))
             if name in special_sprs:
                 self.spr[name] = output
             else:
                 self.namespace[name].eq(output)
             if name == 'MSR':
                 log('msr written', hex(self.msr.value))
+            return
+        # find out1/out2 GPR/FPR
+        regnum, is_vec = yield from get_pdecode_idx_out(self.dec2, name)
+        if regnum is None:
+            regnum, is_vec = yield from get_pdecode_idx_out2(self.dec2, name)
+        if regnum is None:
+            # temporary hack for not having 2nd output
+            regnum = yield getattr(self.decoder, name)
+            is_vec = False
+        # convenient debug prefix
+        if name in fregs:
+            reg_prefix = 'f'
         else:
-            regnum, is_vec = yield from get_pdecode_idx_out(self.dec2, name)
-            if regnum is None:
-                regnum, is_vec = yield from get_pdecode_idx_out2(
-                    self.dec2, name)
-            if regnum is None:
-                # temporary hack for not having 2nd output
-                regnum = yield getattr(self.decoder, name)
-                is_vec = False
-            if self.is_svp64_mode and self.pred_dst_zero:
-                log('zeroing reg %d %s' % (regnum, str(output)),
-                    is_vec)
-                output = SelectableInt(0, 256)
-            else:
-                if name in fregs:
-                    reg_prefix = 'f'
-                else:
-                    reg_prefix = 'r'
-                log(f"write reg {reg_prefix}{regnum}: "
-                    f"0x{output.value % (1 << 64):X}",
-                    kind=LogKind.InstrInOuts)
-            if output.bits > 64:
-                output = SelectableInt(output.value, 64)
-            if name in fregs:
-                self.fpr[regnum] = output
-            else:
-                self.gpr[regnum] = output
+            reg_prefix = 'r'
+        # check zeroing due to predicate bit being zero
+        if self.is_svp64_mode and self.pred_dst_zero:
+            log('zeroing reg %d %s' % (regnum, str(output)), is_vec)
+            output = SelectableInt(0, 256)
+        log("write reg %s%d 0x%x" % (reg_prefix, regnum, output.value),
+            kind=LogKind.InstrInOuts)
+        # zero-extend to 64 bit before storing (should use EXT oh well)
+        if output.bits > 64:
+            output = SelectableInt(output.value, 64)
+        if name in fregs:
+            self.fpr[regnum] = output
+        else:
+            self.gpr[regnum] = output
 
     def check_step_increment(self, results, rc_en, asmop, ins_name):
         # check if it is the SVSTATE.src/dest step that needs incrementing
         # this is our Sub-Program-Counter loop from 0 to VL-1
+        if not self.allow_next_step_inc:
+            if self.is_svp64_mode:
+                return (yield from self.svstate_post_inc(ins_name))
+
+            # XXX only in non-SVP64 mode!
+            # record state of whether the current operation was an svshape,
+            # OR svindex!
+            # to be able to know if it should apply in the next instruction.
+            # also (if going to use this instruction) should disable ability
+            # to interrupt in between. sigh.
+            self.last_op_svshape = asmop in ['svremap', 'svindex',
+                                             'svshape2']
+            return True
+
         pre = False
         post = False
         nia_update = True
-        if self.allow_next_step_inc:
-            log("SVSTATE_NEXT: inc requested, mode",
-                self.svstate_next_mode, self.allow_next_step_inc)
-            yield from self.svstate_pre_inc()
-            pre = yield from self.update_new_svstate_steps()
-            if pre:
-                # reset at end of loop including exit Vertical Mode
-                log("SVSTATE_NEXT: end of loop, reset")
-                self.svp64_reset_loop()
-                self.svstate.vfirst = 0
-                self.update_nia()
-                if not rc_en:
-                    return True
-                results = [SelectableInt(0, 64)]
-                self.handle_comparison(results)  # CR0
+        log("SVSTATE_NEXT: inc requested, mode",
+            self.svstate_next_mode, self.allow_next_step_inc)
+        yield from self.svstate_pre_inc()
+        pre = yield from self.update_new_svstate_steps()
+        if pre:
+            # reset at end of loop including exit Vertical Mode
+            log("SVSTATE_NEXT: end of loop, reset")
+            self.svp64_reset_loop()
+            self.svstate.vfirst = 0
+            self.update_nia()
+            if not rc_en:
                 return True
-            if self.allow_next_step_inc == 2:
-                log("SVSTATE_NEXT: read")
-                nia_update = (yield from self.svstate_post_inc(ins_name))
-            else:
-                log("SVSTATE_NEXT: post-inc")
-            # use actual src/dst-step here to check end, do NOT
-            # use bit-reversed version
-            srcstep, dststep = self.new_srcstep, self.new_dststep
-            ssubstep, dsubstep = self.new_ssubstep, self.new_dsubstep
-            remaps = self.get_remap_indices()
-            remap_idxs = self.remap_idxs
-            vl = self.svstate.vl
-            subvl = yield self.dec2.rm_dec.rm_in.subvl
-            end_src = srcstep == vl-1
-            end_dst = dststep == vl-1
-            if self.allow_next_step_inc != 2:
-                yield from self.advance_svstate_steps(end_src, end_dst)
-            self.namespace['SVSTATE'] = self.svstate.spr
-            # set CR0 (if Rc=1) based on end
-            if rc_en:
-                endtest = 1 if (end_src or end_dst) else 0
-                #results = [SelectableInt(endtest, 64)]
-                # self.handle_comparison(results) # CR0
-
-                # see if svstep was requested, if so, which SVSTATE
-                endings = 0b111
-                if self.svstate_next_mode > 0:
-                    shape_idx = self.svstate_next_mode.value-1
-                    endings = self.remap_loopends[shape_idx]
-                cr_field = SelectableInt((~endings) << 1 | endtest, 4)
-                log("svstep Rc=1, CR0", cr_field)
-                self.crl[0].eq(cr_field)  # CR0
-            if end_src or end_dst:
-                # reset at end of loop including exit Vertical Mode
-                log("SVSTATE_NEXT: after increments, reset")
-                self.svp64_reset_loop()
-                self.svstate.vfirst = 0
-            return nia_update
-
-        if self.is_svp64_mode:
-            return (yield from self.svstate_post_inc(ins_name))
-
-        # XXX only in non-SVP64 mode!
-        # record state of whether the current operation was an svshape,
-        # OR svindex!
-        # to be able to know if it should apply in the next instruction.
-        # also (if going to use this instruction) should disable ability
-        # to interrupt in between. sigh.
-        self.last_op_svshape = asmop in ['svremap', 'svindex', 'svshape2']
-
-        return True
+            results = [SelectableInt(0, 64)]
+            self.handle_comparison(results)  # CR0
+            return True
+        if self.allow_next_step_inc == 2:
+            log("SVSTATE_NEXT: read")
+            nia_update = (yield from self.svstate_post_inc(ins_name))
+        else:
+            log("SVSTATE_NEXT: post-inc")
+        # use actual src/dst-step here to check end, do NOT
+        # use bit-reversed version
+        srcstep, dststep = self.new_srcstep, self.new_dststep
+        ssubstep, dsubstep = self.new_ssubstep, self.new_dsubstep
+        remaps = self.get_remap_indices()
+        remap_idxs = self.remap_idxs
+        vl = self.svstate.vl
+        subvl = yield self.dec2.rm_dec.rm_in.subvl
+        end_src = srcstep == vl-1
+        end_dst = dststep == vl-1
+        if self.allow_next_step_inc != 2:
+            yield from self.advance_svstate_steps(end_src, end_dst)
+        #self.namespace['SVSTATE'] = self.svstate.spr
+        # set CR0 (if Rc=1) based on end
+        if rc_en:
+            endtest = 1 if (end_src or end_dst) else 0
+            #results = [SelectableInt(endtest, 64)]
+            # self.handle_comparison(results) # CR0
+
+            # see if svstep was requested, if so, which SVSTATE
+            endings = 0b111
+            if self.svstate_next_mode > 0:
+                shape_idx = self.svstate_next_mode.value-1
+                endings = self.remap_loopends[shape_idx]
+            cr_field = SelectableInt((~endings) << 1 | endtest, 4)
+            log("svstep Rc=1, CR0", cr_field)
+            self.crl[0].eq(cr_field)  # CR0
+        if end_src or end_dst:
+            # reset at end of loop including exit Vertical Mode
+            log("SVSTATE_NEXT: after increments, reset")
+            self.svp64_reset_loop()
+            self.svstate.vfirst = 0
+        return nia_update
 
     def SVSTATE_NEXT(self, mode, submode):
         """explicitly moves srcstep/dststep on to next element, for
@@ -1680,96 +2041,6 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             return SelectableInt(self.svstate.dststep, 7)
         return SelectableInt(0, 7)
 
-    def svstate_pre_inc(self):
-        """check if srcstep/dststep need to skip over masked-out predicate bits
-        note that this is not supposed to do anything to substep,
-        it is purely for skipping masked-out bits
-        """
-        # get SVSTATE VL (oh and print out some debug stuff)
-        # yield Delay(1e-10)  # make changes visible
-        vl = self.svstate.vl
-        subvl = yield self.dec2.rm_dec.rm_in.subvl
-        srcstep = self.svstate.srcstep
-        dststep = self.svstate.dststep
-        ssubstep = self.svstate.ssubstep
-        dsubstep = self.svstate.dsubstep
-        sv_a_nz = yield self.dec2.sv_a_nz
-        fft_mode = yield self.dec2.use_svp64_fft
-        in1 = yield self.dec2.e.read_reg1.data
-        log("SVP64: VL, subvl, srcstep, dststep, ssubstep, dsybstep, sv_a_nz, "
-            "in1 fft, svp64",
-            vl, subvl, srcstep, dststep, ssubstep, dsubstep,
-            sv_a_nz, in1, fft_mode,
-            self.is_svp64_mode)
-
-        # get predicate mask (all 64 bits)
-        srcmask = dstmask = 0xffff_ffff_ffff_ffff
-
-        pmode = yield self.dec2.rm_dec.predmode
-        pack = yield self.dec2.rm_dec.pack
-        unpack = yield self.dec2.rm_dec.unpack
-        reverse_gear = yield self.dec2.rm_dec.reverse_gear
-        sv_ptype = yield self.dec2.dec.op.SV_Ptype
-        srcpred = yield self.dec2.rm_dec.srcpred
-        dstpred = yield self.dec2.rm_dec.dstpred
-        pred_src_zero = yield self.dec2.rm_dec.pred_sz
-        pred_dst_zero = yield self.dec2.rm_dec.pred_dz
-        if pmode == SVP64PredMode.INT.value:
-            srcmask = dstmask = get_predint(self.gpr, dstpred)
-            if sv_ptype == SVPtype.P2.value:
-                srcmask = get_predint(self.gpr, srcpred)
-        elif pmode == SVP64PredMode.CR.value:
-            srcmask = dstmask = get_predcr(self.crl, dstpred, vl)
-            if sv_ptype == SVPtype.P2.value:
-                srcmask = get_predcr(self.crl, srcpred, vl)
-        # work out if the ssubsteps are completed
-        ssubstart = ssubstep == 0
-        dsubstart = dsubstep == 0
-        log("    pmode", pmode)
-        log("    pack/unpack", pack, unpack)
-        log("    reverse", reverse_gear)
-        log("    ptype", sv_ptype)
-        log("    srcpred", bin(srcpred))
-        log("    dstpred", bin(dstpred))
-        log("    srcmask", bin(srcmask))
-        log("    dstmask", bin(dstmask))
-        log("    pred_sz", bin(pred_src_zero))
-        log("    pred_dz", bin(pred_dst_zero))
-        log("    ssubstart", ssubstart)
-        log("    dsubstart", dsubstart)
-
-        # okaaay, so here we simply advance srcstep (TODO dststep)
-        # this can ONLY be done at the beginning of the "for" loop
-        # (this is all actually a FSM so it's hell to keep track sigh)
-        if ssubstart:
-            # until the predicate mask has a "1" bit... or we run out of VL
-            # let srcstep==VL be the indicator to move to next instruction
-            if not pred_src_zero:
-                while (((1 << srcstep) & srcmask) == 0) and (srcstep != vl):
-                    log("      sskip", bin(1 << srcstep))
-                    srcstep += 1
-        if dsubstart:
-            # same for dststep
-            if not pred_dst_zero:
-                while (((1 << dststep) & dstmask) == 0) and (dststep != vl):
-                    log("      dskip", bin(1 << dststep))
-                    dststep += 1
-
-        # now work out if the relevant mask bits require zeroing
-        if pred_dst_zero:
-            pred_dst_zero = ((1 << dststep) & dstmask) == 0
-        if pred_src_zero:
-            pred_src_zero = ((1 << srcstep) & srcmask) == 0
-
-        # store new srcstep / dststep
-        self.new_srcstep, self.new_dststep = (srcstep, dststep)
-        self.new_ssubstep, self.new_dsubstep = (ssubstep, dsubstep)
-        self.pred_dst_zero, self.pred_src_zero = (pred_dst_zero, pred_src_zero)
-        log("    new srcstep", srcstep)
-        log("    new dststep", dststep)
-        log("    new ssubstep", ssubstep)
-        log("    new dsubstep", dsubstep)
-
     def get_src_dststeps(self):
         """gets srcstep, dststep, and ssubstep, dsubstep
         """
@@ -1793,18 +2064,29 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         dststep = self.svstate.dststep
         ssubstep = self.svstate.ssubstep
         dsubstep = self.svstate.dsubstep
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
         vl = self.svstate.vl
         subvl = yield self.dec2.rm_dec.rm_in.subvl
+        rm_mode = yield self.dec2.rm_dec.mode
+        ff_inv = yield self.dec2.rm_dec.inv
+        cr_bit = yield self.dec2.rm_dec.cr_sel
         log("    srcstep", srcstep)
         log("    dststep", dststep)
+        log("        pack", pack)
+        log("      unpack", unpack)
         log("    ssubstep", ssubstep)
         log("    dsubstep", dsubstep)
         log("         vl", vl)
         log("      subvl", subvl)
+        log("    rm_mode", rm_mode)
+        log("        inv", ff_inv)
+        log("     cr_bit", cr_bit)
 
         # check if end reached (we let srcstep overrun, above)
         # nothing needs doing (TODO zeroing): just do next instruction
-        return srcstep == vl or dststep == vl
+        return ((ssubstep == subvl and srcstep == vl) or
+                (dsubstep == subvl and dststep == vl))
 
     def svstate_post_inc(self, insn_name, vf=0):
         # check if SV "Vertical First" mode is enabled
@@ -1824,6 +2106,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         dststep = self.svstate.dststep
         ssubstep = self.svstate.ssubstep
         dsubstep = self.svstate.dsubstep
+        pack = self.svstate.pack
+        unpack = self.svstate.unpack
         rm_mode = yield self.dec2.rm_dec.mode
         reverse_gear = yield self.dec2.rm_dec.reverse_gear
         sv_ptype = yield self.dec2.dec.op.SV_Ptype
@@ -1836,6 +2120,8 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         log("    svstate.dststep", dststep)
         log("    svstate.ssubstep", ssubstep)
         log("    svstate.dsubstep", dsubstep)
+        log("    svstate.pack", pack)
+        log("    svstate.unpack", unpack)
         log("    mode", rm_mode)
         log("    reverse", reverse_gear)
         log("    out_vec", out_vec)
@@ -1857,8 +2143,12 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         else:
             svp64_is_vector = out_vec
         # loops end at the first "hit" (source or dest)
-        loopend = ((srcstep == vl-1 and ssubstep == subvl) or
-                   (dststep == vl-1 and dsubstep == subvl))
+        end_src = srcstep == vl-1
+        end_dst = dststep == vl-1
+        loopend = ((end_src and ssubstep == subvl) or
+                   (end_dst and dsubstep == subvl))
+        log("loopend", svp64_is_vector, loopend, end_src, end_dst,
+            ssubstep == subvl, dsubstep == subvl)
         if not svp64_is_vector or loopend:
             # reset loop to zero and update NIA
             self.svp64_reset_loop()
@@ -1867,8 +2157,9 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
             return True
 
         # still looping, advance and update NIA
-        yield from self.advance_svstate_steps()
+        yield from self.advance_svstate_steps(end_src, end_dst)
         self.namespace['SVSTATE'] = self.svstate
+
         # not an SVP64 branch, so fix PC (NIA==CIA) for next loop
         # (by default, NIA is CIA+4 if v3.0B or CIA+8 if SVP64)
         # this way we keep repeating the same instruction (with new steps)
@@ -1877,34 +2168,10 @@ class ISACaller(ISACallerHelper, ISAFPHelpers):
         log("end of sub-pc call", self.namespace['CIA'], self.namespace['NIA'])
         return False  # DO NOT allow PC update whilst Sub-PC loop running
 
-    def advance_svstate_steps(self, end_src=False, end_dst=False):
-        """ advance sub/steps. note that Pack/Unpack *INVERTS* the order.
-        TODO when Pack/Unpack is set, substep becomes the *outer* loop
-        """
-        subvl = yield self.dec2.rm_dec.rm_in.subvl
-        # first source step
-        ssubstep = self.svstate.ssubstep
-        end_sub = ssubstep == subvl
-        if end_sub:
-            if not end_src:
-                self.svstate.srcstep += SelectableInt(1, 7)
-            self.svstate.ssubstep = SelectableInt(0, 2)  # reset
-        else:
-            self.svstate.ssubstep += SelectableInt(1, 2) # advance ssubstep
-        # now dest step
-        dsubstep = self.svstate.dsubstep
-        end_sub = dsubstep == subvl
-        if end_sub:
-            if not end_dst:
-                self.svstate.dststep += SelectableInt(1, 7)
-            self.svstate.dsubstep = SelectableInt(0, 2)  # reset
-        else:
-            self.svstate.dsubstep += SelectableInt(1, 2) # advance ssubstep
-
     def update_pc_next(self):
         # UPDATE program counter
         self.pc.update(self.namespace, self.is_svp64_mode)
-        self.svstate.spr = self.namespace['SVSTATE']
+        #self.svstate.spr = self.namespace['SVSTATE']
         log("end of call", self.namespace['CIA'],
             self.namespace['NIA'],
             self.namespace['SVSTATE'])