move ffadds to not conflict with fptrans -- makes space for min/max/fmod/remainder ops

[openpower-isa.git] / src / openpower / decoder / power_decoder2.py
diff --git a/src/openpower/decoder/power_decoder2.py b/src/openpower/decoder/power_decoder2.py

index 2d315522f839eab3b2d904fe642a47c688f9b89f..8cd7db702ba5be574173e704162f643317fefa05 100644 (file)
--- a/src/openpower/decoder/power_decoder2.py
+++ b/src/openpower/decoder/power_decoder2.py
@@ -28,11 +28,11 @@ from openpower.decoder.power_decoder import (create_pdecode,
                                               create_pdecode_svp64_ldst,
                                               PowerOp)
  from openpower.decoder.power_enums import (MicrOp, CryIn, Function,
-                                     CRInSel, CROutSel,
-                                     LdstLen, In1Sel, In2Sel, In3Sel,
-                                     OutSel, SPRfull, SPRreduced,
-                                     RC, SVP64LDSTmode, LDSTMode,
-                                     SVEXTRA, SVEtype, SVPtype)
+                                           CRInSel, CROutSel,
+                                           LdstLen, In1Sel, In2Sel, In3Sel,
+                                           OutSel, SPRfull, SPRreduced,
+                                           RCOE, SVP64LDSTmode, LDSTMode,
+                                           SVEXTRA, SVEtype, SVPtype)
  from openpower.decoder.decode2execute1 import (Decode2ToExecute1Type, Data,
                                                 Decode2ToOperand)
  
@@ -41,7 +41,7 @@ from openpower.consts import (MSR, SPEC, EXTRA2, EXTRA3, SVP64P, field,
                                FastRegsEnum, XERRegsEnum, TT)
  
  from openpower.state import CoreState
-from openpower.util import (spr_to_fast, log)
+from openpower.util import (spr_to_fast, spr_to_state, log)
  
  
  def decode_spr_num(spr):
@@ -57,7 +57,8 @@ def instr_is_priv(m, op, insn):
          with m.Case(MicrOp.OP_ATTN, MicrOp.OP_MFMSR, MicrOp.OP_MTMSRD,
                      MicrOp.OP_MTMSR, MicrOp.OP_RFID):
              comb += is_priv_insn.eq(1)
-        with m.Case(MicrOp.OP_TLBIE) : comb += is_priv_insn.eq(1)
+        with m.Case(MicrOp.OP_TLBIE):
+            comb += is_priv_insn.eq(1)
          with m.Case(MicrOp.OP_MFSPR, MicrOp.OP_MTSPR):
              with m.If(insn[20]):  # field XFX.spr[-1] i think
                  comb += is_priv_insn.eq(1)
@@ -77,7 +78,8 @@ class SPRMap(Elaboratable):
  
          self.spr_i = Signal(10, reset_less=True)
          self.spr_o = Data(SPR, name="spr_o")
-        self.fast_o = Data(3, name="fast_o")
+        self.fast_o = Data(4, name="fast_o")
+        self.state_o = Data(3, name="state_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -94,6 +96,10 @@ class SPRMap(Elaboratable):
                  with m.Case(x.value):
                      m.d.comb += self.fast_o.data.eq(v)
                      m.d.comb += self.fast_o.ok.eq(1)
+            for x, v in spr_to_state.items():
+                with m.Case(x.value):
+                    m.d.comb += self.state_o.data.eq(v)
+                    m.d.comb += self.state_o.ok.eq(1)
          return m
  
  
@@ -115,7 +121,8 @@ class DecodeA(Elaboratable):
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(5, name="reg_a")
          self.spr_out = Data(SPR, "spr_a")
-        self.fast_out = Data(3, "fast_a")
+        self.fast_out = Data(4, "fast_a")
+        self.state_out = Data(3, "state_a")
          self.sv_nz = Signal(1)
  
      def elaborate(self, platform):
@@ -180,6 +187,7 @@ class DecodeA(Elaboratable):
                  comb += sprmap.spr_i.eq(spr)
                  comb += self.spr_out.eq(sprmap.spr_o)
                  comb += self.fast_out.eq(sprmap.fast_o)
+                comb += self.state_out.eq(sprmap.state_o)
  
          return m
  
@@ -195,7 +203,7 @@ class DecodeAImm(Elaboratable):
          self.dec = dec
          self.sel_in = Signal(In1Sel, reset_less=True)
          self.immz_out = Signal(reset_less=True)
-        self.sv_nz = Signal(1) # EXTRA bits from SVP64
+        self.sv_nz = Signal(1)  # EXTRA bits from SVP64
  
      def elaborate(self, platform):
          m = Module()
@@ -205,8 +213,8 @@ class DecodeAImm(Elaboratable):
          ra = Signal(5, reset_less=True)
          comb += ra.eq(self.dec.RA)
          with m.If((self.sel_in == In1Sel.RA_OR_ZERO) &
-                    (ra == Const(0, 5)) &
-                    (self.sv_nz == Const(0, 1))):
+                  (ra == Const(0, 5)) &
+                  (self.sv_nz == Const(0, 1))):
              comb += self.immz_out.eq(1)
  
          return m
@@ -227,8 +235,8 @@ class DecodeB(Elaboratable):
          self.sel_in = Signal(In2Sel, reset_less=True)
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(7, "reg_b")
-        self.reg_isvec = Signal(1, name="reg_b_isvec") # TODO: in reg_out
-        self.fast_out = Data(3, "fast_b")
+        self.reg_isvec = Signal(1, name="reg_b_isvec")  # TODO: in reg_out
+        self.fast_out = Data(4, "fast_b")
  
      def elaborate(self, platform):
          m = Module()
@@ -268,6 +276,7 @@ class DecodeB(Elaboratable):
  class DecodeBImm(Elaboratable):
      """DecodeB immediate from instruction
      """
+
      def __init__(self, dec):
          self.dec = dec
          self.sel_in = Signal(In2Sel, reset_less=True)
@@ -279,7 +288,7 @@ class DecodeBImm(Elaboratable):
  
          # select Register B Immediate
          with m.Switch(self.sel_in):
-            with m.Case(In2Sel.CONST_UI): # unsigned
+            with m.Case(In2Sel.CONST_UI):  # unsigned
                  comb += self.imm_out.data.eq(self.dec.UI)
                  comb += self.imm_out.ok.eq(1)
              with m.Case(In2Sel.CONST_SI):  # sign-extended 16-bit
@@ -292,35 +301,38 @@ class DecodeBImm(Elaboratable):
                  comb += si_hi.eq(self.dec.SI << 16)
                  comb += self.imm_out.data.eq(exts(si_hi, 32, 64))
                  comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_UI_HI): # unsigned
+            with m.Case(In2Sel.CONST_UI_HI):  # unsigned
                  ui = Signal(16, reset_less=True)
                  comb += ui.eq(self.dec.UI)
                  comb += self.imm_out.data.eq(ui << 16)
                  comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_LI): # sign-extend 24+2=26 bit
+            with m.Case(In2Sel.CONST_LI):  # sign-extend 24+2=26 bit
                  li = Signal(26, reset_less=True)
                  comb += li.eq(self.dec.LI << 2)
                  comb += self.imm_out.data.eq(exts(li, 26, 64))
                  comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_BD): # sign-extend (14+2)=16 bit
+            with m.Case(In2Sel.CONST_BD):  # sign-extend (14+2)=16 bit
                  bd = Signal(16, reset_less=True)
                  comb += bd.eq(self.dec.BD << 2)
                  comb += self.imm_out.data.eq(exts(bd, 16, 64))
                  comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_DS): # sign-extended (14+2=16) bit
+            with m.Case(In2Sel.CONST_DS):  # sign-extended (14+2=16) bit
                  ds = Signal(16, reset_less=True)
                  comb += ds.eq(self.dec.DS << 2)
                  comb += self.imm_out.data.eq(exts(ds, 16, 64))
                  comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_M1): # signed (-1)
+            with m.Case(In2Sel.CONST_M1):  # signed (-1)
                  comb += self.imm_out.data.eq(~Const(0, 64))  # all 1s
                  comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_SH): # unsigned - for shift
+            with m.Case(In2Sel.CONST_SH):  # unsigned - for shift
                  comb += self.imm_out.data.eq(self.dec.sh)
                  comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_SH32): # unsigned - for shift
+            with m.Case(In2Sel.CONST_SH32):  # unsigned - for shift
                  comb += self.imm_out.data.eq(self.dec.SH32)
                  comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_XBI):  # unsigned - for grevi
+                comb += self.imm_out.data.eq(self.dec.FormXB.XBI)
+                comb += self.imm_out.ok.eq(1)
  
          return m
  
@@ -362,6 +374,10 @@ class DecodeC(Elaboratable):
              with m.Case(In3Sel.RC):
                  comb += reg.data.eq(self.dec.RC)
                  comb += reg.ok.eq(1)
+            with m.Case(In3Sel.RT):
+                # for TLI-form ternlogi
+                comb += reg.data.eq(self.dec.RT)
+                comb += reg.ok.eq(1)
  
          return m
  
@@ -369,7 +385,7 @@ class DecodeC(Elaboratable):
  class DecodeOut(Elaboratable):
      """DecodeOut from instruction
  
-    decodes output register RA, RT or SPR
+    decodes output register RA, RT, FRS, FRT, or SPR
      """
  
      def __init__(self, dec, op, regreduce_en):
@@ -384,7 +400,8 @@ class DecodeOut(Elaboratable):
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(5, "reg_o")
          self.spr_out = Data(SPR, "spr_o")
-        self.fast_out = Data(3, "fast_o")
+        self.fast_out = Data(4, "fast_o")
+        self.state_out = Data(3, "state_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -395,6 +412,9 @@ class DecodeOut(Elaboratable):
  
          # select Register out field
          with m.Switch(self.sel_in):
+            with m.Case(OutSel.FRS):
+                comb += reg.data.eq(self.dec.FRS)
+                comb += reg.ok.eq(1)
              with m.Case(OutSel.FRT):
                  comb += reg.data.eq(self.dec.FRT)
                  comb += reg.ok.eq(1)
@@ -412,6 +432,7 @@ class DecodeOut(Elaboratable):
                      comb += sprmap.spr_i.eq(spr)
                      comb += self.spr_out.eq(sprmap.spr_o)
                      comb += self.fast_out.eq(sprmap.fast_o)
+                    comb += self.state_out.eq(sprmap.state_o)
  
          # determine Fast Reg
          with m.Switch(op.internal_op):
@@ -449,13 +470,13 @@ class DecodeOut2(Elaboratable):
          self.dec = dec
          self.op = op
          self.sel_in = Signal(OutSel, reset_less=True)
-        self.svp64_fft_mode = Signal(reset_less=True) # SVP64 FFT mode
+        self.svp64_fft_mode = Signal(reset_less=True)  # SVP64 FFT mode
          self.lk = Signal(reset_less=True)
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(5, "reg_o2")
-        self.fp_madd_en = Signal(reset_less=True) # FFT instruction detected
-        self.fast_out = Data(3, "fast_o2")
-        self.fast_out3 = Data(3, "fast_o3")
+        self.fp_madd_en = Signal(reset_less=True)  # FFT instruction detected
+        self.fast_out = Data(4, "fast_o2")
+        self.fast_out3 = Data(4, "fast_o3")
  
      def elaborate(self, platform):
          m = Module()
@@ -486,12 +507,12 @@ class DecodeOut2(Elaboratable):
              with m.Case(MicrOp.OP_RFID):
                  comb += self.fast_out.data.eq(FastRegsEnum.SRR1)  # SRR1
                  comb += self.fast_out.ok.eq(1)
-                comb += self.fast_out3.data.eq(FastRegsEnum.SVSRR0) # SVSRR0
+                comb += self.fast_out3.data.eq(FastRegsEnum.SVSRR0)  # SVSRR0
                  comb += self.fast_out3.ok.eq(1)
  
          # SVP64 FFT mode, FP mul-add: 2nd output reg (FRS) same as FRT
          # will be offset by VL in hardware
-        #with m.Case(MicrOp.OP_FP_MADD):
+        # with m.Case(MicrOp.OP_FP_MADD):
          with m.If(self.svp64_fft_mode):
              comb += self.reg_out.data.eq(self.dec.FRT)
              comb += self.reg_out.ok.eq(1)
@@ -508,7 +529,7 @@ class DecodeRC(Elaboratable):
  
      def __init__(self, dec):
          self.dec = dec
-        self.sel_in = Signal(RC, reset_less=True)
+        self.sel_in = Signal(RCOE, reset_less=True)
          self.insn_in = Signal(32, reset_less=True)
          self.rc_out = Data(1, "rc")
  
@@ -518,13 +539,13 @@ class DecodeRC(Elaboratable):
  
          # select Record bit out field
          with m.Switch(self.sel_in):
-            with m.Case(RC.RC):
+            with m.Case(RCOE.RC, RCOE.RC_ONLY):
                  comb += self.rc_out.data.eq(self.dec.Rc)
                  comb += self.rc_out.ok.eq(1)
-            with m.Case(RC.ONE):
+            with m.Case(RCOE.ONE):
                  comb += self.rc_out.data.eq(1)
                  comb += self.rc_out.ok.eq(1)
-            with m.Case(RC.NONE):
+            with m.Case(RCOE.NONE):
                  comb += self.rc_out.data.eq(0)
                  comb += self.rc_out.ok.eq(1)
  
@@ -534,48 +555,30 @@ class DecodeRC(Elaboratable):
  class DecodeOE(Elaboratable):
      """DecodeOE from instruction
  
-    decodes OE field: uses RC decode detection which might not be good
-
-    -- For now, use "rc" in the decode table to decide whether oe exists.
-    -- This is not entirely correct architecturally: For mulhd and
-    -- mulhdu, the OE field is reserved. It remains to be seen what an
-    -- actual POWER9 does if we set it on those instructions, for now we
-    -- test that further down when assigning to the multiplier oe input.
+    decodes OE field: the RC selector (RCOE) now has a separate RC_ONLY
+    value.  only the RCOE.RC selection passes the instruction's OE bit
+    through; RC_ONLY (and every other selection) does *NOT* decode OE.
      """
  
      def __init__(self, dec, op):
          self.dec = dec
          self.op = op
-        self.sel_in = Signal(RC, reset_less=True)
+        self.sel_in = Signal(RCOE, reset_less=True)
          self.insn_in = Signal(32, reset_less=True)
          self.oe_out = Data(1, "oe")
  
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
-        op = self.op
  
-        with m.Switch(op.internal_op):
-
-            # mulhw, mulhwu, mulhd, mulhdu - these *ignore* OE
-            # also rotate
-            # XXX ARGH! ignoring OE causes incompatibility with microwatt
-            # http://lists.libre-soc.org/pipermail/libre-soc-dev/2020-August/000302.html
-            with m.Case(MicrOp.OP_MUL_H64, MicrOp.OP_MUL_H32,
-                        MicrOp.OP_EXTS, MicrOp.OP_CNTZ,
-                        MicrOp.OP_SHL, MicrOp.OP_SHR, MicrOp.OP_RLC,
-                        MicrOp.OP_LOAD, MicrOp.OP_STORE,
-                        MicrOp.OP_RLCL, MicrOp.OP_RLCR,
-                        MicrOp.OP_EXTSWSLI):
-                pass
-
-            # all other ops decode OE field
+        with m.Switch(self.sel_in):
+            with m.Case(RCOE.RC):
+                comb += self.oe_out.data.eq(self.dec.OE)
+                comb += self.oe_out.ok.eq(1)
              with m.Default():
-                # select OE bit out field
-                with m.Switch(self.sel_in):
-                    with m.Case(RC.RC):
-                        comb += self.oe_out.data.eq(self.dec.OE)
-                        comb += self.oe_out.ok.eq(1)
+                # default: clear OE.
+                comb += self.oe_out.data.eq(0)
+                comb += self.oe_out.ok.eq(0)
  
          return m
  
@@ -596,14 +599,14 @@ class DecodeCRIn(Elaboratable):
          self.cr_bitfield_b = Data(3, "cr_bitfield_b")
          self.cr_bitfield_o = Data(3, "cr_bitfield_o")
          self.whole_reg = Data(8,  "cr_fxm")
-        self.sv_override = Signal(2, reset_less=True) # do not do EXTRA spec
+        self.sv_override = Signal(2, reset_less=True)  # do not do EXTRA spec
  
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
          op = self.op
          m.submodules.ppick = ppick = PriorityPicker(8, reverse_i=True,
-                                                       reverse_o=True)
+                                                    reverse_o=True)
  
          # zero-initialisation
          comb += self.cr_bitfield.ok.eq(0)
@@ -617,11 +620,11 @@ class DecodeCRIn(Elaboratable):
              with m.Case(CRInSel.NONE):
                  pass  # No bitfield activated
              with m.Case(CRInSel.CR0):
-                comb += self.cr_bitfield.data.eq(0) # CR0 (MSB0 numbering)
+                comb += self.cr_bitfield.data.eq(0)  # CR0 (MSB0 numbering)
                  comb += self.cr_bitfield.ok.eq(1)
                  comb += self.sv_override.eq(1)
              with m.Case(CRInSel.CR1):
-                comb += self.cr_bitfield.data.eq(1) # CR1 (MSB0 numbering)
+                comb += self.cr_bitfield.data.eq(1)  # CR1 (MSB0 numbering)
                  comb += self.cr_bitfield.ok.eq(1)
                  comb += self.sv_override.eq(2)
              with m.Case(CRInSel.BI):
@@ -643,7 +646,7 @@ class DecodeCRIn(Elaboratable):
              with m.Case(CRInSel.WHOLE_REG):
                  comb += self.whole_reg.ok.eq(1)
                  move_one = Signal(reset_less=True)
-                comb += move_one.eq(self.insn_in[20]) # MSB0 bit 11
+                comb += move_one.eq(self.insn_in[20])  # MSB0 bit 11
                  with m.If((op.internal_op == MicrOp.OP_MFCR) & move_one):
                      # must one-hot the FXM field
                      comb += ppick.i.eq(self.dec.FXM)
@@ -670,14 +673,14 @@ class DecodeCROut(Elaboratable):
          self.insn_in = Signal(32, reset_less=True)
          self.cr_bitfield = Data(3, "cr_bitfield")
          self.whole_reg = Data(8,  "cr_fxm")
-        self.sv_override = Signal(2, reset_less=True) # do not do EXTRA spec
+        self.sv_override = Signal(2, reset_less=True)  # do not do EXTRA spec
  
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
          op = self.op
          m.submodules.ppick = ppick = PriorityPicker(8, reverse_i=True,
-                                                       reverse_o=True)
+                                                    reverse_o=True)
  
          comb += self.cr_bitfield.ok.eq(0)
          comb += self.whole_reg.ok.eq(0)
@@ -693,11 +696,11 @@ class DecodeCROut(Elaboratable):
              with m.Case(CROutSel.NONE):
                  pass  # No bitfield activated
              with m.Case(CROutSel.CR0):
-                comb += self.cr_bitfield.data.eq(0) # CR0 (MSB0 numbering)
+                comb += self.cr_bitfield.data.eq(0)  # CR0 (MSB0 numbering)
                  comb += self.cr_bitfield.ok.eq(self.rc_in)  # only when RC=1
                  comb += self.sv_override.eq(1)
              with m.Case(CROutSel.CR1):
-                comb += self.cr_bitfield.data.eq(1) # CR1 (MSB0 numbering)
+                comb += self.cr_bitfield.data.eq(1)  # CR1 (MSB0 numbering)
                  comb += self.cr_bitfield.ok.eq(self.rc_in)  # only when RC=1
                  comb += self.sv_override.eq(2)
              with m.Case(CROutSel.BF):
@@ -717,7 +720,7 @@ class DecodeCROut(Elaboratable):
                          with m.If(ppick.en_o):
                              comb += self.whole_reg.data.eq(ppick.o)
                          with m.Else():
-                            comb += self.whole_reg.data.eq(0b00000001) # CR7
+                            comb += self.whole_reg.data.eq(0b00000001)  # CR7
                      with m.Else():
                          comb += self.whole_reg.data.eq(self.dec.FXM)
                  with m.Else():
@@ -726,6 +729,7 @@ class DecodeCROut(Elaboratable):
  
          return m
  
+
  # dictionary of Input Record field names that, if they exist,
  # will need a corresponding CSV Decoder file column (actually, PowerOp)
  # to be decoded (this includes the single bit names)
@@ -746,6 +750,7 @@ record_names = {'insn_type': 'internal_op',
                  'is_signed': 'sgn',
                  'lk': 'lk',
                  'data_len': 'ldst_len',
+                'reserve': 'rsrv',
                  'byte_reverse': 'br',
                  'sign_extend': 'sgn_ext',
                  'ldst_mode': 'upd',
@@ -757,20 +762,20 @@ class PowerDecodeSubset(Elaboratable):
  
      only fields actually requested are copied over. hence, "subset" (duh).
      """
+
      def __init__(self, dec, opkls=None, fn_name=None, final=False, state=None,
-                            svp64_en=True, regreduce_en=False):
+                 svp64_en=True, regreduce_en=False):
  
          self.svp64_en = svp64_en
          self.regreduce_en = regreduce_en
          if svp64_en:
-            self.is_svp64_mode = Signal() # mark decoding as SVP64 Mode
-            self.use_svp64_ldst_dec = Signal() # must use LDST decoder
+            self.is_svp64_mode = Signal()  # mark decoding as SVP64 Mode
              self.use_svp64_fft = Signal()      # FFT Mode
-            self.sv_rm = SVP64Rec(name="dec_svp64") # SVP64 RM field
+            self.sv_rm = SVP64Rec(name="dec_svp64")  # SVP64 RM field
              self.rm_dec = SVP64RMModeDecode("svp64_rm_dec")
              # set these to the predicate mask bits needed for the ALU
-            self.pred_sm = Signal() # TODO expand to SIMD mask width
-            self.pred_dm = Signal() # TODO expand to SIMD mask width
+            self.pred_sm = Signal()  # TODO expand to SIMD mask width
+            self.pred_dm = Signal()  # TODO expand to SIMD mask width
          self.sv_a_nz = Signal(1)
          self.final = final
          self.opkls = opkls
@@ -789,9 +794,9 @@ class PowerDecodeSubset(Elaboratable):
          # alternatives.  useful for PCR (Program Compatibility Register)
          # amongst other things
          if svp64_en:
-            conditions = {'SVP64BREV': self.use_svp64_ldst_dec,
-                          'SVP64FFT': self.use_svp64_fft,
-                         }
+            conditions = {
+                          # XXX NO 'SVP64FFT': self.use_svp64_fft,
+                          }
          else:
              conditions = None
  
@@ -803,8 +808,8 @@ class PowerDecodeSubset(Elaboratable):
          # create decoder if one not already given
          if dec is None:
              dec = create_pdecode(name=fn_name, col_subset=col_subset,
-                                      row_subset=row_subset,
-                                      conditions=conditions)
+                                 row_subset=row_subset,
+                                 conditions=conditions)
          self.dec = dec
  
          # set up a copy of the PowerOp
@@ -816,11 +821,11 @@ class PowerDecodeSubset(Elaboratable):
          self.state = state
  
      def get_col_subset(self, do):
-        subset = { 'cr_in', 'cr_out', 'rc_sel'} # needed, non-optional
+        subset = {'cr_in', 'cr_out', 'rc_sel'}  # needed, non-optional
          for k, v in record_names.items():
              if hasattr(do, k):
                  subset.add(v)
-        log ("get_col_subset", self.fn_name, do.fields, subset)
+        log("get_col_subset", self.fn_name, do.fields, subset)
          return subset
  
      def rowsubsetfn(self, opcode, row):
@@ -838,8 +843,13 @@ class PowerDecodeSubset(Elaboratable):
                  # really this should be done by modifying the CSV syntax
                  # to support multiple tasks (unit column multiple entries)
                  # see https://bugs.libre-soc.org/show_bug.cgi?id=310
-               (self.fn_name == 'MMU' and row['unit'] == 'SPR' and
-                row['internal op'] in ['OP_MTSPR', 'OP_MFSPR'])
+                (self.fn_name == 'MMU' and row['unit'] == 'SPR' and
+                 row['internal op'] in ['OP_MTSPR', 'OP_MFSPR']) or
+                # urrr... and the KAIVB SPR, which must also be redirected
+                # (to the TRAP pipeline)
+                # see https://bugs.libre-soc.org/show_bug.cgi?id=859
+                (self.fn_name == 'TRAP' and row['unit'] == 'SPR' and
+                 row['internal op'] in ['OP_MTSPR', 'OP_MFSPR'])
                  )
  
      def ports(self):
@@ -847,8 +857,7 @@ class PowerDecodeSubset(Elaboratable):
          if self.svp64_en:
              ports += self.sv_rm.ports()
              ports.append(self.is_svp64_mode)
-            ports.append(self.use_svp64_ldst_dec )
-            ports.append(self.use_svp64_fft )
+            ports.append(self.use_svp64_fft)
          return ports
  
      def needs_field(self, field, op_field):
@@ -892,7 +901,7 @@ class PowerDecodeSubset(Elaboratable):
              else:
                  name = self.fn_name + "tmp"
              self.e_tmp = Decode2ToExecute1Type(name=name, opkls=self.opkls,
-                                           regreduce_en=self.regreduce_en)
+                                               regreduce_en=self.regreduce_en)
  
          # set up submodule decoders
          m.submodules.dec = dec = self.dec
@@ -928,24 +937,34 @@ class PowerDecodeSubset(Elaboratable):
          # for SPR set/get
          fn = self.op_get("function_unit")
          spr = Signal(10, reset_less=True)
-        comb += spr.eq(decode_spr_num(self.dec.SPR)) # from XFX
+        comb += spr.eq(decode_spr_num(self.dec.SPR))  # from XFX
  
          # Microwatt doesn't implement the partition table
          # instead has PRTBL register (SPR) to point to process table
+        # Kestrel has a KAIVB SPR to "rebase" exceptions. rebasing is normally
+        # done with Hypervisor Mode which is not implemented (yet)
          is_spr_mv = Signal()
          is_mmu_spr = Signal()
+        is_trap_spr = Signal()
          comb += is_spr_mv.eq((internal_op == MicrOp.OP_MTSPR) |
                               (internal_op == MicrOp.OP_MFSPR))
          comb += is_mmu_spr.eq((spr == SPR.DSISR.value) |
                                (spr == SPR.DAR.value) |
                                (spr == SPR.PRTBL.value) |
                                (spr == SPR.PIDR.value))
+        comb += is_trap_spr.eq((spr == SPR.KAIVB.value)
+                              )
          # MMU must receive MMU SPRs
          with m.If(is_spr_mv & (fn == Function.SPR) & is_mmu_spr):
-            comb += self.do_copy("fn_unit", Function.NONE)
-            comb += self.do_copy("insn_type", MicrOp.OP_ILLEGAL)
-        # SPR pipe must *not* receive MMU SPRs
-        with m.Elif(is_spr_mv & (fn == Function.MMU) & ~is_mmu_spr):
+            comb += self.do_copy("fn_unit", Function.MMU)
+            comb += self.do_copy("insn_type", internal_op)
+        # TRAP must receive TRAP SPR KAIVB
+        with m.If(is_spr_mv & (fn == Function.SPR) & is_trap_spr):
+            comb += self.do_copy("fn_unit", Function.TRAP)
+            comb += self.do_copy("insn_type", internal_op)
+        # SPR pipe must *not* receive MMU or TRAP SPRs
+        with m.Elif(is_spr_mv & ((fn == Function.MMU) & ~is_mmu_spr) &
+                                ((fn == Function.TRAP) & ~is_trap_spr)):
              comb += self.do_copy("fn_unit", Function.NONE)
              comb += self.do_copy("insn_type", MicrOp.OP_ILLEGAL)
          # all others ok
@@ -961,7 +980,7 @@ class PowerDecodeSubset(Elaboratable):
          if self.needs_field("imm_data", "in2_sel"):
              m.submodules.dec_bi = dec_bi = DecodeBImm(self.dec)
              comb += dec_bi.sel_in.eq(self.op_get("in2_sel"))
-            comb += self.do_copy("imm_data", dec_bi.imm_out) # imm in RB
+            comb += self.do_copy("imm_data", dec_bi.imm_out)  # imm in RB
  
          # rc and oe out
          comb += self.do_copy("rc", dec_rc.rc_out)
@@ -977,7 +996,7 @@ class PowerDecodeSubset(Elaboratable):
          rc_out = self.dec_rc.rc_out.data
          with m.Switch(self.op_get("cr_out")):
              with m.Case(CROutSel.CR0, CROutSel.CR1):
-                comb += self.do_copy("write_cr0", rc_out) # only when RC=1
+                comb += self.do_copy("write_cr0", rc_out)  # only when RC=1
              with m.Case(CROutSel.BF, CROutSel.BT):
                  comb += self.do_copy("write_cr0", 1)
  
@@ -992,42 +1011,35 @@ class PowerDecodeSubset(Elaboratable):
              # the alternative decoder, svdecldst. what a mess... *sigh*
              sv_ptype = self.op_get("SV_Ptype")
              fn = self.op_get("function_unit")
-            # detect major opcode for LDs: include 58 here. from CSV files.
-            # BLECH! TODO: these should be done using "mini decoders",
-            # using row and column subsets
-            is_major_ld = Signal()
-            major = Signal(6) # bits... errr... MSB0 0..5 which is 26:32 python
-            comb += major.eq(self.dec.opcode_in[26:32])
-            comb += is_major_ld.eq((major == 34) | (major == 35) |
-                                   (major == 50) | (major == 51) |
-                                   (major == 48) | (major == 49) |
-                                   (major == 42) | (major == 43) |
-                                   (major == 40) | (major == 41) |
-                                   (major == 32) | (major == 33) |
-                                   (major == 58))
-            with m.If(self.is_svp64_mode & is_major_ld):
-                # straight-up: "it's a LD".  this gives enough info
-                # for SVP64 RM Mode decoding to detect LD/ST, and
-                # consequently detect the BITREVERSE mode. sigh
-                comb += rm_dec.fn_in.eq(Function.LDST)
-            with m.Else():
-                comb += rm_dec.fn_in.eq(fn) # decode needs to know Fn type
-            comb += rm_dec.ptype_in.eq(sv_ptype) # Single/Twin predicated
-            comb += rm_dec.rc_in.eq(rc_out) # Rc=1
-            comb += rm_dec.rm_in.eq(self.sv_rm) # SVP64 RM mode
+            comb += rm_dec.fn_in.eq(fn)  # decode needs to know Fn type
+            comb += rm_dec.ptype_in.eq(sv_ptype)  # Single/Twin predicated
+            comb += rm_dec.rc_in.eq(rc_out)  # Rc=1
+            comb += rm_dec.rm_in.eq(self.sv_rm)  # SVP64 RM mode
              if self.needs_field("imm_data", "in2_sel"):
                  bzero = dec_bi.imm_out.ok & ~dec_bi.imm_out.data.bool()
-                comb += rm_dec.ldst_imz_in.eq(bzero) # B immediate is zero
+                comb += rm_dec.ldst_imz_in.eq(bzero)  # B immediate is zero
  
              # main PowerDecoder2 determines if different SVP64 modes enabled
-            if not self.final:
-                # if bit-reverse mode requested
-                bitrev = rm_dec.ldstmode == SVP64LDSTmode.BITREVERSE
-                comb += self.use_svp64_ldst_dec.eq(bitrev)
-            # detect if SVP64 FFT mode enabled (really bad hack)
-            xo = Signal(1) # 1 bit from Major 59 XO field == 0b0XXXX
-            comb += xo.eq(self.dec.opcode_in[5])
-            comb += self.use_svp64_fft.eq((major == 59) & (xo == 0b0))
+            # detect if SVP64 FFT mode enabled (really bad hack),
+            # exclude fcfids and others
+            # XXX this is a REALLY bad hack, REALLY has to be done better.
+            # likely with a sub-decoder.
+            # what this ultimately does is enable the 2nd implicit register
+            # (FRS) for SVP64-decoding.  all of these instructions are
+            # 3-in 2-out but there is not enough room either in the
+            # opcode *or* EXTRA2/3 to specify a 5th operand.
+            major = Signal(6)
+            comb += major.eq(self.dec.opcode_in[26:32])
+            xo = Signal(10)
+            comb += xo.eq(self.dec.opcode_in[1:11])
+            comb += self.use_svp64_fft.eq((major == 59) & xo.matches(
+                '-----00100',  # ffmsubs
+                '-----00101',  # ffmadds
+                '-----00110',  # ffnmsubs
+                '-----00111',  # ffnmadds
+                '1111100000',  # ffadds
+                '-----11011',  # fdmadds
+            ))
  
          # decoded/selected instruction flags
          comb += self.do_copy("data_len", self.op_get("ldst_len"))
@@ -1045,11 +1057,12 @@ class PowerDecodeSubset(Elaboratable):
          comb += self.do_copy("byte_reverse", self.op_get("br"))
          comb += self.do_copy("sign_extend", self.op_get("sgn_ext"))
          comb += self.do_copy("ldst_mode", self.op_get("upd"))  # LD/ST mode
+        comb += self.do_copy("reserve", self.op_get("rsrv"))  # atomic
  
          # copy over SVP64 input record fields (if they exist)
          if self.svp64_en:
              # TODO, really do we have to do these explicitly?? sigh
-            #for (field, _) in sv_input_record_layout:
+            # for (field, _) in sv_input_record_layout:
              #    comb += self.do_copy(field, self.rm_dec.op_get(field))
              comb += self.do_copy("sv_saturate", self.rm_dec.saturate)
              comb += self.do_copy("sv_Ptype", self.rm_dec.ptype_in)
@@ -1093,13 +1106,17 @@ class PowerDecode2(PowerDecodeSubset):
      to make this work, TestIssuer must notice "exception.happened"
      after the (failed) LD/ST and copies the LDSTException info from
      the output, into here (PowerDecoder2).  without incrementing PC.
+
+    also instr_fault works the same way: the instruction is "rewritten"
+    so that the "fake" op that gets created is OP_FETCH_FAILED
      """
  
      def __init__(self, dec, opkls=None, fn_name=None, final=False,
-                            state=None, svp64_en=True, regreduce_en=False):
+                 state=None, svp64_en=True, regreduce_en=False):
          super().__init__(dec, opkls, fn_name, final, state, svp64_en,
                           regreduce_en=False)
-        self.ldst_exc = LDSTException("dec2_exc")
+        self.ldst_exc = LDSTException("dec2_exc")  # rewrites as OP_TRAP
+        self.instr_fault = Signal()  # rewrites instruction as OP_FETCH_FAILED
  
          if self.svp64_en:
              self.cr_out_isvec = Signal(1, name="cr_out_isvec")
@@ -1116,9 +1133,9 @@ class PowerDecode2(PowerDecodeSubset):
              self.in3_step = Signal(7, name="reg_c_step")
              self.o_step = Signal(7, name="reg_o_step")
              self.o2_step = Signal(7, name="reg_o2_step")
-            self.remap_active = Signal(1, name="remap_active")
-            self.no_in_vec = Signal(1, name="no_in_vec") # no inputs vector
-            self.no_out_vec = Signal(1, name="no_out_vec") # no outputs vector
+            self.remap_active = Signal(5, name="remap_active")  # per reg
+            self.no_in_vec = Signal(1, name="no_in_vec")  # no inputs vector
+            self.no_out_vec = Signal(1, name="no_out_vec")  # no outputs vector
              self.loop_continue = Signal(1, name="loop_continue")
          else:
              self.no_in_vec = Const(1, 1)
@@ -1185,8 +1202,9 @@ class PowerDecode2(PowerDecodeSubset):
              m.submodules.o_svdec = o_svdec = SVP64RegExtra()
              m.submodules.o2_svdec = o2_svdec = SVP64RegExtra()
  
-            # debug access to crout_svdec (used in get_pdecode_cr_out)
+            # debug access to cr svdec (used in get_pdecode_cr_in/out)
              self.crout_svdec = crout_svdec
+            self.crin_svdec = crin_svdec
  
          # get the 5-bit reg data before svp64-munging it into 7-bit plus isvec
          reg = Signal(5, reset_less=True)
@@ -1225,7 +1243,8 @@ class PowerDecode2(PowerDecodeSubset):
  
              #######
              # CR out
-            comb += crout_svdec.idx.eq(self.op_get("sv_cr_out")) # SVP64 CR out
+            # SVP64 CR out
+            comb += crout_svdec.idx.eq(self.op_get("sv_cr_out"))
              comb += self.cr_out_isvec.eq(crout_svdec.isvec)
  
              #######
@@ -1248,31 +1267,46 @@ class PowerDecode2(PowerDecodeSubset):
              # indices are slightly different, BA/BB mess sorted above
              comb += crin_svdec.idx.eq(cr_a_idx)       # SVP64 CR in A
              comb += crin_svdec_b.idx.eq(cr_b_idx)     # SVP64 CR in B
-            comb += crin_svdec_o.idx.eq(self.op_get("sv_cr_out")) # SVP64 CR out
+            # SVP64 CR out
+            comb += crin_svdec_o.idx.eq(self.op_get("sv_cr_out"))
  
              # get SVSTATE srcstep (TODO: elwidth etc.) needed below
              vl = Signal.like(self.state.svstate.vl)
+            maxvl = Signal.like(self.state.svstate.maxvl)
+            subvl = Signal.like(self.rm_dec.rm_in.subvl)
              srcstep = Signal.like(self.state.svstate.srcstep)
              dststep = Signal.like(self.state.svstate.dststep)
+            ssubstep = Signal.like(self.state.svstate.ssubstep)
+            dsubstep = Signal.like(self.state.svstate.ssubstep)
              comb += vl.eq(self.state.svstate.vl)
+            comb += maxvl.eq(self.state.svstate.maxvl)
+            comb += subvl.eq(self.rm_dec.rm_in.subvl)
              comb += srcstep.eq(self.state.svstate.srcstep)
              comb += dststep.eq(self.state.svstate.dststep)
+            comb += ssubstep.eq(self.state.svstate.ssubstep)
+            comb += dsubstep.eq(self.state.svstate.dsubstep)
  
              in1_step, in2_step = self.in1_step, self.in2_step
              in3_step = self.in3_step
              o_step, o2_step = self.o_step, self.o2_step
  
+            # multiply vl by (subvl+1) - note that the result is only 7 bits!
+            # when elwidth overrides get involved this will have to go up
+            vmax = Signal(7)
+            comb += vmax.eq(vl*(subvl+1))
+
              # registers a, b, c and out and out2 (LD/ST EA)
              sv_etype = self.op_get("SV_Etype")
-            for rname, to_reg, fromreg, svdec, remapstep, out in (
+            for i, stuff in enumerate((
                  ("RA", e.read_reg1, dec_a.reg_out, in1_svdec, in1_step, False),
                  ("RB", e.read_reg2, dec_b.reg_out, in2_svdec, in2_step, False),
                  ("RC", e.read_reg3, dec_c.reg_out, in3_svdec, in3_step, False),
                  ("RT", e.write_reg, dec_o.reg_out, o_svdec, o_step, True),
-                ("EA", e.write_ea, dec_o2.reg_out, o2_svdec, o2_step, True)):
+                ("EA", e.write_ea, dec_o2.reg_out, o2_svdec, o2_step, True))):
+                rname, to_reg, fromreg, svdec, remapstep, out = stuff
                  comb += svdec.extra.eq(extra)     # EXTRA field of SVP64 RM
                  comb += svdec.etype.eq(sv_etype)  # EXTRA2/3 for this insn
-                comb += svdec.reg_in.eq(fromreg.data) # 3-bit (CR0/BC/BFA)
+                comb += svdec.reg_in.eq(fromreg.data)  # 3-bit (CR0/BC/BFA)
                  comb += to_reg.ok.eq(fromreg.ok)
                  # *screaam* FFT mode needs an extra offset for RB
                  # similar to FRS/FRT (below).  all of this needs cleanup
@@ -1284,32 +1318,34 @@ class PowerDecode2(PowerDecodeSubset):
                      # however when REMAP is active, the FFT REMAP
                      # schedule takes care of this offset.
                      with m.If(dec_o2.reg_out.ok & dec_o2.fp_madd_en):
-                        with m.If(~self.remap_active):
+                        with m.If(~self.remap_active[i]):
                              with m.If(svdec.isvec):
-                                comb += offs.eq(vl) # VL for Vectors
+                                comb += offs.eq(maxvl)  # MAXVL for Vectors
                  # detect if Vectorised: add srcstep/dststep if yes.
                  # to_reg is 7-bits, outs get dststep added, ins get srcstep
                  with m.If(svdec.isvec):
                      selectstep = dststep if out else srcstep
+                    subselect = dsubstep if out else ssubstep
                      step = Signal(7, name="step_%s" % rname.lower())
-                    with m.If(self.remap_active):
-                        comb += step.eq(remapstep)
+                    with m.If(self.remap_active[i]):
+                        comb += step.eq((remapstep*(subvl+1))+subselect)
                      with m.Else():
-                        comb += step.eq(selectstep)
+                        comb += step.eq((selectstep*(subvl+1))+subselect)
                      # reverse gear goes the opposite way
                      with m.If(self.rm_dec.reverse_gear):
-                        comb += to_reg.data.eq(offs+svdec.reg_out+(vl-1-step))
+                        comb += to_reg.data.eq(offs+svdec.reg_out+(vmax-1-step))
                      with m.Else():
                          comb += to_reg.data.eq(offs+step+svdec.reg_out)
                  with m.Else():
                      comb += to_reg.data.eq(offs+svdec.reg_out)
  
              # SVP64 in/out fields
-            comb += in1_svdec.idx.eq(self.op_get("sv_in1")) # reg #1 (in1_sel)
+            comb += in1_svdec.idx.eq(self.op_get("sv_in1"))  # reg #1 (in1_sel)
              comb += in2_svdec.idx.eq(self.op_get("sv_in2"))  # reg #2 (in2_sel)
              comb += in3_svdec.idx.eq(self.op_get("sv_in3"))  # reg #3 (in3_sel)
              comb += o_svdec.idx.eq(self.op_get("sv_out"))    # output (out_sel)
-            comb += o2_svdec.idx.eq(self.op_get("sv_out2"))  # output (implicit)
+            # output (implicit)
+            comb += o2_svdec.idx.eq(self.op_get("sv_out2"))
              # XXX TODO - work out where this should come from.  the problem is
              # that LD-with-update is implied (computed from "is instruction in
              # "update mode" rather than specified cleanly as its own CSV column
@@ -1327,15 +1363,15 @@ class PowerDecode2(PowerDecodeSubset):
              # same trick is applied to FRA, above, but it's a lot cleaner, there
              with m.If(dec_o2.reg_out.ok & dec_o2.fp_madd_en):
                  comb += offs.eq(0)
-                with m.If(~self.remap_active):
+                with m.If(~self.remap_active[4]):
                      with m.If(o2_svdec.isvec):
-                        comb += offs.eq(vl) # VL for Vectors
+                        comb += offs.eq(vl)  # VL for Vectors
                      with m.Else():
                          comb += offs.eq(1)  # add 1 if scalar
-                svdec = o_svdec # yes take source as o_svdec...
+                svdec = o_svdec  # yes take source as o_svdec...
                  with m.If(svdec.isvec):
                      step = Signal(7, name="step_%s" % rname.lower())
-                    with m.If(self.remap_active):
+                    with m.If(self.remap_active[4]):
                          comb += step.eq(o2_step)
                      with m.Else():
                          comb += step.eq(dststep)
@@ -1353,14 +1389,17 @@ class PowerDecode2(PowerDecodeSubset):
  
              # TODO add SPRs here.  must be True when *all* are scalar
              l = map(lambda svdec: svdec.isvec, [in1_svdec, in2_svdec, in3_svdec,
-                                        crin_svdec, crin_svdec_b, crin_svdec_o])
-            comb += self.no_in_vec.eq(~Cat(*l).bool()) # all input scalar
-            l = map(lambda svdec: svdec.isvec, [o2_svdec, o_svdec, crout_svdec])
+                                                crin_svdec, crin_svdec_b,
+                                                crin_svdec_o])
+            comb += self.no_in_vec.eq(~Cat(*l).bool())  # all input scalar
+            l = map(lambda svdec: svdec.isvec, [
+                    o2_svdec, o_svdec, crout_svdec])
              # in mapreduce mode, scalar out is *allowed*
              with m.If(self.rm_dec.mode == SVP64RMMode.MAPREDUCE.value):
                  comb += self.no_out_vec.eq(0)
              with m.Else():
-                comb += self.no_out_vec.eq(~Cat(*l).bool()) # all output scalar
+                # all output scalar
+                comb += self.no_out_vec.eq(~Cat(*l).bool())
              # now create a general-purpose "test" as to whether looping
              # should continue.  this doesn't include predication bit-tests
              loop = self.loop_continue
@@ -1381,26 +1420,26 @@ class PowerDecode2(PowerDecodeSubset):
                  (e.read_cr1, self.dec_cr_in, "cr_bitfield", crin_svdec, 0),
                  (e.read_cr2, self.dec_cr_in, "cr_bitfield_b", crin_svdec_b, 0),
                  (e.read_cr3, self.dec_cr_in, "cr_bitfield_o", crin_svdec_o, 0),
-                (e.write_cr, self.dec_cr_out, "cr_bitfield", crout_svdec, 1)):
+                    (e.write_cr, self.dec_cr_out, "cr_bitfield", crout_svdec, 1)):
                  fromreg = getattr(cr, name)
                  comb += svdec.extra.eq(extra)     # EXTRA field of SVP64 RM
                  comb += svdec.etype.eq(sv_etype)  # EXTRA2/3 for this insn
-                comb += svdec.cr_in.eq(fromreg.data) # 3-bit (CR0/BC/BFA)
+                comb += svdec.cr_in.eq(fromreg.data)  # 3-bit (CR0/BC/BFA)
                  with m.If(svdec.isvec):
                      # check if this is CR0 or CR1: treated differently
                      # (does not "listen" to EXTRA2/3 spec for a start)
                      # also: the CRs start from completely different locations
                      step = dststep if out else srcstep
-                    with m.If(cr.sv_override == 1): # CR0
+                    with m.If(cr.sv_override == 1):  # CR0
                          offs = SVP64CROffs.CR0
                          comb += to_reg.data.eq(step+offs)
-                    with m.Elif(cr.sv_override == 2): # CR1
+                    with m.Elif(cr.sv_override == 2):  # CR1
                          offs = SVP64CROffs.CR1
                          comb += to_reg.data.eq(step+1)
                      with m.Else():
-                        comb += to_reg.data.eq(step+svdec.cr_out) # 7-bit out
+                        comb += to_reg.data.eq(step+svdec.cr_out)  # 7-bit out
                  with m.Else():
-                    comb += to_reg.data.eq(svdec.cr_out) # 7-bit output
+                    comb += to_reg.data.eq(svdec.cr_out)  # 7-bit output
                  comb += to_reg.ok.eq(fromreg.ok)
  
              # sigh must determine if RA is nonzero (7 bit)
@@ -1417,16 +1456,16 @@ class PowerDecode2(PowerDecodeSubset):
  
              # connect up to/from read/write CRs
              for to_reg, cr, name in (
-                        (e.read_cr1, self.dec_cr_in, "cr_bitfield", ),
-                        (e.read_cr2, self.dec_cr_in, "cr_bitfield_b", ),
-                        (e.read_cr3, self.dec_cr_in, "cr_bitfield_o", ),
-                        (e.write_cr, self.dec_cr_out, "cr_bitfield", )):
+                (e.read_cr1, self.dec_cr_in, "cr_bitfield", ),
+                (e.read_cr2, self.dec_cr_in, "cr_bitfield_b", ),
+                (e.read_cr3, self.dec_cr_in, "cr_bitfield_o", ),
+                    (e.write_cr, self.dec_cr_out, "cr_bitfield", )):
                  fromreg = getattr(cr, name)
                  comb += to_reg.data.eq(fromreg.data)
                  comb += to_reg.ok.eq(fromreg.ok)
  
          if self.svp64_en:
-            comb += self.rm_dec.ldst_ra_vec.eq(self.in1_isvec) # RA is vector
+            comb += self.rm_dec.ldst_ra_vec.eq(self.in1_isvec)  # RA is vector
  
          # SPRs out
          comb += e.read_spr1.eq(dec_a.spr_out)
@@ -1437,16 +1476,19 @@ class PowerDecode2(PowerDecodeSubset):
          comb += e.read_fast2.eq(dec_b.fast_out)
          comb += e.write_fast1.eq(dec_o.fast_out)   # SRR0 (OP_RFID)
          comb += e.write_fast2.eq(dec_o2.fast_out)  # SRR1 (ditto)
-        comb += e.write_fast3.eq(dec_o2.fast_out3) # SVSRR0 (ditto)
+        comb += e.write_fast3.eq(dec_o2.fast_out3)  # SVSRR0 (ditto)
+        # and State regs (DEC, TB)
+        comb += e.read_state1.eq(dec_a.state_out)    # DEC/TB
+        comb += e.write_state1.eq(dec_o.state_out)   # DEC/TB
  
          # sigh this is exactly the sort of thing for which the
          # decoder is designed to not need.  MTSPR, MFSPR and others need
          # access to the XER bits.  however setting e.oe is not appropriate
          internal_op = self.op_get("internal_op")
          with m.If(internal_op == MicrOp.OP_MFSPR):
-            comb += e.xer_in.eq(0b111) # SO, CA, OV
+            comb += e.xer_in.eq(0b111)  # SO, CA, OV
          with m.If(internal_op == MicrOp.OP_CMP):
-            comb += e.xer_in.eq(1<<XERRegsEnum.SO) # SO
+            comb += e.xer_in.eq(1 << XERRegsEnum.SO)  # SO
          with m.If(internal_op == MicrOp.OP_MTSPR):
              comb += e.xer_out.eq(1)
  
@@ -1454,7 +1496,7 @@ class PowerDecode2(PowerDecodeSubset):
          with m.If(op.internal_op == MicrOp.OP_TRAP):
              # *DO NOT* call self.trap here.  that would reset absolutely
              # everything including destroying read of RA and RB.
-            comb += self.do_copy("trapaddr", 0x70) # strip first nibble
+            comb += self.do_copy("trapaddr", 0x70)  # strip first nibble
  
          ####################
          # ok so the instruction's been decoded, blah blah, however
@@ -1474,27 +1516,38 @@ class PowerDecode2(PowerDecodeSubset):
          illeg_ok = Signal()
          ldst_exc = self.ldst_exc
  
-        comb += ext_irq_ok.eq(ext_irq & msr[MSR.EE]) # v3.0B p944 (MSR.EE)
-        comb += dec_irq_ok.eq(dec_spr[63] & msr[MSR.EE]) # 6.5.11 p1076
+        comb += ext_irq_ok.eq(ext_irq & msr[MSR.EE])  # v3.0B p944 (MSR.EE)
+        comb += dec_irq_ok.eq(dec_spr[63] & msr[MSR.EE])  # 6.5.11 p1076
          comb += priv_ok.eq(is_priv_insn & msr[MSR.PR])
          comb += illeg_ok.eq(op.internal_op == MicrOp.OP_ILLEGAL)
  
+        # absolute top priority: check for a failed instruction fetch
+        with m.If(self.instr_fault):
+            comb += self.e.eq(0)  # reset eeeeeverything
+            comb += self.do_copy("insn", self.dec.opcode_in, True)
+            comb += self.do_copy("insn_type", MicrOp.OP_FETCH_FAILED, True)
+            comb += self.do_copy("fn_unit", Function.MMU, True)
+            comb += self.do_copy("cia", self.state.pc, True)  # PC
+            comb += self.do_copy("msr", self.state.msr, True)  # MSR
+            # special override on internal_op, due to being a "fake" op
+            comb += self.dec.op.internal_op.eq(MicrOp.OP_FETCH_FAILED)
+
          # LD/ST exceptions.  TestIssuer copies the exception info at us
          # after a failed LD/ST.
-        with m.If(ldst_exc.happened):
+        with m.Elif(ldst_exc.happened):
              with m.If(ldst_exc.alignment):
-                self.trap(m, TT.PRIV, 0x600)
+                self.trap(m, TT.MEMEXC, 0x600)
              with m.Elif(ldst_exc.instr_fault):
                  with m.If(ldst_exc.segment_fault):
-                    self.trap(m, TT.PRIV, 0x480)
+                    self.trap(m, TT.MEMEXC, 0x480)
                  with m.Else():
                      # pass exception info to trap to create SRR1
                      self.trap(m, TT.MEMEXC, 0x400, ldst_exc)
              with m.Else():
                  with m.If(ldst_exc.segment_fault):
-                    self.trap(m, TT.PRIV, 0x380)
+                    self.trap(m, TT.MEMEXC, 0x380)
                  with m.Else():
-                    self.trap(m, TT.PRIV, 0x300)
+                    self.trap(m, TT.MEMEXC, 0x300)
  
          # decrement counter (v3.0B p1099): TODO 32-bit version (MSR.LPCR)
          with m.Elif(dec_irq_ok):
@@ -1568,21 +1621,24 @@ class PowerDecode2(PowerDecodeSubset):
          comb += self.do_copy("insn", self.dec.opcode_in, True)
          comb += self.do_copy("insn_type", MicrOp.OP_TRAP, True)
          comb += self.do_copy("fn_unit", Function.TRAP, True)
-        comb += self.do_copy("trapaddr", trapaddr >> 4, True) # bottom 4 bits
+        comb += self.do_copy("trapaddr", trapaddr >> 4, True)  # bottom 4 bits
          comb += self.do_copy("traptype", traptype, True)  # request type
          comb += self.do_copy("ldst_exc", ldst_exc, True)  # request type
-        comb += self.do_copy("msr", self.state.msr, True) # copy of MSR "state"
+        comb += self.do_copy("msr", self.state.msr,
+                             True)  # copy of MSR "state"
          comb += self.do_copy("cia", self.state.pc, True)  # copy of PC "state"
          comb += self.do_copy("svstate", self.state.svstate, True)  # SVSTATE
  
  
-
-def get_rdflags(e, cu):
+def get_rdflags(m, e, cu):
+    """returns a sequential list of the read "ok" flags for a given FU.
+    this list is in order of the CompUnit input specs
+    """
      rdl = []
      for idx in range(cu.n_src):
          regfile, regname, _ = cu.get_in_spec(idx)
-        rdflag, read = regspec_decode_read(e, regfile, regname)
-        rdl.append(rdflag)
+        decinfo = regspec_decode_read(m, e, regfile, regname)
+        rdl.append(decinfo.okflag)
      log("rdflags", rdl)
      return Cat(*rdl)