move ffadds to not conflict with fptrans -- makes space for min/max/fmod/remainder ops

[openpower-isa.git] / src / openpower / decoder / power_decoder2.py
diff --git a/src/openpower/decoder/power_decoder2.py b/src/openpower/decoder/power_decoder2.py

index cb054735fbcb33dd2403c0bb6c1dbb288e95d30f..8cd7db702ba5be574173e704162f643317fefa05 100644 (file)
--- a/src/openpower/decoder/power_decoder2.py
+++ b/src/openpower/decoder/power_decoder2.py
@@ -31,7 +31,7 @@ from openpower.decoder.power_enums import (MicrOp, CryIn, Function,
                                             CRInSel, CROutSel,
                                             LdstLen, In1Sel, In2Sel, In3Sel,
                                             OutSel, SPRfull, SPRreduced,
-                                           RC, SVP64LDSTmode, LDSTMode,
+                                           RCOE, SVP64LDSTmode, LDSTMode,
                                             SVEXTRA, SVEtype, SVPtype)
  from openpower.decoder.decode2execute1 import (Decode2ToExecute1Type, Data,
                                                 Decode2ToOperand)
@@ -41,7 +41,7 @@ from openpower.consts import (MSR, SPEC, EXTRA2, EXTRA3, SVP64P, field,
                                FastRegsEnum, XERRegsEnum, TT)
  
  from openpower.state import CoreState
-from openpower.util import (spr_to_fast, log)
+from openpower.util import (spr_to_fast, spr_to_state, log)
  
  
  def decode_spr_num(spr):
@@ -78,7 +78,8 @@ class SPRMap(Elaboratable):
  
          self.spr_i = Signal(10, reset_less=True)
          self.spr_o = Data(SPR, name="spr_o")
-        self.fast_o = Data(3, name="fast_o")
+        self.fast_o = Data(4, name="fast_o")
+        self.state_o = Data(3, name="state_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -95,6 +96,10 @@ class SPRMap(Elaboratable):
                  with m.Case(x.value):
                      m.d.comb += self.fast_o.data.eq(v)
                      m.d.comb += self.fast_o.ok.eq(1)
+            for x, v in spr_to_state.items():
+                with m.Case(x.value):
+                    m.d.comb += self.state_o.data.eq(v)
+                    m.d.comb += self.state_o.ok.eq(1)
          return m
  
  
@@ -116,7 +121,8 @@ class DecodeA(Elaboratable):
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(5, name="reg_a")
          self.spr_out = Data(SPR, "spr_a")
-        self.fast_out = Data(3, "fast_a")
+        self.fast_out = Data(4, "fast_a")
+        self.state_out = Data(3, "state_a")
          self.sv_nz = Signal(1)
  
      def elaborate(self, platform):
@@ -181,6 +187,7 @@ class DecodeA(Elaboratable):
                  comb += sprmap.spr_i.eq(spr)
                  comb += self.spr_out.eq(sprmap.spr_o)
                  comb += self.fast_out.eq(sprmap.fast_o)
+                comb += self.state_out.eq(sprmap.state_o)
  
          return m
  
@@ -229,7 +236,7 @@ class DecodeB(Elaboratable):
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(7, "reg_b")
          self.reg_isvec = Signal(1, name="reg_b_isvec")  # TODO: in reg_out
-        self.fast_out = Data(3, "fast_b")
+        self.fast_out = Data(4, "fast_b")
  
      def elaborate(self, platform):
          m = Module()
@@ -378,7 +385,7 @@ class DecodeC(Elaboratable):
  class DecodeOut(Elaboratable):
      """DecodeOut from instruction
  
-    decodes output register RA, RT or SPR
+    decodes output register RA, RT, FRS, FRT, or SPR
      """
  
      def __init__(self, dec, op, regreduce_en):
@@ -393,7 +400,8 @@ class DecodeOut(Elaboratable):
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(5, "reg_o")
          self.spr_out = Data(SPR, "spr_o")
-        self.fast_out = Data(3, "fast_o")
+        self.fast_out = Data(4, "fast_o")
+        self.state_out = Data(3, "state_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -404,6 +412,9 @@ class DecodeOut(Elaboratable):
  
          # select Register out field
          with m.Switch(self.sel_in):
+            with m.Case(OutSel.FRS):
+                comb += reg.data.eq(self.dec.FRS)
+                comb += reg.ok.eq(1)
              with m.Case(OutSel.FRT):
                  comb += reg.data.eq(self.dec.FRT)
                  comb += reg.ok.eq(1)
@@ -421,6 +432,7 @@ class DecodeOut(Elaboratable):
                      comb += sprmap.spr_i.eq(spr)
                      comb += self.spr_out.eq(sprmap.spr_o)
                      comb += self.fast_out.eq(sprmap.fast_o)
+                    comb += self.state_out.eq(sprmap.state_o)
  
          # determine Fast Reg
          with m.Switch(op.internal_op):
@@ -463,8 +475,8 @@ class DecodeOut2(Elaboratable):
          self.insn_in = Signal(32, reset_less=True)
          self.reg_out = Data(5, "reg_o2")
          self.fp_madd_en = Signal(reset_less=True)  # FFT instruction detected
-        self.fast_out = Data(3, "fast_o2")
-        self.fast_out3 = Data(3, "fast_o3")
+        self.fast_out = Data(4, "fast_o2")
+        self.fast_out3 = Data(4, "fast_o3")
  
      def elaborate(self, platform):
          m = Module()
@@ -517,7 +529,7 @@ class DecodeRC(Elaboratable):
  
      def __init__(self, dec):
          self.dec = dec
-        self.sel_in = Signal(RC, reset_less=True)
+        self.sel_in = Signal(RCOE, reset_less=True)
          self.insn_in = Signal(32, reset_less=True)
          self.rc_out = Data(1, "rc")
  
@@ -527,13 +539,13 @@ class DecodeRC(Elaboratable):
  
          # select Record bit out field
          with m.Switch(self.sel_in):
-            with m.Case(RC.RC):
+            with m.Case(RCOE.RC, RCOE.RC_ONLY):
                  comb += self.rc_out.data.eq(self.dec.Rc)
                  comb += self.rc_out.ok.eq(1)
-            with m.Case(RC.ONE):
+            with m.Case(RCOE.ONE):
                  comb += self.rc_out.data.eq(1)
                  comb += self.rc_out.ok.eq(1)
-            with m.Case(RC.NONE):
+            with m.Case(RCOE.NONE):
                  comb += self.rc_out.data.eq(0)
                  comb += self.rc_out.ok.eq(1)
  
@@ -543,48 +555,30 @@ class DecodeRC(Elaboratable):
  class DecodeOE(Elaboratable):
      """DecodeOE from instruction
  
-    decodes OE field: uses RC decode detection which might not be good
-
-    -- For now, use "rc" in the decode table to decide whether oe exists.
-    -- This is not entirely correct architecturally: For mulhd and
-    -- mulhdu, the OE field is reserved. It remains to be seen what an
-    -- actual POWER9 does if we set it on those instructions, for now we
-    -- test that further down when assigning to the multiplier oe input.
+    decodes OE field: keyed on the RCOE selector, which now separates
+    RC from RC_ONLY.  only RCOE.RC takes OE from the instruction: every
+    other selector (RC_ONLY included) outputs OE=0 with ok=0.
      """
  
      def __init__(self, dec, op):
          self.dec = dec
          self.op = op
-        self.sel_in = Signal(RC, reset_less=True)
+        self.sel_in = Signal(RCOE, reset_less=True)
          self.insn_in = Signal(32, reset_less=True)
          self.oe_out = Data(1, "oe")
  
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
-        op = self.op
  
-        with m.Switch(op.internal_op):
-
-            # mulhw, mulhwu, mulhd, mulhdu - these *ignore* OE
-            # also rotate
-            # XXX ARGH! ignoring OE causes incompatibility with microwatt
-            # http://lists.libre-soc.org/pipermail/libre-soc-dev/2020-August/000302.html
-            with m.Case(MicrOp.OP_MUL_H64, MicrOp.OP_MUL_H32,
-                        MicrOp.OP_EXTS, MicrOp.OP_CNTZ,
-                        MicrOp.OP_SHL, MicrOp.OP_SHR, MicrOp.OP_RLC,
-                        MicrOp.OP_LOAD, MicrOp.OP_STORE,
-                        MicrOp.OP_RLCL, MicrOp.OP_RLCR,
-                        MicrOp.OP_EXTSWSLI, MicrOp.OP_GREV):
-                pass
-
-            # all other ops decode OE field
+        with m.Switch(self.sel_in):
+            with m.Case(RCOE.RC):
+                comb += self.oe_out.data.eq(self.dec.OE)
+                comb += self.oe_out.ok.eq(1)
              with m.Default():
-                # select OE bit out field
-                with m.Switch(self.sel_in):
-                    with m.Case(RC.RC):
-                        comb += self.oe_out.data.eq(self.dec.OE)
-                        comb += self.oe_out.ok.eq(1)
+                # default: clear OE.
+                comb += self.oe_out.data.eq(0)
+                comb += self.oe_out.ok.eq(0)
  
          return m
  
@@ -776,7 +770,6 @@ class PowerDecodeSubset(Elaboratable):
          self.regreduce_en = regreduce_en
          if svp64_en:
              self.is_svp64_mode = Signal()  # mark decoding as SVP64 Mode
-            self.use_svp64_ldst_dec = Signal()  # must use LDST decoder
              self.use_svp64_fft = Signal()      # FFT Mode
              self.sv_rm = SVP64Rec(name="dec_svp64")  # SVP64 RM field
              self.rm_dec = SVP64RMModeDecode("svp64_rm_dec")
@@ -801,8 +794,8 @@ class PowerDecodeSubset(Elaboratable):
          # alternatives.  useful for PCR (Program Compatibility Register)
          # amongst other things
          if svp64_en:
-            conditions = {'SVP64BREV': self.use_svp64_ldst_dec,
-                          'SVP64FFT': self.use_svp64_fft,
+            conditions = {
+                          # XXX NO 'SVP64FFT': self.use_svp64_fft,
                            }
          else:
              conditions = None
@@ -851,6 +844,11 @@ class PowerDecodeSubset(Elaboratable):
                  # to support multiple tasks (unit column multiple entries)
                  # see https://bugs.libre-soc.org/show_bug.cgi?id=310
                  (self.fn_name == 'MMU' and row['unit'] == 'SPR' and
+                 row['internal op'] in ['OP_MTSPR', 'OP_MFSPR']) or
+                # urrr... and the KAIVB SPR, which must also be redirected
+                # (to the TRAP pipeline)
+                # see https://bugs.libre-soc.org/show_bug.cgi?id=859
+                (self.fn_name == 'TRAP' and row['unit'] == 'SPR' and
                   row['internal op'] in ['OP_MTSPR', 'OP_MFSPR'])
                  )
  
@@ -859,7 +857,6 @@ class PowerDecodeSubset(Elaboratable):
          if self.svp64_en:
              ports += self.sv_rm.ports()
              ports.append(self.is_svp64_mode)
-            ports.append(self.use_svp64_ldst_dec)
              ports.append(self.use_svp64_fft)
          return ports
  
@@ -944,20 +941,30 @@ class PowerDecodeSubset(Elaboratable):
  
          # Microwatt doesn't implement the partition table
          # instead has PRTBL register (SPR) to point to process table
+        # Kestrel has a KAIVB SPR to "rebase" exceptions. rebasing is normally
+        # done with Hypervisor Mode which is not implemented (yet)
          is_spr_mv = Signal()
          is_mmu_spr = Signal()
+        is_trap_spr = Signal()
          comb += is_spr_mv.eq((internal_op == MicrOp.OP_MTSPR) |
                               (internal_op == MicrOp.OP_MFSPR))
          comb += is_mmu_spr.eq((spr == SPR.DSISR.value) |
                                (spr == SPR.DAR.value) |
                                (spr == SPR.PRTBL.value) |
                                (spr == SPR.PIDR.value))
+        comb += is_trap_spr.eq((spr == SPR.KAIVB.value)
+                              )
          # MMU must receive MMU SPRs
          with m.If(is_spr_mv & (fn == Function.SPR) & is_mmu_spr):
              comb += self.do_copy("fn_unit", Function.MMU)
              comb += self.do_copy("insn_type", internal_op)
-        # SPR pipe must *not* receive MMU SPRs
-        with m.Elif(is_spr_mv & (fn == Function.MMU) & ~is_mmu_spr):
+        # TRAP must receive TRAP SPR KAIVB (same If/Elif chain as MMU above)
+        with m.Elif(is_spr_mv & (fn == Function.SPR) & is_trap_spr):
+            comb += self.do_copy("fn_unit", Function.TRAP)
+            comb += self.do_copy("insn_type", internal_op)
+        # SPR pipe must *not* receive MMU or TRAP SPRs
+        with m.Elif(is_spr_mv & (((fn == Function.MMU) & ~is_mmu_spr) |
+                                 ((fn == Function.TRAP) & ~is_trap_spr))):
              comb += self.do_copy("fn_unit", Function.NONE)
              comb += self.do_copy("insn_type", MicrOp.OP_ILLEGAL)
          # all others ok
@@ -1004,27 +1011,7 @@ class PowerDecodeSubset(Elaboratable):
              # the alternative decoder, svdecldst. what a mess... *sigh*
              sv_ptype = self.op_get("SV_Ptype")
              fn = self.op_get("function_unit")
-            # detect major opcode for LDs: include 58 here. from CSV files.
-            # BLECH! TODO: these should be done using "mini decoders",
-            # using row and column subsets
-            is_major_ld = Signal()
-            # bits... errr... MSB0 0..5 which is 26:32 python
-            major = Signal(6)
-            comb += major.eq(self.dec.opcode_in[26:32])
-            comb += is_major_ld.eq((major == 34) | (major == 35) |
-                                   (major == 50) | (major == 51) |
-                                   (major == 48) | (major == 49) |
-                                   (major == 42) | (major == 43) |
-                                   (major == 40) | (major == 41) |
-                                   (major == 32) | (major == 33) |
-                                   (major == 58))
-            with m.If(self.is_svp64_mode & is_major_ld):
-                # straight-up: "it's a LD".  this gives enough info
-                # for SVP64 RM Mode decoding to detect LD/ST, and
-                # consequently detect the SHIFT mode. sigh
-                comb += rm_dec.fn_in.eq(Function.LDST)
-            with m.Else():
-                comb += rm_dec.fn_in.eq(fn)  # decode needs to know Fn type
+            comb += rm_dec.fn_in.eq(fn)  # decode needs to know Fn type
              comb += rm_dec.ptype_in.eq(sv_ptype)  # Single/Twin predicated
              comb += rm_dec.rc_in.eq(rc_out)  # Rc=1
              comb += rm_dec.rm_in.eq(self.sv_rm)  # SVP64 RM mode
@@ -1033,20 +1020,26 @@ class PowerDecodeSubset(Elaboratable):
                  comb += rm_dec.ldst_imz_in.eq(bzero)  # B immediate is zero
  
              # main PowerDecoder2 determines if different SVP64 modes enabled
-            if not self.final:
-                # if shift mode requested
-                shiftmode = rm_dec.ldstmode == SVP64LDSTmode.SHIFT
-                comb += self.use_svp64_ldst_dec.eq(shiftmode)
              # detect if SVP64 FFT mode enabled (really bad hack),
              # exclude fcfids and others
              # XXX this is a REALLY bad hack, REALLY has to be done better.
              # likely with a sub-decoder.
-            xo5 = Signal(1)  # 1 bit from Minor 59 XO field == 0b0XXXX
-            comb += xo5.eq(self.dec.opcode_in[5])
-            xo = Signal(5)  # 5 bits from Minor 59 fcfids == 0b01110
-            comb += xo.eq(self.dec.opcode_in[1:6])
-            comb += self.use_svp64_fft.eq((major == 59) & (xo5 == 0b0) &
-                                          (xo != 0b01110))
+            # what this ultimately does is enable the 2nd implicit register
+            # (FRS) for SVP64-decoding.  all of these instructions are
+            # 3-in 2-out but there is not enough room either in the
+            # opcode *or* EXTRA2/3 to specify a 5th operand.
+            major = Signal(6)
+            comb += major.eq(self.dec.opcode_in[26:32])
+            xo = Signal(10)
+            comb += xo.eq(self.dec.opcode_in[1:11])
+            comb += self.use_svp64_fft.eq((major == 59) & xo.matches(
+                '-----00100',  # ffmsubs
+                '-----00101',  # ffmadds
+                '-----00110',  # ffnmsubs
+                '-----00111',  # ffnmadds
+                '1111100000',  # ffadds
+                '-----11011',  # fdmadds
+            ))
  
          # decoded/selected instruction flags
          comb += self.do_copy("data_len", self.op_get("ldst_len"))
@@ -1279,16 +1272,29 @@ class PowerDecode2(PowerDecodeSubset):
  
              # get SVSTATE srcstep (TODO: elwidth etc.) needed below
              vl = Signal.like(self.state.svstate.vl)
+            maxvl = Signal.like(self.state.svstate.maxvl)
+            subvl = Signal.like(self.rm_dec.rm_in.subvl)
              srcstep = Signal.like(self.state.svstate.srcstep)
              dststep = Signal.like(self.state.svstate.dststep)
+            ssubstep = Signal.like(self.state.svstate.ssubstep)
+            dsubstep = Signal.like(self.state.svstate.dsubstep)
              comb += vl.eq(self.state.svstate.vl)
+            comb += maxvl.eq(self.state.svstate.maxvl)
+            comb += subvl.eq(self.rm_dec.rm_in.subvl)
              comb += srcstep.eq(self.state.svstate.srcstep)
              comb += dststep.eq(self.state.svstate.dststep)
+            comb += ssubstep.eq(self.state.svstate.ssubstep)
+            comb += dsubstep.eq(self.state.svstate.dsubstep)
  
              in1_step, in2_step = self.in1_step, self.in2_step
              in3_step = self.in3_step
              o_step, o2_step = self.o_step, self.o2_step
  
+            # multiply vl by (subvl+1) - note that this is only 7 bit!
+            # when elwidth overrides get involved this will have to go up
+            vmax = Signal(7)
+            comb += vmax.eq(vl*(subvl+1))
+
              # registers a, b, c and out and out2 (LD/ST EA)
              sv_etype = self.op_get("SV_Etype")
              for i, stuff in enumerate((
@@ -1296,7 +1302,7 @@ class PowerDecode2(PowerDecodeSubset):
                  ("RB", e.read_reg2, dec_b.reg_out, in2_svdec, in2_step, False),
                  ("RC", e.read_reg3, dec_c.reg_out, in3_svdec, in3_step, False),
                  ("RT", e.write_reg, dec_o.reg_out, o_svdec, o_step, True),
-                    ("EA", e.write_ea, dec_o2.reg_out, o2_svdec, o2_step, True))):
+                ("EA", e.write_ea, dec_o2.reg_out, o2_svdec, o2_step, True))):
                  rname, to_reg, fromreg, svdec, remapstep, out = stuff
                  comb += svdec.extra.eq(extra)     # EXTRA field of SVP64 RM
                  comb += svdec.etype.eq(sv_etype)  # EXTRA2/3 for this insn
@@ -1314,19 +1320,20 @@ class PowerDecode2(PowerDecodeSubset):
                      with m.If(dec_o2.reg_out.ok & dec_o2.fp_madd_en):
                          with m.If(~self.remap_active[i]):
                              with m.If(svdec.isvec):
-                                comb += offs.eq(vl)  # VL for Vectors
+                                comb += offs.eq(maxvl)  # MAXVL for Vectors
                  # detect if Vectorised: add srcstep/dststep if yes.
                  # to_reg is 7-bits, outs get dststep added, ins get srcstep
                  with m.If(svdec.isvec):
                      selectstep = dststep if out else srcstep
+                    subselect = dsubstep if out else ssubstep
                      step = Signal(7, name="step_%s" % rname.lower())
                      with m.If(self.remap_active[i]):
-                        comb += step.eq(remapstep)
+                        comb += step.eq((remapstep*(subvl+1))+subselect)
                      with m.Else():
-                        comb += step.eq(selectstep)
+                        comb += step.eq((selectstep*(subvl+1))+subselect)
                      # reverse gear goes the opposite way
                      with m.If(self.rm_dec.reverse_gear):
-                        comb += to_reg.data.eq(offs+svdec.reg_out+(vl-1-step))
+                        comb += to_reg.data.eq(offs+svdec.reg_out+(vmax-1-step))
                      with m.Else():
                          comb += to_reg.data.eq(offs+step+svdec.reg_out)
                  with m.Else():
@@ -1470,6 +1477,9 @@ class PowerDecode2(PowerDecodeSubset):
          comb += e.write_fast1.eq(dec_o.fast_out)   # SRR0 (OP_RFID)
          comb += e.write_fast2.eq(dec_o2.fast_out)  # SRR1 (ditto)
          comb += e.write_fast3.eq(dec_o2.fast_out3)  # SVSRR0 (ditto)
+        # and State regs (DEC, TB)
+        comb += e.read_state1.eq(dec_a.state_out)    # DEC/TB
+        comb += e.write_state1.eq(dec_o.state_out)   # DEC/TB
  
          # sigh this is exactly the sort of thing for which the
          # decoder is designed to not need.  MTSPR, MFSPR and others need
@@ -1526,18 +1536,18 @@ class PowerDecode2(PowerDecodeSubset):
          # after a failed LD/ST.
          with m.Elif(ldst_exc.happened):
              with m.If(ldst_exc.alignment):
-                self.trap(m, TT.PRIV, 0x600)
+                self.trap(m, TT.MEMEXC, 0x600)
              with m.Elif(ldst_exc.instr_fault):
                  with m.If(ldst_exc.segment_fault):
-                    self.trap(m, TT.PRIV, 0x480)
+                    self.trap(m, TT.MEMEXC, 0x480)
                  with m.Else():
                      # pass exception info to trap to create SRR1
                      self.trap(m, TT.MEMEXC, 0x400, ldst_exc)
              with m.Else():
                  with m.If(ldst_exc.segment_fault):
-                    self.trap(m, TT.PRIV, 0x380)
+                    self.trap(m, TT.MEMEXC, 0x380)
                  with m.Else():
-                    self.trap(m, TT.PRIV, 0x300)
+                    self.trap(m, TT.MEMEXC, 0x300)
  
          # decrement counter (v3.0B p1099): TODO 32-bit version (MSR.LPCR)
          with m.Elif(dec_irq_ok):
@@ -1570,12 +1580,6 @@ class PowerDecode2(PowerDecodeSubset):
          # Note: OP_SC could actually be modified to just be a trap
          with m.If((do_out.insn_type == MicrOp.OP_TRAP) |
                    (do_out.insn_type == MicrOp.OP_SC)):
-            # see fu/trap/main_stage.py trap() function: some bits of SRR1
-            # need to be preserved, rather than just blithely overwrite MSR.
-            # following microwatt, here.
-            # TRAP read fast2 = SRR1
-            comb += e_out.read_fast2.data.eq(FastRegsEnum.SRR1)  # SRR1
-            comb += e_out.read_fast2.ok.eq(1)
              # TRAP write fast1 = SRR0
              comb += e_out.write_fast1.data.eq(FastRegsEnum.SRR0)  # SRR0
              comb += e_out.write_fast1.ok.eq(1)