gas "mbind sections" test

[binutils-gdb.git] / gas / config / tc-arm.c
diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c

index 4e61eb5f1ae1de83966ae2dbf6796b4aa982d7cf..aa082a60c73a3b0f71781e95763f47950d4314e9 100644 (file)
--- a/gas/config/tc-arm.c
+++ b/gas/config/tc-arm.c
@@ -510,7 +510,10 @@ struct arm_it
      unsigned isreg     : 1;  /* Operand was a register.  */
      unsigned immisreg  : 2;  /* .imm field is a second register.
                                  0: imm, 1: gpr, 2: MVE Q-register.  */
-    unsigned isscalar   : 1;  /* Operand is a (Neon) scalar.  */
+    unsigned isscalar   : 2;  /* Operand is a (SIMD) scalar:
+                                0) not scalar,
+                                1) Neon scalar,
+                                2) MVE scalar.  */
      unsigned immisalign : 1;  /* Immediate is an alignment specifier.  */
      unsigned immisfloat : 1;  /* Immediate was parsed as a float.  */
      /* Note: we abuse "regisimm" to mean "is Neon register" in VMOV
@@ -519,6 +522,7 @@ struct arm_it
      unsigned isvec      : 1;  /* Is a single, double or quad VFP/Neon reg.  */
      unsigned isquad     : 1;  /* Operand is SIMD quad register.  */
      unsigned issingle   : 1;  /* Operand is VFP single-precision register.  */
+    unsigned iszr      : 1;  /* Operand is ZR register.  */
      unsigned hasreloc  : 1;  /* Operand has relocation suffix.  */
      unsigned writeback : 1;  /* Operand has trailing !  */
      unsigned preind    : 1;  /* Preindexed address.  */
@@ -643,6 +647,7 @@ enum arm_reg_type
    REG_TYPE_MMXWCG,
    REG_TYPE_XSCALE,
    REG_TYPE_RNB,
+  REG_TYPE_ZR
  };
  
  /* Structure for a hash table entry for a register.
@@ -895,6 +900,7 @@ struct asm_opcode
  #define BAD_MVE_SRCDEST        _("Warning: 32-bit element size and same destination "\
                           "and source operands makes instruction UNPREDICTABLE")
  #define BAD_EL_TYPE    _("bad element type for instruction")
+#define MVE_BAD_QREG   _("MVE vector register Q[0..7] expected")
  
  static struct hash_control * arm_ops_hsh;
  static struct hash_control * arm_cond_hsh;
@@ -1003,6 +1009,9 @@ static void it_fsm_post_encode (void);
      }                                                  \
    while (0)
  
+/* Return VALUE with bit POS toggled; VALUE itself is not modified.  */
+#define TOGGLE_BIT(value, pos) ((value) ^ (1 << (pos)))
+
  /* Pure syntax.         */
  
  /* This array holds the chars that always start a comment.  If the
@@ -1653,9 +1662,14 @@ parse_typed_reg_or_scalar (char **ccp, enum arm_reg_type type,
      {
        if (type != REG_TYPE_VFD
           && !(type == REG_TYPE_VFS
-              && ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_2)))
+              && ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_2))
+         && !(type == REG_TYPE_NQ
+              && ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)))
         {
-         first_error (_("only D registers may be indexed"));
+         if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+           first_error (_("only D and Q registers may be indexed"));
+         else
+           first_error (_("only D registers may be indexed"));
           return FAIL;
         }
  
@@ -1744,27 +1758,41 @@ arm_typed_reg_parse (char **ccp, enum arm_reg_type type,
     just do easy checks here, and do further checks later.  */
  
  static int
-parse_scalar (char **ccp, int elsize, struct neon_type_el *type)
+parse_scalar (char **ccp, int elsize, struct neon_type_el *type, enum
+             arm_reg_type reg_type)
  {
    int reg;
    char *str = *ccp;
    struct neon_typed_alias atype;
-  enum arm_reg_type reg_type = REG_TYPE_VFD;
-
-  if (elsize == 4)
-    reg_type = REG_TYPE_VFS;
+  unsigned reg_size;
  
    reg = parse_typed_reg_or_scalar (&str, reg_type, NULL, &atype);
  
+  switch (reg_type)
+    {
+    case REG_TYPE_VFS:
+      reg_size = 32;
+      break;
+    case REG_TYPE_VFD:
+      reg_size = 64;
+      break;
+    case REG_TYPE_MQ:
+      reg_size = 128;
+      break;
+    default:
+      gas_assert (0);
+      return FAIL;
+    }
+
    if (reg == FAIL || (atype.defined & NTA_HASINDEX) == 0)
      return FAIL;
  
-  if (atype.index == NEON_ALL_LANES)
+  if (reg_type != REG_TYPE_MQ && atype.index == NEON_ALL_LANES)
      {
        first_error (_("scalar must have an index"));
        return FAIL;
      }
-  else if (atype.index >= 64 / elsize)
+  else if (atype.index >= reg_size / elsize)
      {
        first_error (_("scalar index out of range"));
        return FAIL;
@@ -6539,7 +6567,61 @@ parse_neon_mov (char **str, int *which_operand)
    char *ptr = *str;
    struct neon_type_el optype;
  
-  if ((val = parse_scalar (&ptr, 8, &optype)) != FAIL)
+   if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL)
+    {
+      /* Cases 17 or 19.  */
+      inst.operands[i].reg = val;
+      inst.operands[i].isvec = 1;
+      inst.operands[i].isscalar = 2;
+      inst.operands[i].vectype = optype;
+      inst.operands[i++].present = 1;
+
+      if (skip_past_comma (&ptr) == FAIL)
+       goto wanted_comma;
+
+      if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL)
+       {
+         /* Case 17: VMOV<c>.<dt> <Qd[idx]>, <Rt>  */
+         inst.operands[i].reg = val;
+         inst.operands[i].isreg = 1;
+         inst.operands[i].present = 1;
+       }
+      else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL)
+       {
+         /* Case 19: VMOV<c> <Qd[idx]>, <Qd[idx2]>, <Rt>, <Rt2>  */
+         inst.operands[i].reg = val;
+         inst.operands[i].isvec = 1;
+         inst.operands[i].isscalar = 2;
+         inst.operands[i].vectype = optype;
+         inst.operands[i++].present = 1;
+
+         if (skip_past_comma (&ptr) == FAIL)
+           goto wanted_comma;
+
+         if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL)
+           goto wanted_arm;
+
+         inst.operands[i].reg = val;
+         inst.operands[i].isreg = 1;
+         inst.operands[i++].present = 1;
+
+         if (skip_past_comma (&ptr) == FAIL)
+           goto wanted_comma;
+
+         if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL)
+           goto wanted_arm;
+
+         inst.operands[i].reg = val;
+         inst.operands[i].isreg = 1;
+         inst.operands[i].present = 1;
+       }
+      else
+       {
+         first_error (_("expected ARM or MVE vector register"));
+         return FAIL;
+       }
+    }
+   else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_VFD)) != FAIL)
      {
        /* Case 4: VMOV<c><q>.<size> <Dn[x]>, <Rd>.  */
        inst.operands[i].reg = val;
@@ -6557,8 +6639,10 @@ parse_neon_mov (char **str, int *which_operand)
        inst.operands[i].isreg = 1;
        inst.operands[i].present = 1;
      }
-  else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, &optype))
-          != FAIL)
+  else if (((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, &optype))
+           != FAIL)
+          || ((val = arm_typed_reg_parse (&ptr, REG_TYPE_MQ, &rtype, &optype))
+              != FAIL))
      {
        /* Cases 0, 1, 2, 3, 5 (D only).  */
        if (skip_past_comma (&ptr) == FAIL)
@@ -6655,7 +6739,7 @@ parse_neon_mov (char **str, int *which_operand)
      }
    else if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL)
      {
-      /* Cases 6, 7.  */
+      /* Cases 6, 7, 16, 18.  */
        inst.operands[i].reg = val;
        inst.operands[i].isreg = 1;
        inst.operands[i++].present = 1;
@@ -6663,7 +6747,15 @@ parse_neon_mov (char **str, int *which_operand)
        if (skip_past_comma (&ptr) == FAIL)
         goto wanted_comma;
  
-      if ((val = parse_scalar (&ptr, 8, &optype)) != FAIL)
+      if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL)
+       {
+         /* Case 18: VMOV<c>.<dt> <Rt>, <Qn[idx]>  */
+         inst.operands[i].reg = val;
+         inst.operands[i].isscalar = 2;
+         inst.operands[i].present = 1;
+         inst.operands[i].vectype = optype;
+       }
+      else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_VFD)) != FAIL)
         {
           /* Case 6: VMOV<c><q>.<dt> <Rd>, <Dn[x]>  */
           inst.operands[i].reg = val;
@@ -6673,7 +6765,6 @@ parse_neon_mov (char **str, int *which_operand)
         }
        else if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL)
         {
-         /* Case 7: VMOV<c><q> <Rd>, <Rn>, <Dm>  */
           inst.operands[i].reg = val;
           inst.operands[i].isreg = 1;
           inst.operands[i++].present = 1;
@@ -6682,37 +6773,70 @@ parse_neon_mov (char **str, int *which_operand)
             goto wanted_comma;
  
           if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFSD, &rtype, &optype))
-             == FAIL)
+             != FAIL)
             {
-             first_error (_(reg_expected_msgs[REG_TYPE_VFSD]));
-             return FAIL;
-           }
-
-         inst.operands[i].reg = val;
-         inst.operands[i].isreg = 1;
-         inst.operands[i].isvec = 1;
-         inst.operands[i].issingle = (rtype == REG_TYPE_VFS);
-         inst.operands[i].vectype = optype;
-         inst.operands[i].present = 1;
+             /* Case 7: VMOV<c><q> <Rd>, <Rn>, <Dm>  */
  
-         if (rtype == REG_TYPE_VFS)
-           {
-             /* Case 14.  */
-             i++;
-             if (skip_past_comma (&ptr) == FAIL)
-               goto wanted_comma;
-             if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL,
-                                             &optype)) == FAIL)
-               {
-                 first_error (_(reg_expected_msgs[REG_TYPE_VFS]));
-                 return FAIL;
-               }
               inst.operands[i].reg = val;
               inst.operands[i].isreg = 1;
               inst.operands[i].isvec = 1;
-             inst.operands[i].issingle = 1;
+             inst.operands[i].issingle = (rtype == REG_TYPE_VFS);
               inst.operands[i].vectype = optype;
               inst.operands[i].present = 1;
+
+             if (rtype == REG_TYPE_VFS)
+               {
+                 /* Case 14.  */
+                 i++;
+                 if (skip_past_comma (&ptr) == FAIL)
+                   goto wanted_comma;
+                 if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL,
+                                                 &optype)) == FAIL)
+                   {
+                     first_error (_(reg_expected_msgs[REG_TYPE_VFS]));
+                     return FAIL;
+                   }
+                 inst.operands[i].reg = val;
+                 inst.operands[i].isreg = 1;
+                 inst.operands[i].isvec = 1;
+                 inst.operands[i].issingle = 1;
+                 inst.operands[i].vectype = optype;
+                 inst.operands[i].present = 1;
+               }
+           }
+         else
+           {
+             if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ))
+                      != FAIL)
+               {
+                 /* Case 16: VMOV<c> <Rt>, <Rt2>, <Qd[idx]>, <Qd[idx2]>  */
+                 inst.operands[i].reg = val;
+                 inst.operands[i].isvec = 1;
+                 inst.operands[i].isscalar = 2;
+                 inst.operands[i].vectype = optype;
+                 inst.operands[i++].present = 1;
+
+                 if (skip_past_comma (&ptr) == FAIL)
+                   goto wanted_comma;
+
+                 if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ))
+                     == FAIL)
+                   {
+                     first_error (_(reg_expected_msgs[REG_TYPE_MQ]));
+                     return FAIL;
+                   }
+                 inst.operands[i].reg = val;
+                 inst.operands[i].isvec = 1;
+                 inst.operands[i].isscalar = 2;
+                 inst.operands[i].vectype = optype;
+                 inst.operands[i].present = 1;
+               }
+             else
+               {
+                 first_error (_("VFP single, double or MVE vector register"
+                              " expected"));
+                 return FAIL;
+               }
             }
         }
        else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL, &optype))
@@ -6776,10 +6900,12 @@ enum operand_parse_code
    OP_RNQ,      /* Neon quad precision register */
    OP_RNQMQ,    /* Neon quad or MVE vector register.  */
    OP_RVSD,     /* VFP single or double precision register */
+  OP_RVSD_COND,        /* VFP single, double precision register or condition code.  */
    OP_RVSDMQ,   /* VFP single, double precision or MVE vector register.  */
    OP_RNSD,      /* Neon single or double precision register */
    OP_RNDQ,      /* Neon double or quad precision register */
    OP_RNDQMQ,     /* Neon double, quad or MVE vector register.  */
+  OP_RNDQMQR,   /* Neon double, quad, MVE vector or ARM register.  */
    OP_RNSDQ,    /* Neon single, double or quad precision register */
    OP_RNSC,      /* Neon scalar D[X] */
    OP_RVC,      /* VFP control register */
@@ -6799,12 +6925,15 @@ enum operand_parse_code
    OP_RNSDQMQR, /* Neon single, double or quad register, MVE vector register or
                    GPR (no SP/SP)  */
    OP_RMQ,      /* MVE vector register.  */
+  OP_RMQRZ,    /* MVE vector or ARM register including ZR.  */
+  OP_RMQRR,     /* MVE vector or ARM register.  */
  
    /* New operands for Armv8.1-M Mainline.  */
    OP_LR,       /* ARM LR register */
    OP_RRe,      /* ARM register, only even numbered.  */
    OP_RRo,      /* ARM register, only odd numbered, not r13 or r15.  */
    OP_RRnpcsp_I32, /* ARM register (no BadReg) or literal 1 .. 32 */
+  OP_RR_ZR,    /* ARM register or ZR but no PC */
  
    OP_REGLST,   /* ARM register list */
    OP_CLRMLST,  /* CLRM register list */
@@ -6820,16 +6949,28 @@ enum operand_parse_code
    OP_RNDQ_I0,   /* Neon D or Q reg, or immediate zero.  */
    OP_RVSD_I0,  /* VFP S or D reg, or immediate zero.  */
    OP_RSVD_FI0, /* VFP S or D reg, or floating point immediate zero.  */
+  OP_RSVDMQ_FI0, /* VFP S, D, MVE vector register or floating point immediate
+                   zero.  */
    OP_RR_RNSC,   /* ARM reg or Neon scalar.  */
    OP_RNSD_RNSC, /* Neon S or D reg, or Neon scalar.  */
    OP_RNSDQ_RNSC, /* Vector S, D or Q reg, or Neon scalar.  */
    OP_RNSDQ_RNSC_MQ, /* Vector S, D or Q reg, Neon scalar or MVE vector register.
                      */
+  OP_RNSDQ_RNSC_MQ_RR, /* Vector S, D or Q reg, or MVE vector reg , or Neon
+                         scalar, or ARM register.  */
    OP_RNDQ_RNSC, /* Neon D or Q reg, or Neon scalar.  */
+  OP_RNDQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, or ARM register.  */
+  OP_RNDQMQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, MVE vector or ARM
+                       register.  */
+  OP_RNDQMQ_RNSC, /* Neon D, Q or MVE vector reg, or Neon scalar.  */
    OP_RND_RNSC,  /* Neon D reg, or Neon scalar.  */
    OP_VMOV,      /* Neon VMOV operands.  */
    OP_RNDQ_Ibig,        /* Neon D or Q reg, or big immediate for logic and VMVN.  */
+  /* Neon D, Q or MVE vector register, or big immediate for logic and VMVN.  */
+  OP_RNDQMQ_Ibig,
    OP_RNDQ_I63b, /* Neon D or Q reg, or immediate for shift.  */
+  OP_RNDQMQ_I63b_RR, /* Neon D or Q reg, immediate for shift, MVE vector or
+                       ARM register.  */
    OP_RIWR_I32z, /* iWMMXt wR register, or immediate 0 .. 32 for iWMMXt2.  */
    OP_VLDR,     /* VLDR operand.  */
  
@@ -6910,6 +7051,8 @@ enum operand_parse_code
    OP_oROR,      /* ROR 0/8/16/24 */
    OP_oBARRIER_I15, /* Option argument for a barrier instruction.  */
  
+  OP_oRMQRZ,   /* optional MVE vector or ARM register including ZR.  */
+
    /* Some pre-defined mixed (ARM/THUMB) operands.  */
    OP_RR_npcsp          = MIX_ARM_THUMB_OPERANDS (OP_RR, OP_RRnpcsp),
    OP_RRnpc_npcsp       = MIX_ARM_THUMB_OPERANDS (OP_RRnpc, OP_RRnpcsp),
@@ -6959,6 +7102,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
        inst.operands[i].isvec = (rtype == REG_TYPE_VFS          \
                              || rtype == REG_TYPE_VFD           \
                              || rtype == REG_TYPE_NQ);          \
+      inst.operands[i].iszr = (rtype == REG_TYPE_ZR);          \
      }                                                          \
    while (0)
  
@@ -6977,6 +7121,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
        inst.operands[i].isvec = (rtype == REG_TYPE_VFS          \
                              || rtype == REG_TYPE_VFD           \
                              || rtype == REG_TYPE_NQ);          \
+      inst.operands[i].iszr = (rtype == REG_TYPE_ZR);          \
      }                                                          \
    while (0)
  
@@ -6989,10 +7134,11 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
      }                                                          \
    while (0)
  
-#define po_scalar_or_goto(elsz, label)                                 \
+#define po_scalar_or_goto(elsz, label, reg_type)                       \
    do                                                                   \
      {                                                                  \
-      val = parse_scalar (& str, elsz, & inst.operands[i].vectype);    \
+      val = parse_scalar (& str, elsz, & inst.operands[i].vectype,     \
+                         reg_type);                                    \
        if (val == FAIL)                                                 \
         goto label;                                                     \
        inst.operands[i].reg = val;                                      \
@@ -7090,7 +7236,20 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           break;
           /* Also accept generic coprocessor regs for unknown registers.  */
           coproc_reg:
-         po_reg_or_fail (REG_TYPE_CN);
+         po_reg_or_goto (REG_TYPE_CN, vpr_po);
+         break;
+         /* Also accept P0 or p0 for VPR.P0.  Since P0 is already an
+            existing register with a value of 0, this seems like the
+            best way to parse P0.  */
+         vpr_po:
+         if (strncasecmp (str, "P0", 2) == 0)
+           {
+             str += 2;
+             inst.operands[i].isreg = 1;
+             inst.operands[i].reg = 13;
+           }
+         else
+           goto failure;
           break;
         case OP_RMF:   po_reg_or_fail (REG_TYPE_MVF);     break;
         case OP_RMD:   po_reg_or_fail (REG_TYPE_MVD);     break;
@@ -7109,6 +7268,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         try_nq:
         case OP_RNQ:   po_reg_or_fail (REG_TYPE_NQ);      break;
         case OP_RNSD:  po_reg_or_fail (REG_TYPE_NSD);     break;
+       case OP_RNDQMQR:
+         po_reg_or_goto (REG_TYPE_RN, try_rndqmq);
+         break;
+       try_rndqmq:
         case OP_oRNDQMQ:
         case OP_RNDQMQ:
           po_reg_or_goto (REG_TYPE_MQ, try_rndq);
@@ -7121,6 +7284,9 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           break;
         try_rvsd:
         case OP_RVSD:  po_reg_or_fail (REG_TYPE_VFSD);    break;
+       case OP_RVSD_COND:
+         po_reg_or_goto (REG_TYPE_VFSD, try_cond);
+         break;
         case OP_oRNSDQ:
         case OP_RNSDQ: po_reg_or_fail (REG_TYPE_NSDQ);    break;
         case OP_RNSDQMQR:
@@ -7135,12 +7301,16 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_reg_or_fail (REG_TYPE_NSDQ);
           inst.error = 0;
           break;
+       case OP_RMQRR:
+         po_reg_or_goto (REG_TYPE_RN, try_rmq);
+         break;
+       try_rmq:
         case OP_RMQ:
           po_reg_or_fail (REG_TYPE_MQ);
           break;
         /* Neon scalar. Using an element size of 8 means that some invalid
            scalars are accepted here, so deal with those in later code.  */
-       case OP_RNSC:  po_scalar_or_goto (8, failure);    break;
+       case OP_RNSC:  po_scalar_or_goto (8, failure, REG_TYPE_VFD);    break;
  
         case OP_RNDQ_I0:
           {
@@ -7155,6 +7325,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_reg_or_goto (REG_TYPE_VFSD, try_imm0);
           break;
  
+       case OP_RSVDMQ_FI0:
+         po_reg_or_goto (REG_TYPE_MQ, try_rsvd_fi0);
+         break;
+       try_rsvd_fi0:
         case OP_RSVD_FI0:
           {
             po_reg_or_goto (REG_TYPE_VFSD, try_ifimm0);
@@ -7173,41 +7347,58 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
  
         case OP_RR_RNSC:
           {
-           po_scalar_or_goto (8, try_rr);
+           po_scalar_or_goto (8, try_rr, REG_TYPE_VFD);
             break;
             try_rr:
             po_reg_or_fail (REG_TYPE_RN);
           }
           break;
  
+       case OP_RNSDQ_RNSC_MQ_RR:
+         po_reg_or_goto (REG_TYPE_RN, try_rnsdq_rnsc_mq);
+         break;
+       try_rnsdq_rnsc_mq:
         case OP_RNSDQ_RNSC_MQ:
           po_reg_or_goto (REG_TYPE_MQ, try_rnsdq_rnsc);
           break;
         try_rnsdq_rnsc:
         case OP_RNSDQ_RNSC:
           {
-           po_scalar_or_goto (8, try_nsdq);
+           po_scalar_or_goto (8, try_nsdq, REG_TYPE_VFD);
+           inst.error = 0;
             break;
             try_nsdq:
             po_reg_or_fail (REG_TYPE_NSDQ);
+           inst.error = 0;
           }
           break;
  
         case OP_RNSD_RNSC:
           {
-           po_scalar_or_goto (8, try_s_scalar);
+           po_scalar_or_goto (8, try_s_scalar, REG_TYPE_VFD);
             break;
             try_s_scalar:
-           po_scalar_or_goto (4, try_nsd);
+           po_scalar_or_goto (4, try_nsd, REG_TYPE_VFS);
             break;
             try_nsd:
             po_reg_or_fail (REG_TYPE_NSD);
           }
           break;
  
+       case OP_RNDQMQ_RNSC_RR:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc_rr);
+         break;
+       try_rndq_rnsc_rr:
+       case OP_RNDQ_RNSC_RR:
+         po_reg_or_goto (REG_TYPE_RN, try_rndq_rnsc);
+         break;
+       case OP_RNDQMQ_RNSC:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc);
+         break;
+       try_rndq_rnsc:
         case OP_RNDQ_RNSC:
           {
-           po_scalar_or_goto (8, try_ndq);
+           po_scalar_or_goto (8, try_ndq, REG_TYPE_VFD);
             break;
             try_ndq:
             po_reg_or_fail (REG_TYPE_NDQ);
@@ -7216,7 +7407,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
  
         case OP_RND_RNSC:
           {
-           po_scalar_or_goto (8, try_vfd);
+           po_scalar_or_goto (8, try_vfd, REG_TYPE_VFD);
             break;
             try_vfd:
             po_reg_or_fail (REG_TYPE_VFD);
@@ -7229,6 +7420,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_misc_or_fail (parse_neon_mov (&str, &i) == FAIL);
           break;
  
+       case OP_RNDQMQ_Ibig:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_ibig);
+         break;
+       try_rndq_ibig:
         case OP_RNDQ_Ibig:
           {
             po_reg_or_goto (REG_TYPE_NDQ, try_immbig);
@@ -7245,6 +7440,13 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           }
           break;
  
+       case OP_RNDQMQ_I63b_RR:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_i63b_rr);
+         break;
+       try_rndq_i63b_rr:
+         po_reg_or_goto (REG_TYPE_RN, try_rndq_i63b);
+         break;
+       try_rndq_i63b:
         case OP_RNDQ_I63b:
           {
             po_reg_or_goto (REG_TYPE_NDQ, try_shimm);
@@ -7374,6 +7576,9 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         case OP_RRnpc_I0: po_reg_or_goto (REG_TYPE_RN, I0);   break;
         I0:               po_imm_or_fail (0, 0, FALSE);       break;
  
+       case OP_RRnpcsp_I32: po_reg_or_goto (REG_TYPE_RN, I32); break;
+       I32:                 po_imm_or_fail (1, 32, FALSE);     break;
+
         case OP_RF_IF:    po_reg_or_goto (REG_TYPE_FN, IF);   break;
         IF:
           if (!is_immediate_prefix (*str))
@@ -7427,6 +7632,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         case OP_CPSF:    val = parse_cps_flags (&str);          break;
         case OP_ENDI:    val = parse_endian_specifier (&str);   break;
         case OP_oROR:    val = parse_ror (&str);                break;
+       try_cond:
         case OP_COND:    val = parse_cond (&str);               break;
         case OP_oBARRIER_I15:
           po_barrier_or_imm (str); break;
@@ -7600,6 +7806,19 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_misc_or_fail (parse_shift (&str, i, SHIFT_LSL_OR_ASR_IMMEDIATE));
           break;
  
+       case OP_RMQRZ:
+       case OP_oRMQRZ:
+         po_reg_or_goto (REG_TYPE_MQ, try_rr_zr);
+         break;
+
+       case OP_RR_ZR:
+       try_rr_zr:
+         po_reg_or_goto (REG_TYPE_RN, ZR);
+         break;
+       ZR:
+         po_reg_or_fail (REG_TYPE_ZR);
+         break;
+
         default:
           as_fatal (_("unhandled operand code %d"), op_parse_code);
         }
@@ -7621,6 +7840,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
  
         case OP_oRRnpcsp:
         case OP_RRnpcsp:
+       case OP_RRnpcsp_I32:
           if (inst.operands[i].isreg)
             {
               if (inst.operands[i].reg == REG_PC)
@@ -7643,10 +7863,12 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
             inst.error = BAD_PC;
           break;
  
+       case OP_RVSD_COND:
         case OP_VLDR:
           if (inst.operands[i].isreg)
             break;
         /* fall through.  */
+
         case OP_CPSF:
         case OP_ENDI:
         case OP_oROR:
@@ -7675,6 +7897,13 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
             inst.error = _("operand must be LR register");
           break;
  
+       case OP_RMQRZ:
+       case OP_oRMQRZ:
+       case OP_RR_ZR:
+         if (!inst.operands[i].iszr && inst.operands[i].reg == REG_PC)
+           inst.error = BAD_PC;
+         break;
+
         case OP_RRe:
           if (inst.operands[i].isreg
               && (inst.operands[i].reg & 0x00000001) != 0)
@@ -8491,6 +8720,11 @@ move_or_literal_pool (int i, enum lit_type t, bfd_boolean mode_3)
                       inst.instruction |= (imm & 0x0800) << 15;
                       inst.instruction |= (imm & 0x0700) << 4;
                       inst.instruction |= (imm & 0x00ff);
+                     /*  In case this replacement is being done on Armv8-M
+                         Baseline we need to make sure to disable the
+                         instruction size check, as otherwise GAS will reject
+                         the use of this T32 instruction.  */
+                     inst.size_req = 0;
                       return TRUE;
                     }
                 }
@@ -9615,10 +9849,42 @@ do_vmrs (void)
        return;
      }
  
-  /* MVFR2 is only valid at ARMv8-A.  */
-  if (inst.operands[1].reg == 5)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-               _(BAD_FPU));
+  switch (inst.operands[1].reg)
+    {
+    /* MVFR2 is only valid for Armv8-A.  */
+    case 5:
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+                 _(BAD_FPU));
+      break;
+
+    /* Check for new Armv8.1-M Mainline changes to <spec_reg>.  */
+    case 1: /* fpscr.  */
+      constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                   || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _(BAD_FPU));
+      break;
+
+    case 14: /* fpcxt_ns.  */
+    case 15: /* fpcxt_s.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main),
+                 _("selected processor does not support instruction"));
+      break;
+
+    case  2: /* fpscr_nzcvqc.  */
+    case 12: /* vpr.  */
+    case 13: /* p0.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main)
+                 || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                     && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _("selected processor does not support instruction"));
+      if (inst.operands[1].reg != 2 /* Special register is operand 1.  */
+         && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE"));
+      break;
+
+    default:
+      break;
+    }
  
    /* APSR_ sets isvec. All other refs to PC are illegal.  */
    if (!inst.operands[0].isvec && Rt == REG_PC)
@@ -9646,10 +9912,42 @@ do_vmsr (void)
        return;
      }
  
-  /* MVFR2 is only valid for ARMv8-A.  */
-  if (inst.operands[0].reg == 5)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-               _(BAD_FPU));
+  switch (inst.operands[0].reg)
+    {
+    /* MVFR2 is only valid for Armv8-A.  */
+    case 5:
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+                 _(BAD_FPU));
+      break;
+
+    /* Check for new Armv8.1-M Mainline changes to <spec_reg>.  */
+    case  1: /* fpscr.  */
+      constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                   || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _(BAD_FPU));
+      break;
+
+    case 14: /* fpcxt_ns.  */
+    case 15: /* fpcxt_s.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main),
+                 _("selected processor does not support instruction"));
+      break;
+
+    case  2: /* fpscr_nzcvqc.  */
+    case 12: /* vpr.  */
+    case 13: /* p0.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main)
+                 || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                     && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _("selected processor does not support instruction"));
+      if (inst.operands[0].reg != 2
+         && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE"));
+      break;
+
+    default:
+      break;
+    }
  
    /* If we get through parsing the register name, we just insert the number
       generated into the instruction without further validation.  */
@@ -9949,6 +10247,9 @@ do_shift (void)
  static void
  do_smc (void)
  {
+  unsigned int value = inst.relocs[0].exp.X_add_number;
+  constraint (value > 0xf, _("immediate too large (bigger than 0xF)"));
+
    inst.relocs[0].type = BFD_RELOC_ARM_SMC;
    inst.relocs[0].pc_rel = 0;
  }
@@ -10169,6 +10470,10 @@ do_sxth (void)
  static void
  do_vfp_sp_monadic (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Sd);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Sm);
  }
@@ -10204,6 +10509,10 @@ do_vfp_sp_dp_cvt (void)
  static void
  do_vfp_reg_from_sp (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+            && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+            _(BAD_FPU));
+
    inst.instruction |= inst.operands[0].reg << 12;
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Sn);
  }
@@ -10221,6 +10530,10 @@ do_vfp_reg2_from_sp2 (void)
  static void
  do_vfp_sp_from_reg (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+            && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+            _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Sn);
    inst.instruction |= inst.operands[1].reg << 12;
  }
@@ -10323,6 +10636,10 @@ do_vfp_xp_ldstmdb (void)
  static void
  do_vfp_dp_rd_rm (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dd);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dm);
  }
@@ -10344,6 +10661,10 @@ do_vfp_dp_rd_rn (void)
  static void
  do_vfp_dp_rd_rn_rm (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dd);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dn);
    encode_arm_vfp_reg (inst.operands[2].reg, VFP_REG_Dm);
@@ -10358,6 +10679,10 @@ do_vfp_dp_rd (void)
  static void
  do_vfp_dp_rm_rd_rn (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dm);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dd);
    encode_arm_vfp_reg (inst.operands[2].reg, VFP_REG_Dn);
@@ -10869,7 +11194,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
      inst.error = _("instruction does not accept unindexed addressing");
  }
  
-/* Table of Thumb instructions which exist in both 16- and 32-bit
+/* Table of Thumb instructions which exist in 16- and/or 32-bit
     encodings (the latter only in post-V6T2 cores).  The index is the
     value used in the insns table below.  When there is more than one
     possible 16-bit encoding for the instruction, this table always
@@ -10898,16 +11223,27 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
    X(_bflx,  0000, f070e001),                   \
    X(_bic,   4380, ea200000),                   \
    X(_bics,  4380, ea300000),                   \
+  X(_cinc,  0000, ea509000),                   \
+  X(_cinv,  0000, ea50a000),                   \
    X(_cmn,   42c0, eb100f00),                   \
    X(_cmp,   2800, ebb00f00),                   \
+  X(_cneg,  0000, ea50b000),                   \
    X(_cpsie, b660, f3af8400),                   \
    X(_cpsid, b670, f3af8600),                   \
    X(_cpy,   4600, ea4f0000),                   \
+  X(_csel,  0000, ea508000),                   \
+  X(_cset,  0000, ea5f900f),                   \
+  X(_csetm, 0000, ea5fa00f),                   \
+  X(_csinc, 0000, ea509000),                   \
+  X(_csinv, 0000, ea50a000),                   \
+  X(_csneg, 0000, ea50b000),                   \
    X(_dec_sp,80dd, f1ad0d00),                   \
    X(_dls,   0000, f040e001),                   \
+  X(_dlstp, 0000, f000e001),                   \
    X(_eor,   4040, ea800000),                   \
    X(_eors,  4040, ea900000),                   \
    X(_inc_sp,00dd, f10d0d00),                   \
+  X(_lctp,  0000, f00fe001),                   \
    X(_ldmia, c800, e8900000),                   \
    X(_ldr,   6800, f8500000),                   \
    X(_ldrb,  7800, f8100000),                   \
@@ -10918,6 +11254,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
    X(_ldr_pc2,4800, f85f0000),                  \
    X(_ldr_sp,9800, f85d0000),                   \
    X(_le,    0000, f00fc001),                   \
+  X(_letp,  0000, f01fc001),                   \
    X(_lsl,   0000, fa00f000),                   \
    X(_lsls,  0000, fa10f000),                   \
    X(_lsr,   0800, fa20f000),                   \
@@ -10960,6 +11297,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
    X(_wfe,   bf20, f3af8002),                   \
    X(_wfi,   bf30, f3af8003),                   \
    X(_wls,   0000, f040c001),                   \
+  X(_wlstp, 0000, f000c001),                   \
    X(_sev,   bf40, f3af8004),                    \
    X(_sevl,  bf50, f3af8005),                   \
    X(_udf,   de00, f7f0a000)
@@ -11713,6 +12051,60 @@ do_t_clz (void)
    inst.instruction |= Rm;
  }
  
+/* Encoder for the Armv8.1-M conditional instructions.  Three operand
+   layouts are handled:
+     csel/csinc/csinv/csneg  Rd, Rn, Rm, cond
+     cinc/cinv/cneg          Rd, Rn, cond   (Rm = Rn, bit 0 of cond toggled)
+     cset/csetm              Rd, cond       (Rn = Rm = PC, bit 0 toggled)
+   Rd lands at bit 8, Rn at bit 16, Rm at bit 0 and the condition at
+   bit 4.  */
+static void
+do_t_cond (void)
+{
+  unsigned Rd = inst.operands[0].reg;
+  unsigned Rn, Rm;
+  signed int cond;
+
+  constraint (inst.cond != COND_ALWAYS, BAD_COND);
+
+  switch (inst.instruction)
+    {
+    case T_MNEM_csel:
+    case T_MNEM_csinc:
+    case T_MNEM_csinv:
+    case T_MNEM_csneg:
+      /* Neither source register may be SP.  */
+      Rn = inst.operands[1].reg;
+      constraint (Rn == REG_SP, BAD_SP);
+      Rm = inst.operands[2].reg;
+      constraint (Rm == REG_SP, BAD_SP);
+      cond = inst.operands[3].imm;
+      break;
+
+    case T_MNEM_cinc:
+    case T_MNEM_cinv:
+    case T_MNEM_cneg:
+      /* One source register fills both Rn and Rm.  */
+      Rn = inst.operands[1].reg;
+      constraint (Rn == REG_SP, BAD_SP);
+      Rm = Rn;
+      /* Flipping the lowest bit inverts the condition.  */
+      cond = inst.operands[2].imm;
+      cond = TOGGLE_BIT (cond, 0);
+      break;
+
+    case T_MNEM_cset:
+    case T_MNEM_csetm:
+      Rn = REG_PC;
+      Rm = REG_PC;
+      /* Flipping the lowest bit inverts the condition.  */
+      cond = inst.operands[1].imm;
+      cond = TOGGLE_BIT (cond, 0);
+      break;
+
+    default:
+      abort ();
+    }
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  inst.instruction = THUMB_OP32 (inst.instruction);
+  inst.instruction |= (Rd << 8) | (Rn << 16) | Rm;
+  inst.instruction |= cond << 4;
+}
+
  static void
  do_t_csdb (void)
  {
@@ -11861,18 +12253,6 @@ do_t_it (void)
    inst.instruction |= cond << 4;
  }
  
-static void
-do_mve_vpt (void)
-{
-  /* We are dealing with a vector predicated block.  */
-  set_pred_insn_type (VPT_INSN);
-  now_pred.cc = 0;
-  now_pred.mask = ((inst.instruction & 0x00400000) >> 19)
-                 | ((inst.instruction & 0xe000) >> 13);
-  now_pred.warn_deprecated = FALSE;
-  now_pred.type = VECTOR_PRED;
-}
-
  /* Helper function used for both push/pop and ldm/stm.  */
  static void
  encode_thumb2_multi (bfd_boolean do_io, int base, unsigned mask,
@@ -13500,10 +13880,11 @@ do_t_smc (void)
               _("SMC is not permitted on this architecture"));
    constraint (inst.relocs[0].exp.X_op != O_constant,
               _("expression too complex"));
+  constraint (value > 0xf, _("immediate too large (bigger than 0xF)"));
+
    inst.relocs[0].type = BFD_RELOC_UNUSED;
-  inst.instruction |= (value & 0xf000) >> 12;
-  inst.instruction |= (value & 0x0ff0);
    inst.instruction |= (value & 0x000f) << 16;
+
    /* PR gas/15623: SMC instructions must be last in an IT block.  */
    set_pred_insn_type_last ();
  }
@@ -13897,35 +14278,34 @@ v8_1_loop_reloc (int is_le)
      }
  }
  
-/* To handle the Scalar Low Overhead Loop instructions
-   in Armv8.1-M Mainline.  */
+/* For shifts in MVE.
+   Operand 0 is placed at bit 16 and operand 1 at bit 8.  Operand 2 is
+   either a register, placed at bit 12, or an immediate whose bits 2..4
+   go to instruction bits 12..14 and whose bits 0..1 go to bits 6..7.  */
  static void
-do_t_loloop (void)
+do_mve_scalar_shift (void)
  {
-  unsigned long insn = inst.instruction;
-
-  set_pred_insn_type (OUTSIDE_PRED_INSN);
-  inst.instruction = THUMB_OP32 (inst.instruction);
-
-  switch (insn)
+  /* Only two operands were given: treat the second one as operand 2 and
+     fill the operand 1 register field with 0xf.  */
+  if (!inst.operands[2].present)
      {
-    case T_MNEM_le:
-      /* le <label>.  */
-      if (!inst.operands[0].present)
-       inst.instruction |= 1 << 21;
-
-      v8_1_loop_reloc (TRUE);
-      break;
+      inst.operands[2] = inst.operands[1];
+      inst.operands[1].reg = 0xf;
+    }
  
-    case T_MNEM_wls:
-      v8_1_loop_reloc (FALSE);
-      /* Fall through.  */
-    case T_MNEM_dls:
-      constraint (inst.operands[1].isreg != 1, BAD_ARGS);
-      inst.instruction |= (inst.operands[1].reg << 16);
-      break;
+  inst.instruction |= inst.operands[0].reg << 16;
+  inst.instruction |= inst.operands[1].reg << 8;
  
-    default: abort();
+  if (inst.operands[2].isreg)
+    {
+      /* Assuming Rm is already checked not to be 11x1.  */
+      /* The shift-amount register must differ from both other registers.  */
+      constraint (inst.operands[2].reg == inst.operands[0].reg, BAD_OVERLAP);
+      constraint (inst.operands[2].reg == inst.operands[1].reg, BAD_OVERLAP);
+      inst.instruction |= inst.operands[2].reg << 12;
+    }
+  else
+    {
+      /* Assuming imm is already checked as [1,32].  */
+      unsigned int value = inst.operands[2].imm;
+      inst.instruction |= (value & 0x1c) << 10;
+      inst.instruction |= (value & 0x03) << 6;
+      /* Change last 4 bits from 0xd to 0xf.  */
+      inst.instruction |= 0x2;
      }
  }
  
@@ -13961,6 +14341,55 @@ do_t_loloop (void)
  #define M_MNEM_vldrh   0xec100e10
  #define M_MNEM_vldrw   0xec100e40
  #define M_MNEM_vldrd   0xec100e50
+#define M_MNEM_vmovlt  0xeea01f40
+#define M_MNEM_vmovlb  0xeea00f40
+#define M_MNEM_vmovnt  0xfe311e81
+#define M_MNEM_vmovnb  0xfe310e81
+#define M_MNEM_vadc    0xee300f00
+#define M_MNEM_vadci   0xee301f00
+#define M_MNEM_vbrsr   0xfe011e60
+#define M_MNEM_vaddlv  0xee890f00
+#define M_MNEM_vaddlva 0xee890f20
+#define M_MNEM_vaddv   0xeef10f00
+#define M_MNEM_vaddva  0xeef10f20
+#define M_MNEM_vddup   0xee011f6e
+#define M_MNEM_vdwdup  0xee011f60
+#define M_MNEM_vidup   0xee010f6e
+#define M_MNEM_viwdup  0xee010f60
+#define M_MNEM_vmaxv   0xeee20f00
+#define M_MNEM_vmaxav  0xeee00f00
+#define M_MNEM_vminv   0xeee20f80
+#define M_MNEM_vminav  0xeee00f80
+#define M_MNEM_vmlaldav          0xee800e00
+#define M_MNEM_vmlaldava  0xee800e20
+#define M_MNEM_vmlaldavx  0xee801e00
+#define M_MNEM_vmlaldavax 0xee801e20
+#define M_MNEM_vmlsldav          0xee800e01
+#define M_MNEM_vmlsldava  0xee800e21
+#define M_MNEM_vmlsldavx  0xee801e01
+#define M_MNEM_vmlsldavax 0xee801e21
+#define M_MNEM_vrmlaldavhx  0xee801f00
+#define M_MNEM_vrmlaldavhax 0xee801f20
+#define M_MNEM_vrmlsldavh   0xfe800e01
+#define M_MNEM_vrmlsldavha  0xfe800e21
+#define M_MNEM_vrmlsldavhx  0xfe801e01
+#define M_MNEM_vrmlsldavhax 0xfe801e21
+#define M_MNEM_vqmovnt   0xee331e01
+#define M_MNEM_vqmovnb   0xee330e01
+#define M_MNEM_vqmovunt          0xee311e81
+#define M_MNEM_vqmovunb          0xee310e81
+#define M_MNEM_vshrnt      0xee801fc1
+#define M_MNEM_vshrnb      0xee800fc1
+#define M_MNEM_vrshrnt     0xfe801fc1
+#define M_MNEM_vqshrnt     0xee801f40
+#define M_MNEM_vqshrnb     0xee800f40
+#define M_MNEM_vqshrunt            0xee801fc0
+#define M_MNEM_vqshrunb            0xee800fc0
+#define M_MNEM_vrshrnb     0xfe800fc1
+#define M_MNEM_vqrshrnt            0xee801f41
+#define M_MNEM_vqrshrnb            0xee800f41
+#define M_MNEM_vqrshrunt    0xfe801fc0
+#define M_MNEM_vqrshrunb    0xfe800fc0
  
  /* Neon instruction encoder helpers.  */
  
@@ -14125,6 +14554,13 @@ NEON_ENC_TAB
       - a table used to drive neon_select_shape.  */
  
  #define NEON_SHAPE_DEF                 \
+  X(4, (R, R, Q, Q), QUAD),            \
+  X(4, (Q, R, R, I), QUAD),            \
+  X(4, (R, R, S, S), QUAD),            \
+  X(4, (S, S, R, R), QUAD),            \
+  X(3, (Q, R, I), QUAD),               \
+  X(3, (I, Q, Q), QUAD),               \
+  X(3, (I, Q, R), QUAD),               \
    X(3, (R, Q, Q), QUAD),               \
    X(3, (D, D, D), DOUBLE),             \
    X(3, (Q, Q, Q), QUAD),               \
@@ -14133,6 +14569,8 @@ NEON_ENC_TAB
    X(3, (D, D, S), DOUBLE),             \
    X(3, (Q, Q, S), QUAD),               \
    X(3, (Q, Q, R), QUAD),               \
+  X(3, (R, R, Q), QUAD),               \
+  X(2, (R, Q),   QUAD),                \
    X(2, (D, D), DOUBLE),                        \
    X(2, (Q, Q), QUAD),                  \
    X(2, (D, S), DOUBLE),                        \
@@ -14169,6 +14607,8 @@ NEON_ENC_TAB
    X(2, (R, S), SINGLE),                        \
    X(2, (F, R), SINGLE),                        \
    X(2, (R, F), SINGLE),                        \
+/* Used for MVE tail predicated loop instructions.  */\
+  X(2, (R, R), QUAD),                  \
  /* Half float shape supported so far.  */\
    X (2, (H, D), MIXED),                        \
    X (2, (D, H), MIXED),                        \
@@ -15163,21 +15603,640 @@ do_vfp_nsyn_nmul (void)
  
  }
  
-static void
-do_vfp_nsyn_cmp (void)
+/* Turn a size (8, 16, 32, 64) into the respective bit number minus 3
+   (0, 1, 2, 3).  ffs numbers bits starting from 1, which is why 4 rather
+   than 3 is subtracted below.  */
+
+static unsigned
+neon_logbits (unsigned x)
  {
-  enum neon_shape rs;
-  if (inst.operands[1].isreg)
-    {
-      rs = neon_select_shape (NS_HH, NS_FF, NS_DD, NS_NULL);
-      neon_check_type (2, rs, N_EQK | N_VFP, N_F_ALL | N_KEY | N_VFP);
+  return ffs (x) - 4;
+}
  
-      if (rs == NS_FF || rs == NS_HH)
-       {
-         NEON_ENCODE (SINGLE, inst);
-         do_vfp_sp_monadic ();
-       }
-      else
+/* LOW4 keeps the low four bits of a register number; HI1 extracts its
+   bit 4.  */
+#define LOW4(R) ((R) & 0xf)
+#define HI1(R) (((R) >> 4) & 1)
+
+/* Translate the condition stored in inst.operands[0].imm into the value
+   (0..7) that the VCMP/VPT encoders split across the instruction, given
+   the element type ET.  Which conditions are accepted depends on ET.type.
+   On an unsupported element type or condition an error is recorded with
+   first_error and 0 is returned.  */
+static unsigned
+mve_get_vcmp_vpt_cond (struct neon_type_el et)
+{
+  switch (et.type)
+    {
+    default:
+      first_error (BAD_EL_TYPE);
+      return 0;
+    case NT_float:
+      switch (inst.operands[0].imm)
+       {
+       default:
+         first_error (_("invalid condition"));
+         return 0;
+       case 0x0:
+         /* eq.  */
+         return 0;
+       case 0x1:
+         /* ne.  */
+         return 1;
+       case 0xa:
+         /* ge.  */
+         return 4;
+       case 0xb:
+         /* lt.  */
+         return 5;
+       case 0xc:
+         /* gt.  */
+         return 6;
+       case 0xd:
+         /* le.  */
+         return 7;
+       }
+    case NT_integer:
+      /* only accept eq and ne.  */
+      if (inst.operands[0].imm > 1)
+       {
+         first_error (_("invalid condition"));
+         return 0;
+       }
+      return inst.operands[0].imm;
+    case NT_unsigned:
+      /* Only conditions 0x2 (cs) and 0x8 (hi) are accepted.  */
+      if (inst.operands[0].imm == 0x2)
+       return 2;
+      else if (inst.operands[0].imm == 0x8)
+       return 3;
+      else
+       {
+         first_error (_("invalid condition"));
+         return 0;
+       }
+    case NT_signed:
+      switch (inst.operands[0].imm)
+       {
+         default:
+           first_error (_("invalid condition"));
+           return 0;
+         case 0xa:
+           /* ge.  */
+           return 4;
+         case 0xb:
+           /* lt.  */
+           return 5;
+         case 0xc:
+           /* gt.  */
+           return 6;
+         case 0xd:
+           /* le.  */
+           return 7;
+       }
+    }
+  /* Should be unreachable.  */
+  abort ();
+}
+
+/* Handle an instruction that opens a vector predicated block.  When
+   operands are present the comparison they describe is encoded first;
+   in every case the predication state in now_pred is then initialised
+   from the instruction's mask bits.  */
+static void
+do_mve_vpt (void)
+{
+  /* We are dealing with a vector predicated block.  */
+  if (inst.operands[0].present)
+    {
+      enum neon_shape rs = neon_select_shape (NS_IQQ, NS_IQR, NS_NULL);
+      struct neon_type_el et
+       = neon_check_type (3, rs, N_EQK, N_KEY | N_F_MVE | N_I_MVE | N_SU_32,
+                          N_EQK);
+
+      unsigned fcond = mve_get_vcmp_vpt_cond (et);
+
+      /* NOTE(review): 14 presumably corresponds to Q7, the highest register
+        named by the MVE_BAD_QREG message -- confirm the numbering.  */
+      constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
+
+      if (et.type == NT_invtype)
+       return;
+
+      if (et.type == NT_float)
+       {
+         constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext),
+                     BAD_FPU);
+         constraint (et.size != 16 && et.size != 32, BAD_EL_TYPE);
+         /* Bit 28 is set for 16-bit elements; bits 20..21 are both set.  */
+         inst.instruction |= (et.size == 16) << 28;
+         inst.instruction |= 0x3 << 20;
+       }
+      else
+       {
+         constraint (et.size != 8 && et.size != 16 && et.size != 32,
+                     BAD_EL_TYPE);
+         inst.instruction |= 1 << 28;
+         inst.instruction |= neon_logbits (et.size) << 20;
+       }
+
+      /* Bit 1 of fcond goes to instruction bit 0 when the last operand is
+        a vector register and to bit 5 (alongside bit 6) otherwise.  */
+      if (inst.operands[2].isquad)
+       {
+         inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+         inst.instruction |= LOW4 (inst.operands[2].reg);
+         inst.instruction |= (fcond & 0x2) >> 1;
+       }
+      else
+       {
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         inst.instruction |= 1 << 6;
+         inst.instruction |= (fcond & 0x2) << 4;
+         inst.instruction |= inst.operands[2].reg;
+       }
+      /* Bit 2 of fcond goes to instruction bit 12, bit 0 to bit 7.  */
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+      inst.instruction |= (fcond & 0x4) << 10;
+      inst.instruction |= (fcond & 0x1) << 7;
+
+    }
+    set_pred_insn_type (VPT_INSN);
+    now_pred.cc = 0;
+    /* The 4-bit mask is instruction bit 22 followed by bits 15..13.  */
+    now_pred.mask = ((inst.instruction & 0x00400000) >> 19)
+                   | ((inst.instruction & 0xe000) >> 13);
+    now_pred.warn_deprecated = FALSE;
+    now_pred.type = VECTOR_PRED;
+    inst.is_neon = 1;
+}
+
+/* Encode the MVE vector compare.  Operand 0 holds the condition, operand 1
+   must be a vector register, and operand 2 is either a vector register or
+   a core register.  The instruction word is rebuilt from the fixed base
+   0xee010f00 rather than from the incoming opcode.  */
+static void
+do_mve_vcmp (void)
+{
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+  if (!inst.operands[1].isreg || !inst.operands[1].isquad)
+    first_error (_(reg_expected_msgs[REG_TYPE_MQ]));
+  if (!inst.operands[2].present)
+    first_error (_("MVE vector or ARM register expected"));
+  /* NOTE(review): 14 presumably corresponds to Q7, the highest register
+     named by the MVE_BAD_QREG message -- confirm the numbering.  */
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
+
+  /* Deal with 'else' conditional MVE's vcmp, it will be parsed as vcmpe.  */
+  if ((inst.instruction & 0xffffffff) == N_MNEM_vcmpe
+      && inst.operands[1].isquad)
+    {
+      inst.instruction = N_MNEM_vcmp;
+      inst.cond = 0x10;
+    }
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  enum neon_shape rs = neon_select_shape (NS_IQQ, NS_IQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_KEY | N_F_MVE | N_I_MVE | N_SU_32,
+                      N_EQK);
+
+  /* Register number 15 is only accepted here when it was written as ZR.  */
+  constraint (rs == NS_IQR && inst.operands[2].reg == REG_PC
+             && !inst.operands[2].iszr, BAD_PC);
+
+  unsigned fcond = mve_get_vcmp_vpt_cond (et);
+
+  /* Bit 2 of fcond goes to instruction bit 12, bit 0 to bit 7.  */
+  inst.instruction = 0xee010f00;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= (fcond & 0x4) << 10;
+  inst.instruction |= (fcond & 0x1) << 7;
+  if (et.type == NT_float)
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext),
+                 BAD_FPU);
+      inst.instruction |= (et.size == 16) << 28;
+      inst.instruction |= 0x3 << 20;
+    }
+  else
+    {
+      inst.instruction |= 1 << 28;
+      inst.instruction |= neon_logbits (et.size) << 20;
+    }
+  /* Bit 1 of fcond goes to instruction bit 0 when operand 2 is a vector
+     register and to bit 5 (alongside bit 6) otherwise.  */
+  if (inst.operands[2].isquad)
+    {
+      inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+      inst.instruction |= (fcond & 0x2) >> 1;
+      inst.instruction |= LOW4 (inst.operands[2].reg);
+    }
+  else
+    {
+      if (inst.operands[2].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+      inst.instruction |= 1 << 6;
+      inst.instruction |= (fcond & 0x2) << 4;
+      inst.instruction |= inst.operands[2].reg;
+    }
+
+  inst.is_neon = 1;
+  return;
+}
+
+/* Encode a two-vector-register MVE instruction keyed on a signed 8, 16 or
+   32-bit element type: the element size goes to bit 18, operand 0 to
+   bits 22 and 12, operand 1 to bits 5 and 0.  */
+static void
+do_mve_vmaxa_vmina (void)
+{
+  enum neon_shape rs;
+  struct neon_type_el et;
+  unsigned Qd, Qm;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  rs = neon_select_shape (NS_QQ, NS_NULL);
+  et = neon_check_type (2, rs, N_EQK, N_KEY | N_S8 | N_S16 | N_S32);
+
+  Qd = inst.operands[0].reg;
+  Qm = inst.operands[1].reg;
+
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (neon_logbits (et.size) << 18)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qm) << 5)
+                      | LOW4 (Qm));
+  inst.is_neon = 1;
+}
+
+/* Encode an MVE floating-point instruction of the vector, vector, core
+   register shape.  Bit 28 is set for 16-bit elements.  SP or PC as the
+   core register is reported through as_tsktsk and encoding continues.  */
+static void
+do_mve_vfmas (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_F_MVE | N_KEY, N_EQK, N_EQK);
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qn = inst.operands[1].reg;
+  unsigned Rm = inst.operands[2].reg;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  if (Rm == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+  else if (Rm == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+
+  inst.instruction |= (et.size == 16) << 28;
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (LOW4 (Qn) << 16)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qn) << 7)
+                      | Rm);
+  inst.is_neon = 1;
+}
+
+/* Encode the MVE VDDUP/VIDUP (vector, register, immediate) and
+   VDWDUP/VIWDUP (vector, register, register, immediate) instructions.
+   The immediate must be 1, 2, 4 or 8.  The forms without a second core
+   register use 7 in the Rm field; the others require an odd-numbered
+   register and store its number shifted right by one.  */
+static void
+do_mve_viddup (void)
+{
+  enum neon_shape rs;
+  struct neon_type_el et;
+  unsigned Rm;
+  unsigned imm;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  imm = inst.relocs[0].exp.X_add_number;
+  constraint (!(imm == 1 || imm == 2 || imm == 4 || imm == 8),
+             _("immediate must be either 1, 2, 4 or 8"));
+
+  if (inst.instruction != M_MNEM_vddup && inst.instruction != M_MNEM_vidup)
+    {
+      unsigned wrap_reg = inst.operands[2].reg;
+
+      constraint ((wrap_reg % 2) != 1, BAD_EVEN);
+      if (wrap_reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+      else if (wrap_reg == REG_PC)
+       first_error (BAD_PC);
+
+      rs = neon_select_shape (NS_QRRI, NS_NULL);
+      et = neon_check_type (3, rs, N_KEY | N_U8 | N_U16 | N_U32, N_EQK, N_EQK);
+      Rm = inst.operands[2].reg >> 1;
+    }
+  else
+    {
+      rs = neon_select_shape (NS_QRI, NS_NULL);
+      et = neon_check_type (2, rs, N_KEY | N_U8 | N_U16 | N_U32, N_EQK);
+      Rm = 7;
+    }
+
+  /* Register and size fields.  */
+  inst.instruction |= ((HI1 (inst.operands[0].reg) << 22)
+                      | (neon_logbits (et.size) << 20)
+                      | (inst.operands[1].reg << 16)
+                      | (LOW4 (inst.operands[0].reg) << 12)
+                      | (Rm << 1));
+  /* The immediate is spread over bit 7 and bit 0.  */
+  inst.instruction |= (imm > 2) << 7;
+  inst.instruction |= (imm == 2 || imm == 8);
+  inst.is_neon = 1;
+}
+
+/* Encode an MVE integer instruction of the vector, vector, core register
+   shape.  The element type is keyed on the last operand; unsigned types
+   set bit 28 and the element size goes to bit 20.  PC or SP as the core
+   register is reported through as_tsktsk and encoding continues.  */
+static void
+do_mve_vmlas (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qn = inst.operands[1].reg;
+  unsigned Rm = inst.operands[2].reg;
+
+  if (Rm == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+  else if (Rm == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (neon_logbits (et.size) << 20)
+                      | (LOW4 (Qn) << 16)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qn) << 7)
+                      | Rm);
+  inst.is_neon = 1;
+}
+
+/* Encode the MVE VSHLL instruction for 8 and 16-bit signed or unsigned
+   elements.  The shift amount must lie in [1, element size]; a shift by
+   exactly the element size selects a different encoding from the
+   general one.  */
+static void
+do_mve_vshll (void)
+{
+  struct neon_type_el et
+    = neon_check_type (2, NS_QQI, N_EQK, N_S8 | N_U8 | N_S16 | N_U16 | N_KEY);
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qm = inst.operands[1].reg;
+  int imm;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  imm = inst.operands[2].imm;
+  constraint (imm < 1 || (unsigned)imm > et.size,
+             _("immediate value out of range"));
+
+  if ((unsigned)imm != et.size)
+    {
+      /* General form: size plus shift amount at bit 16.  */
+      inst.instruction |= (et.size + imm) << 16;
+      inst.instruction |= 0x800140;
+    }
+  else
+    {
+      /* Shift by the full element size: only the size is encoded.  */
+      inst.instruction |= neon_logbits (et.size) << 18;
+      inst.instruction |= 0x110001;
+    }
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qm) << 5)
+                      | LOW4 (Qm));
+  inst.is_neon = 1;
+}
+
+/* Encode the MVE VSHLC instruction: a vector register, a core register
+   and an immediate in [1, 32] whose low five bits go to bit 16.  PC or SP
+   as the core register is reported through as_tsktsk before the
+   immediate is range-checked.  */
+static void
+do_mve_vshlc (void)
+{
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Rdm = inst.operands[1].reg;
+  int imm;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  if (Rdm == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+  else if (Rdm == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+
+  imm = inst.operands[2].imm;
+  constraint (imm < 1 || imm > 32, _("immediate value out of range"));
+
+  inst.instruction |= (imm & 0x1f) << 16;
+  inst.instruction |= (HI1 (Qd) << 22) | (LOW4 (Qd) << 12) | Rdm;
+  inst.is_neon = 1;
+}
+
+/* Encode the MVE narrowing right shifts.  The set of accepted element
+   types depends on the mnemonic; the shift amount must lie in
+   [1, element size / 2] and is stored at bit 16 as (size - amount).  */
+static void
+do_mve_vshrn (void)
+{
+  unsigned types;
+
+  switch (inst.instruction)
+    {
+    case M_MNEM_vqshrunt:
+    case M_MNEM_vqshrunb:
+    case M_MNEM_vqrshrunt:
+    case M_MNEM_vqrshrunb:
+      /* Signed input only.  */
+      types = N_S16 | N_S32;
+      break;
+
+    case M_MNEM_vqshrnt:
+    case M_MNEM_vqshrnb:
+    case M_MNEM_vqrshrnt:
+    case M_MNEM_vqrshrnb:
+      /* Signed or unsigned input.  */
+      types = N_U16 | N_U32 | N_S16 | N_S32;
+      break;
+
+    case M_MNEM_vshrnt:
+    case M_MNEM_vshrnb:
+    case M_MNEM_vrshrnt:
+    case M_MNEM_vrshrnb:
+      /* Signedness is irrelevant.  */
+      types = N_I16 | N_I32;
+      break;
+
+    default:
+      abort ();
+    }
+
+  struct neon_type_el et = neon_check_type (2, NS_QQI, N_EQK, types | N_KEY);
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qm = inst.operands[1].reg;
+  unsigned imm = inst.operands[2].imm;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  constraint (imm == 0 || imm > (et.size / 2),
+             et.size == 16
+             ? _("immediate operand expected in the range [1,8]")
+             : _("immediate operand expected in the range [1,16]"));
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | ((et.size - imm) << 16)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qm) << 5)
+                      | LOW4 (Qm));
+  inst.is_neon = 1;
+}
+
+/* Encode the MVE saturating narrowing moves.  VQMOVNT/VQMOVNB accept
+   signed and unsigned 16 and 32-bit elements; the remaining mnemonics
+   routed here accept signed ones only.  Bit 28 is set for unsigned types
+   and bit 18 for 32-bit elements.  */
+static void
+do_mve_vqmovn (void)
+{
+  struct neon_type_el et;
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qm = inst.operands[1].reg;
+
+  if (inst.instruction != M_MNEM_vqmovnt
+      && inst.instruction != M_MNEM_vqmovnb)
+    et = neon_check_type (2, NS_QQ, N_EQK, N_S16 | N_S32 | N_KEY);
+  else
+    et = neon_check_type (2, NS_QQ, N_EQK,
+                         N_U16 | N_U32 | N_S16 | N_S32 | N_KEY);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= (et.size == 32) << 18;
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qm) << 5)
+                      | LOW4 (Qm));
+  inst.is_neon = 1;
+}
+
+/* Encode the three-vector-register MVE VPSEL instruction.  No element
+   type is checked; only the operand shape is selected.  */
+static void
+do_mve_vpsel (void)
+{
+  unsigned Qd, Qn, Qm;
+
+  neon_select_shape (NS_QQQ, NS_NULL);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  Qd = inst.operands[0].reg;
+  Qn = inst.operands[1].reg;
+  Qm = inst.operands[2].reg;
+
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (LOW4 (Qn) << 16)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qn) << 7)
+                      | (HI1 (Qm) << 5)
+                      | LOW4 (Qm));
+  inst.is_neon = 1;
+}
+
+/* VPNOT has no operands to encode; only record whether the instruction
+   carries a condition above COND_ALWAYS, i.e. sits inside a VPT block.  */
+static void
+do_mve_vpnot (void)
+{
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+}
+
+/* Encode a two-vector-register MVE floating-point instruction.  Bit 28
+   is set for 16-bit elements; operand 0 goes to bits 22 and 12, operand 1
+   to bits 5 and 0.  */
+static void
+do_mve_vmaxnma_vminnma (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK, N_F_MVE | N_KEY);
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qm = inst.operands[1].reg;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  inst.instruction |= (et.size == 16) << 28;
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qm) << 5)
+                      | LOW4 (Qm));
+  inst.is_neon = 1;
+}
+
+/* Encode the MVE VCMUL instruction: three vector registers plus a
+   rotation of 0, 90, 180 or 270.  The rotation is split over bit 12
+   (set for 180 and 270) and bit 0 (set for 90 and 270).  With 32-bit
+   elements a destination equal to either source is reported through
+   as_tsktsk and encoding continues.  */
+static void
+do_mve_vcmul (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQI, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_F_MVE | N_KEY);
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qn = inst.operands[1].reg;
+  unsigned Qm = inst.operands[2].reg;
+  unsigned rot;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN
+                        : MVE_OUTSIDE_PRED_INSN);
+
+  rot = inst.relocs[0].exp.X_add_number;
+  constraint (!(rot == 0 || rot == 90 || rot == 180 || rot == 270),
+             _("immediate out of range"));
+
+  if (et.size == 32 && (Qd == Qn || Qd == Qm))
+    as_tsktsk (BAD_MVE_SRCDEST);
+
+  inst.instruction |= (et.size == 32) << 28;
+  inst.instruction |= ((HI1 (Qd) << 22)
+                      | (LOW4 (Qn) << 16)
+                      | (LOW4 (Qd) << 12)
+                      | (HI1 (Qn) << 7)
+                      | (HI1 (Qm) << 5)
+                      | LOW4 (Qm));
+  inst.instruction |= (rot > 90) << 12;
+  inst.instruction |= (rot == 90 || rot == 270);
+  inst.is_neon = 1;
+}
+
+/* To handle the Low Overhead Loop instructions
+   in Armv8.1-M Mainline and MVE.
+   Covers le, letp, wls, wlstp, dls, dlstp and lctp; any other mnemonic
+   reaching the switch aborts.  */
+static void
+do_t_loloop (void)
+{
+  unsigned long insn = inst.instruction;
+
+  inst.instruction = THUMB_OP32 (inst.instruction);
+
+  /* Nothing further is encoded for lctp.  */
+  if (insn == T_MNEM_lctp)
+    return;
+
+  set_pred_insn_type (MVE_OUTSIDE_PRED_INSN);
+
+  /* The tail-predicated loop starts carry an element size, placed at
+     bit 20.  */
+  if (insn == T_MNEM_wlstp || insn == T_MNEM_dlstp)
+    {
+      struct neon_type_el et
+       = neon_check_type (2, NS_RR, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+      inst.instruction |= neon_logbits (et.size) << 20;
+      inst.is_neon = 1;
+    }
+
+  switch (insn)
+    {
+    case T_MNEM_letp:
+      constraint (!inst.operands[0].present,
+                 _("expected LR"));
+      /* fall through.  */
+    case T_MNEM_le:
+      /* le <label>.  */
+      if (!inst.operands[0].present)
+       inst.instruction |= 1 << 21;
+
+      v8_1_loop_reloc (TRUE);
+      break;
+
+    case T_MNEM_wls:
+    case T_MNEM_wlstp:
+      v8_1_loop_reloc (FALSE);
+      /* fall through.  */
+    case T_MNEM_dlstp:
+    case T_MNEM_dls:
+      constraint (inst.operands[1].isreg != 1, BAD_ARGS);
+
+      /* PC as operand 1 is an error for the tail-predicated forms and is
+        only reported through as_tsktsk for wls/dls; SP is reported
+        through as_tsktsk in every case.  */
+      if (insn == T_MNEM_wlstp || insn == T_MNEM_dlstp)
+       constraint (inst.operands[1].reg == REG_PC, BAD_PC);
+      else if (inst.operands[1].reg == REG_PC)
+       as_tsktsk (MVE_BAD_PC);
+      if (inst.operands[1].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+
+      inst.instruction |= (inst.operands[1].reg << 16);
+      break;
+
+    default:
+      abort ();
+    }
+}
+
+
+static void
+do_vfp_nsyn_cmp (void)
+{
+  enum neon_shape rs;
+  if (!inst.operands[0].isreg)
+    {
+      do_mve_vcmp ();
+      return;
+    }
+  else
+    {
+      constraint (inst.operands[2].present, BAD_SYNTAX);
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd),
+                 BAD_FPU);
+    }
+
+  if (inst.operands[1].isreg)
+    {
+      rs = neon_select_shape (NS_HH, NS_FF, NS_DD, NS_NULL);
+      neon_check_type (2, rs, N_EQK | N_VFP, N_F_ALL | N_KEY | N_VFP);
+
+      if (rs == NS_FF || rs == NS_HH)
+       {
+         NEON_ENCODE (SINGLE, inst);
+         do_vfp_sp_monadic ();
+       }
+      else
         {
           NEON_ENCODE (DOUBLE, inst);
           do_vfp_dp_rd_rm ();
@@ -15284,20 +16343,8 @@ neon_dp_fixup (struct arm_it* insn)
    insn->instruction = i;
  }
  
-/* Turn a size (8, 16, 32, 64) into the respective bit number minus 3
-   (0, 1, 2, 3).  */
-
-static unsigned
-neon_logbits (unsigned x)
-{
-  return ffs (x) - 4;
-}
-
-#define LOW4(R) ((R) & 0xf)
-#define HI1(R) (((R) >> 4) & 1)
-
  static void
-mve_encode_qqr (int size, int fp)
+mve_encode_qqr (int size, int U, int fp)
  {
    if (inst.operands[2].reg == REG_SP)
      as_tsktsk (MVE_BAD_SP);
@@ -15312,6 +16359,9 @@ mve_encode_qqr (int size, int fp)
        /* vsub.  */
        else if (((unsigned)inst.instruction) == 0x200d00)
         inst.instruction = 0xee301f40;
+      /* vmul.  */
+      else if (((unsigned)inst.instruction) == 0x1000d10)
+       inst.instruction = 0xee310e60;
  
        /* Setting size which is 1 for F16 and 0 for F32.  */
        inst.instruction |= (size == 16) << 28;
@@ -15324,6 +16374,37 @@ mve_encode_qqr (int size, int fp)
        /* vsub.  */
        else if (((unsigned)inst.instruction) == 0x1000800)
         inst.instruction = 0xee011f40;
+      /* vhadd.  */
+      else if (((unsigned)inst.instruction) == 0)
+       inst.instruction = 0xee000f40;
+      /* vhsub.  */
+      else if (((unsigned)inst.instruction) == 0x200)
+       inst.instruction = 0xee001f40;
+      /* vmla.  */
+      else if (((unsigned)inst.instruction) == 0x900)
+       inst.instruction = 0xee010e40;
+      /* vmul.  */
+      else if (((unsigned)inst.instruction) == 0x910)
+       inst.instruction = 0xee011e60;
+      /* vqadd.  */
+      else if (((unsigned)inst.instruction) == 0x10)
+       inst.instruction = 0xee000f60;
+      /* vqsub.  */
+      else if (((unsigned)inst.instruction) == 0x210)
+       inst.instruction = 0xee001f60;
+      /* vqrdmlah.  */
+      else if (((unsigned)inst.instruction) == 0x3000b10)
+       inst.instruction = 0xee000e40;
+      /* vqdmulh.  */
+      else if (((unsigned)inst.instruction) == 0x0000b00)
+       inst.instruction = 0xee010e60;
+      /* vqrdmulh.  */
+      else if (((unsigned)inst.instruction) == 0x1000b00)
+       inst.instruction = 0xfe010e60;
+
+      /* Set U-bit.  */
+      inst.instruction |= U << 28;
+
        /* Setting bits for size.  */
        inst.instruction |= neon_logbits (size) << 20;
      }
@@ -15364,6 +16445,30 @@ mve_encode_qqq (int ubit, int size)
    inst.is_neon = 1;
  }
  
+static void
+mve_encode_rq (unsigned bit28, unsigned size)
+{ /* ORs the fields below into inst.instruction and flags the insn as Neon.  */
+  inst.instruction |= bit28 << 28; /* Caller-supplied value for bit 28.  */
+  inst.instruction |= neon_logbits (size) << 18; /* Size code, from bit 18.  */
+  inst.instruction |= inst.operands[0].reg << 12; /* Operand 0, from bit 12.  */
+  inst.instruction |= LOW4 (inst.operands[1].reg); /* Low 4 bits of operand 1.  */
+  inst.is_neon = 1;
+}
+/* OR U (bit 28), the size == 32 flag (bit 16) and operands 0-3 into inst.  */
+static void
+mve_encode_rrqq (unsigned U, unsigned size)
+{
+  constraint (inst.operands[3].reg > 14, MVE_BAD_QREG); /* "Q[0..7] expected".  */
+
+  inst.instruction |= U << 28;
+  inst.instruction |= (inst.operands[1].reg >> 1) << 20; /* Reg number / 2.  */
+  inst.instruction |= LOW4 (inst.operands[2].reg) << 16;
+  inst.instruction |= (size == 32) << 16; /* Shares bit 16 with the line above.  */
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 7;
+  inst.instruction |= inst.operands[3].reg;
+  inst.is_neon = 1;
+}
  
  /* Encode insns with bit pattern:
  
@@ -15413,24 +16518,141 @@ neon_two_same (int qbit, int ubit, int size)
    neon_dp_fixup (&inst);
  }
  
+enum vfp_or_neon_is_neon_bits /* Flag bits for vfp_or_neon_is_neon's CHECK.  */
+{
+NEON_CHECK_CC = 1, /* Check/fix up the condition field (skipped in Thumb).  */
+NEON_CHECK_ARCH = 2, /* Require fpu_neon_ext_v1.  */
+NEON_CHECK_ARCH8 = 4 /* Require fpu_neon_ext_armv8.  */
+};
+
+/* Call this function if an instruction which may have belonged to the VFP or
+ Neon instruction sets, but turned out to be a Neon instruction (due to the
+ operand types involved, etc.). We have to check and/or fix-up a couple of
+ things:
+
+   - Make sure the user hasn't attempted to make a Neon instruction
+     conditional.
+   - Alter the value in the condition code field if necessary.
+   - Make sure that the arch supports Neon instructions.
+
+ Which of these operations take place depends on bits from enum
+ vfp_or_neon_is_neon_bits.
+
+ WARNING: This function has side effects! If NEON_CHECK_CC is used and the
+ current instruction's condition is COND_ALWAYS, the condition field is
+ changed to inst.uncond_value.  This is necessary because instructions shared
+ between VFP and Neon may be conditional for the VFP variants only, and the
+ unconditional Neon version must have, e.g., 0xF in the condition field.  */
+/* Returns SUCCESS, or FAIL after recording an error with first_error.  */
+static int
+vfp_or_neon_is_neon (unsigned check)
+{
+/* Conditions are always legal in Thumb mode (IT blocks).  */
+if (!thumb_mode && (check & NEON_CHECK_CC))
+  {
+    if (inst.cond != COND_ALWAYS)
+      {
+       first_error (_(BAD_COND));
+       return FAIL;
+      }
+    if (inst.uncond_value != -1) /* -1 leaves bits 28 and up untouched.  */
+      inst.instruction |= inst.uncond_value << 28;
+  }
+
+/* mark_feature_used is only reached for the check bits that were requested.  */
+  if (((check & NEON_CHECK_ARCH) && !mark_feature_used (&fpu_neon_ext_v1))
+      || ((check & NEON_CHECK_ARCH8)
+         && !mark_feature_used (&fpu_neon_ext_armv8)))
+    {
+      first_error (_(BAD_FPU));
+      return FAIL;
+    }
+
+return SUCCESS;
+}
+
+
+/* Return TRUE if the SIMD instruction is available for the current
+   cpu_variant.  FP is set to TRUE if this is a SIMD floating-point
+   instruction.  CHECK contains the set of bits to pass to
+   vfp_or_neon_is_neon for the NEON specific checks.  */
+/* Side effects: may set inst.pred_insn_type, and inst.error on failure.  */
+static bfd_boolean
+check_simd_pred_availability (int fp, unsigned check)
+{
+if (inst.cond > COND_ALWAYS) /* Treated as being inside a VPT block.  */
+  {
+    if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      {
+       inst.error = BAD_FPU;
+       return FALSE;
+      }
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  }
+else if (inst.cond < COND_ALWAYS)
+  {
+    if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+    else if (vfp_or_neon_is_neon (check) == FAIL)
+      return FALSE;
+  }
+else /* inst.cond == COND_ALWAYS.  */
+  {
+    if (!ARM_CPU_HAS_FEATURE (cpu_variant, fp ? mve_fp_ext : mve_ext)
+       && vfp_or_neon_is_neon (check) == FAIL)
+      return FALSE;
+
+    if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  }
+return TRUE;
+}
+
  /* Neon instruction encoders, in approximate order of appearance.  */
  
  static void
  do_neon_dyadic_i_su (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_32 | N_KEY);
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+
+  et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_32 | N_KEY);
+
+
+  if (rs != NS_QQR)
+    neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  else
+    mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
  }
  
  static void
  do_neon_dyadic_i64_su (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_ALL | N_KEY);
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_ALL | N_KEY);
+    }
+  if (rs == NS_QQR)
+    mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
+  else
+    neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
  }
  
  static void
@@ -15453,12 +16675,25 @@ neon_imm_shift (int write_ubit, int uval, int isquad, struct neon_type_el et,
  }
  
  static void
-do_neon_shl_imm (void)
+do_neon_shl (void)
  {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
    if (!inst.operands[2].isreg)
      {
-      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-      struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_ALL);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_MVE);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_ALL);
+       }
        int imm = inst.operands[2].imm;
  
        constraint (imm < 0 || (unsigned)imm >= et.size,
@@ -15468,33 +16703,77 @@ do_neon_shl_imm (void)
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
-      unsigned int tmp;
-
-      /* VSHL/VQSHL 3-register variants have syntax such as:
-          vshl.xx Dd, Dm, Dn
-        whereas other 3-register operations encoded by neon_three_same have
-        syntax like:
-          vadd.xx Dd, Dn, Dm
-        (i.e. with Dn & Dm reversed). Swap operands[1].reg and operands[2].reg
-        here.  */
-      tmp = inst.operands[2].reg;
-      inst.operands[2].reg = inst.operands[1].reg;
-      inst.operands[1].reg = tmp;
-      NEON_ENCODE (INTEGER, inst);
-      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_MVE | N_KEY, N_EQK | N_EQK);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
+       }
+
+
+      if (rs == NS_QQR)
+       {
+         constraint (inst.operands[0].reg != inst.operands[1].reg,
+                      _("invalid instruction shape"));
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[2].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+
+         inst.instruction = 0xee311e60;
+         inst.instruction |= (et.type == NT_unsigned) << 28;
+         inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+         inst.instruction |= neon_logbits (et.size) << 18;
+         inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+         inst.instruction |= inst.operands[2].reg;
+         inst.is_neon = 1;
+       }
+      else
+       {
+         unsigned int tmp;
+
+         /* VSHL/VQSHL 3-register variants have syntax such as:
+              vshl.xx Dd, Dm, Dn
+            whereas other 3-register operations encoded by neon_three_same have
+            syntax like:
+              vadd.xx Dd, Dn, Dm
+            (i.e. with Dn & Dm reversed). Swap operands[1].reg and
+            operands[2].reg here.  */
+         tmp = inst.operands[2].reg;
+         inst.operands[2].reg = inst.operands[1].reg;
+         inst.operands[1].reg = tmp;
+         NEON_ENCODE (INTEGER, inst);
+         neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+       }
      }
  }
  
  static void
-do_neon_qshl_imm (void)
+do_neon_qshl (void)
  {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
    if (!inst.operands[2].isreg)
      {
-      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-      struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_KEY | N_SU_MVE);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+       }
        int imm = inst.operands[2].imm;
  
        constraint (imm < 0 || (unsigned)imm >= et.size,
@@ -15504,32 +16783,103 @@ do_neon_qshl_imm (void)
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
-      unsigned int tmp;
+      enum neon_shape rs;
+      struct neon_type_el et;
  
-      /* See note in do_neon_shl_imm.  */
-      tmp = inst.operands[2].reg;
-      inst.operands[2].reg = inst.operands[1].reg;
-      inst.operands[1].reg = tmp;
-      NEON_ENCODE (INTEGER, inst);
-      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_MVE | N_KEY, N_EQK | N_EQK);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
+       }
+
+      if (rs == NS_QQR)
+       {
+         constraint (inst.operands[0].reg != inst.operands[1].reg,
+                      _("invalid instruction shape"));
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[2].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+
+         inst.instruction = 0xee311ee0;
+         inst.instruction |= (et.type == NT_unsigned) << 28;
+         inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+         inst.instruction |= neon_logbits (et.size) << 18;
+         inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+         inst.instruction |= inst.operands[2].reg;
+         inst.is_neon = 1;
+       }
+      else
+       {
+         unsigned int tmp;
+
+         /* See note in do_neon_shl.  */
+         tmp = inst.operands[2].reg;
+         inst.operands[2].reg = inst.operands[1].reg;
+         inst.operands[1].reg = tmp;
+         NEON_ENCODE (INTEGER, inst);
+         neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+       }
      }
  }
  
  static void
  do_neon_rshl (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_ALL | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_ALL | N_KEY);
+    }
+
    unsigned int tmp;
  
-  tmp = inst.operands[2].reg;
-  inst.operands[2].reg = inst.operands[1].reg;
-  inst.operands[1].reg = tmp;
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  if (rs == NS_QQR)
+    {
+      if (inst.operands[2].reg == REG_PC)
+       as_tsktsk (MVE_BAD_PC);
+      else if (inst.operands[2].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+
+      constraint (inst.operands[0].reg != inst.operands[1].reg,
+                 _("invalid instruction shape"));
+
+      if (inst.instruction == 0x0000510)
+       /* We are dealing with vqrshl.  */
+       inst.instruction = 0xee331ee0;
+      else
+       /* We are dealing with vrshl.  */
+       inst.instruction = 0xee331e60;
+
+      inst.instruction |= (et.type == NT_unsigned) << 28;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= neon_logbits (et.size) << 18;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= inst.operands[2].reg;
+      inst.is_neon = 1;
+    }
+  else
+    {
+      tmp = inst.operands[2].reg;
+      inst.operands[2].reg = inst.operands[1].reg;
+      inst.operands[1].reg = tmp;
+      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+    }
  }
  
  static int
@@ -15594,6 +16944,14 @@ do_neon_logic (void)
    if (inst.operands[2].present && inst.operands[2].isreg)
      {
        enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      if (rs == NS_QQQ
+         && !check_simd_pred_availability (FALSE,
+                                           NEON_CHECK_ARCH | NEON_CHECK_CC))
+       return;
+      else if (rs != NS_QQQ
+              && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1))
+       first_error (BAD_FPU);
+
        neon_check_type (3, rs, N_IGNORE_TYPE);
        /* U bit and size field were set as part of the bitmask.  */
        NEON_ENCODE (INTEGER, inst);
@@ -15607,14 +16965,29 @@ do_neon_logic (void)
        enum neon_shape rs = (three_ops_form
                             ? neon_select_shape (NS_DDI, NS_QQI, NS_NULL)
                             : neon_select_shape (NS_DI, NS_QI, NS_NULL));
-      struct neon_type_el et = neon_check_type (2, rs,
-       N_I8 | N_I16 | N_I32 | N_I64 | N_F32 | N_KEY, N_EQK);
+      /* Because neon_select_shape makes the second operand a copy of the first
+        if the second operand is not present.  */
+      if (rs == NS_QQI
+         && !check_simd_pred_availability (FALSE,
+                                           NEON_CHECK_ARCH | NEON_CHECK_CC))
+       return;
+      else if (rs != NS_QQI
+              && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1))
+       first_error (BAD_FPU);
+
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       et = neon_check_type (2, rs, N_I32 | N_I16 | N_KEY, N_EQK);
+      else
+       et = neon_check_type (2, rs, N_I8 | N_I16 | N_I32 | N_I64 | N_F32
+                             | N_KEY, N_EQK);
+
+      if (et.type == NT_invtype)
+       return;
        enum neon_opc opcode = (enum neon_opc) inst.instruction & 0x0fffffff;
        unsigned immbits;
        int cmode;
  
-      if (et.type == NT_invtype)
-       return;
  
        if (three_ops_form)
         constraint (inst.operands[0].reg != inst.operands[1].reg,
@@ -15693,7 +17066,7 @@ neon_dyadic_misc (enum neon_el_type ubit_meaning, unsigned types,
      {
        NEON_ENCODE (FLOAT, inst);
        if (rs == NS_QQR)
-       mve_encode_qqr (et.size, 1);
+       mve_encode_qqr (et.size, 0, 1);
        else
         neon_three_same (neon_quad (rs), 0, et.size == 16 ? (int) et.size : -1);
      }
@@ -15701,7 +17074,7 @@ neon_dyadic_misc (enum neon_el_type ubit_meaning, unsigned types,
      {
        NEON_ENCODE (INTEGER, inst);
        if (rs == NS_QQR)
-       mve_encode_qqr (et.size, 0);
+       mve_encode_qqr (et.size, et.type == ubit_meaning, 0);
        else
         neon_three_same (neon_quad (rs), et.type == ubit_meaning, et.size);
      }
@@ -15709,103 +17082,19 @@ neon_dyadic_misc (enum neon_el_type ubit_meaning, unsigned types,
  
  
  static void
-do_neon_dyadic_if_su_d (void)
-{
-  /* This version only allow D registers, but that constraint is enforced during
-     operand parsing so we don't need to do anything extra here.  */
-  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
-}
-
-static void
-do_neon_dyadic_if_i_d (void)
-{
-  /* The "untyped" case can't happen. Do this to stop the "U" bit being
-     affected if we specify unsigned args.  */
-  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
-}
-
-enum vfp_or_neon_is_neon_bits
-{
-  NEON_CHECK_CC = 1,
-  NEON_CHECK_ARCH = 2,
-  NEON_CHECK_ARCH8 = 4
-};
-
-/* Call this function if an instruction which may have belonged to the VFP or
-   Neon instruction sets, but turned out to be a Neon instruction (due to the
-   operand types involved, etc.). We have to check and/or fix-up a couple of
-   things:
-
-     - Make sure the user hasn't attempted to make a Neon instruction
-       conditional.
-     - Alter the value in the condition code field if necessary.
-     - Make sure that the arch supports Neon instructions.
-
-   Which of these operations take place depends on bits from enum
-   vfp_or_neon_is_neon_bits.
-
-   WARNING: This function has side effects! If NEON_CHECK_CC is used and the
-   current instruction's condition is COND_ALWAYS, the condition field is
-   changed to inst.uncond_value. This is necessary because instructions shared
-   between VFP and Neon may be conditional for the VFP variants only, and the
-   unconditional Neon version must have, e.g., 0xF in the condition field.  */
-
-static int
-vfp_or_neon_is_neon (unsigned check)
-{
-  /* Conditions are always legal in Thumb mode (IT blocks).  */
-  if (!thumb_mode && (check & NEON_CHECK_CC))
-    {
-      if (inst.cond != COND_ALWAYS)
-       {
-         first_error (_(BAD_COND));
-         return FAIL;
-       }
-      if (inst.uncond_value != -1)
-       inst.instruction |= inst.uncond_value << 28;
-    }
-
-
-    if (((check & NEON_CHECK_ARCH) && !mark_feature_used (&fpu_neon_ext_v1))
-       || ((check & NEON_CHECK_ARCH8)
-           && !mark_feature_used (&fpu_neon_ext_armv8)))
-      {
-       first_error (_(BAD_FPU));
-       return FAIL;
-      }
-
-  return SUCCESS;
-}
-
-static int
-check_simd_pred_availability (int fp, unsigned check)
-{
-  if (inst.cond > COND_ALWAYS)
-    {
-      if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-       {
-         inst.error = BAD_FPU;
-         return 1;
-       }
-      inst.pred_insn_type = INSIDE_VPT_INSN;
-    }
-  else if (inst.cond < COND_ALWAYS)
-    {
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-       inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
-      else if (vfp_or_neon_is_neon (check) == FAIL)
-       return 2;
-    }
-  else
-    {
-      if (!ARM_CPU_HAS_FEATURE (cpu_variant, fp ? mve_fp_ext : mve_ext)
-         && vfp_or_neon_is_neon (check) == FAIL)
-       return 3;
+do_neon_dyadic_if_su_d (void)
+{
+  /* This version only allows D registers, but that constraint is enforced
+     during operand parsing so we don't need to do anything extra here.  */
+  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
+}
  
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-       inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
-    }
-  return 0;
+static void
+do_neon_dyadic_if_i_d (void)
+{
+  /* The "untyped" case can't happen. Do this to stop the "U" bit being
+     affected if we specify unsigned args.  */
+  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
  }
  
  static void
@@ -16087,6 +17376,30 @@ do_mve_vst_vld (void)
    inst.is_neon = 1;
  }
  
+static void
+do_mve_vaddlv (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_RRQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S32 | N_U32 | N_KEY);
+  /* A type error is recorded below, but encoding still carries on.  */
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
+  /* A condition above COND_ALWAYS marks the insn as inside a VPT block.  */
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  /* NOTE(review): NS_RRQ suggests operand 1 is a GPR, not a Q reg - confirm.  */
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
+  /* Bit 28 is set for unsigned element types.  */
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= inst.operands[1].reg << 19;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= inst.operands[2].reg;
+  inst.is_neon = 1;
+}
+
  static void
  do_neon_dyadic_if_su (void)
  {
@@ -16094,8 +17407,13 @@ do_neon_dyadic_if_su (void)
    struct neon_type_el et = neon_check_type (3, rs, N_EQK , N_EQK,
                                             N_SUF_32 | N_KEY);
  
-  if (check_simd_pred_availability (et.type == NT_float,
-                                   NEON_CHECK_ARCH | NEON_CHECK_CC))
+  constraint ((inst.instruction == ((unsigned) N_MNEM_vmax)
+              || inst.instruction == ((unsigned) N_MNEM_vmin))
+             && et.type == NT_float
+             && !ARM_CPU_HAS_FEATURE (cpu_variant,fpu_neon_ext_v1), BAD_FPU);
+
+  if (!check_simd_pred_availability (et.type == NT_float,
+                                    NEON_CHECK_ARCH | NEON_CHECK_CC))
      return;
  
    neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
@@ -16119,8 +17437,8 @@ do_neon_addsub_if_i (void)
       they are predicated or not.  */
    if ((rs == NS_QQQ || rs == NS_QQR) && et.size != 64)
      {
-      if (check_simd_pred_availability (et.type == NT_float,
-                                       NEON_CHECK_ARCH | NEON_CHECK_CC))
+      if (!check_simd_pred_availability (et.type == NT_float,
+                                        NEON_CHECK_ARCH | NEON_CHECK_CC))
         return;
      }
    else
@@ -16281,19 +17599,30 @@ do_neon_mac_maybe_scalar (void)
    if (try_vfp_nsyn (3, do_vfp_nsyn_mla_mls) == SUCCESS)
      return;
  
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
      return;
  
    if (inst.operands[2].isscalar)
      {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
        enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
        struct neon_type_el et = neon_check_type (3, rs,
         N_EQK, N_EQK, N_I16 | N_I32 | N_F_16_32 | N_KEY);
        NEON_ENCODE (SCALAR, inst);
        neon_mul_mac (et, neon_quad (rs));
      }
+  else if (!inst.operands[2].isvec)
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+
+      enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+
+      neon_dyadic_misc (NT_unsigned, N_SU_MVE, 0);
+    }
    else
      {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
        /* The "untyped" case can't happen.  Do this to stop the "U" bit being
          affected if we specify unsigned args.  */
        neon_dyadic_misc (NT_untyped, N_IF_32, 0);
@@ -16303,12 +17632,42 @@ do_neon_mac_maybe_scalar (void)
  static void
  do_neon_fmac (void)
  {
-  if (try_vfp_nsyn (3, do_vfp_nsyn_fma_fms) == SUCCESS)
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_fma)
+      && try_vfp_nsyn (3, do_vfp_nsyn_fma_fms) == SUCCESS)
      return;
  
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+  if (!check_simd_pred_availability (TRUE, NEON_CHECK_CC | NEON_CHECK_ARCH))
      return;
  
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+    {
+      enum neon_shape rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+      struct neon_type_el et = neon_check_type (3, rs, N_F_MVE | N_KEY, N_EQK,
+                                               N_EQK);
+
+      if (rs == NS_QQR)
+       {
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[2].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+
+         inst.instruction = 0xee310e40;
+         inst.instruction |= (et.size == 16) << 28;
+         inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+         inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+         inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+         inst.instruction |= HI1 (inst.operands[1].reg) << 6;
+         inst.instruction |= inst.operands[2].reg;
+         inst.is_neon = 1;
+         return;
+       }
+    }
+  else
+    {
+      constraint (!inst.operands[2].isvec, BAD_FPU);
+    }
+
    neon_dyadic_misc (NT_untyped, N_IF_32, 0);
  }
  
@@ -16331,20 +17690,45 @@ do_neon_mul (void)
    if (try_vfp_nsyn (3, do_vfp_nsyn_mul) == SUCCESS)
      return;
  
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
      return;
  
    if (inst.operands[2].isscalar)
-    do_neon_mac_maybe_scalar ();
+    {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      do_neon_mac_maybe_scalar ();
+    }
    else
-    neon_dyadic_misc (NT_poly, N_I8 | N_I16 | N_I32 | N_F16 | N_F32 | N_P8, 0);
+    {
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         enum neon_shape rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+         struct neon_type_el et
+           = neon_check_type (3, rs, N_EQK, N_EQK, N_I_MVE | N_F_MVE | N_KEY);
+         if (et.type == NT_float)
+           constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext),
+                       BAD_FPU);
+
+         neon_dyadic_misc (NT_float, N_I_MVE | N_F_MVE, 0);
+       }
+      else
+       {
+         constraint (!inst.operands[2].isvec, BAD_FPU);
+         neon_dyadic_misc (NT_poly,
+                           N_I8 | N_I16 | N_I32 | N_F16 | N_F32 | N_P8, 0);
+       }
+    }
  }
  
  static void
  do_neon_qdmulh (void)
  {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
    if (inst.operands[2].isscalar)
      {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
        enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
        struct neon_type_el et = neon_check_type (3, rs,
         N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
@@ -16353,15 +17737,195 @@ do_neon_qdmulh (void)
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs,
+           N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs,
+           N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+       }
+
        NEON_ENCODE (INTEGER, inst);
-      /* The U bit (rounding) comes from bit mask.  */
-      neon_three_same (neon_quad (rs), 0, et.size);
+      if (rs == NS_QQR)
+       mve_encode_qqr (et.size, 0, 0);
+      else
+       /* The U bit (rounding) comes from bit mask.  */
+       neon_three_same (neon_quad (rs), 0, et.size);
+    }
+}
+/* Encoder for the NS_RQ shape with N_SU_32 key types (presumably VADDV).  */
+static void
+do_mve_vaddv (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_RQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK,  N_SU_32 | N_KEY);
+  /* A type error is recorded below, but encoding still carries on.  */
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
+  /* A condition above COND_ALWAYS marks the insn as inside a VPT block.  */
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
+  /* Bit 28 is set for unsigned element types.  */
+  mve_encode_rq (et.type == NT_unsigned, et.size);
+}
+/* Encoder for NS_QQQI with N_S8/N_S16/N_S32; rotation must be 90 or 270.  */
+static void
+do_mve_vhcadd (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQI, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  /* The rotation immediate is read from the first relocation expression.  */
+  unsigned rot = inst.relocs[0].exp.X_add_number;
+  constraint (rot != 90 && rot != 270, _("immediate out of range"));
+
+  if (et.size == 32 && inst.operands[0].reg == inst.operands[2].reg)
+    as_tsktsk (_("Warning: 32-bit element size and same first and third "
+                "operand makes instruction UNPREDICTABLE"));
+  /* Bit 12 is set when the rotation is 270.  */
+  mve_encode_qqq (0, et.size);
+  inst.instruction |= (rot == 270) << 12;
+  inst.is_neon = 1;
+}
+/* Encoder accepting NS_QQQ or NS_QQR with N_S16/N_S32 key types.  */
+static void
+do_mve_vqdmull (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+  /* 32-bit elements with operand 0 equal to a source are only diagnosed.  */
+  if (et.size == 32
+      && (inst.operands[0].reg == inst.operands[1].reg
+         || (rs == NS_QQQ && inst.operands[0].reg == inst.operands[2].reg)))
+    as_tsktsk (BAD_MVE_SRCDEST);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  /* Both forms pass size 64 and use et.size == 32 as the ubit/U argument.  */
+  if (rs == NS_QQQ)
+    {
+      mve_encode_qqq (et.size == 32, 64);
+      inst.instruction |= 1;
+    }
+  else
+    {
+      mve_encode_qqr (64, et.size == 32, 0);
+      inst.instruction |= 0x3 << 5;
      }
  }
  
+static void
+do_mve_vadc (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_KEY | N_I32, N_EQK, N_EQK);
+  /* A type error is recorded below, but encoding still carries on.  */
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  /* Always encoded with ubit 0 and size 64, whatever ET holds.  */
+  mve_encode_qqq (0, 64);
+}
+/* Encoder for NS_QQR with N_8/N_16/N_32 key types; U and fp are passed as 0.  */
+static void
+do_mve_vbrsr (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqr (et.size, 0, 0);
+}
+/* Encoder for NS_QQQ with N_I32 key type; always passes ubit 1, size 64.  */
+static void
+do_mve_vsbc (void)
+{
+  neon_check_type (3, NS_QQQ, N_EQK, N_EQK, N_I32 | N_KEY);
+  /* The type-check result is not inspected here.  */
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqq (1, 64);
+}
+/* Encoder for NS_QQQ with N_SU_MVE key types; ubit is set for unsigned.  */
+static void
+do_mve_vmulh (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqq (et.type == NT_unsigned, et.size);
+}
+/* Encoder for NS_QQR with N_SU_MVE key types; U is set for unsigned.  */
+static void
+do_mve_vqdmlah (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
+}
+/* Encoder for NS_QQQ with N_S8/N_S16/N_S32 key types; ubit is always 0.  */
+static void
+do_mve_vqdmladh (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqq (0, et.size);
+}
+
+
  static void
  do_mve_vmull (void)
  {
@@ -16477,34 +18041,159 @@ do_mve_vmladav (void)
  }
  
  static void
-do_neon_qrdmlah (void)
+do_mve_vmlaldav (void)
+{
+  /* Shape NS_RRQQ with a signed or unsigned 16- or 32-bit key type.  The
+     four vmlsldav mnemonics (plain, a, x and ax) additionally reject an
+     unsigned type with BAD_SIMD_TYPE.  */
+  struct neon_type_el el;
+  enum neon_shape shape = neon_select_shape (NS_RRQQ, NS_NULL);
+  int is_vmlsldav;
+
+  el = neon_check_type (4, shape, N_EQK, N_EQK, N_EQK,
+                        N_S16 | N_S32 | N_U16 | N_U32 | N_KEY);
+
+  is_vmlsldav = (inst.instruction == M_MNEM_vmlsldav
+                 || inst.instruction == M_MNEM_vmlsldava
+                 || inst.instruction == M_MNEM_vmlsldavx
+                 || inst.instruction == M_MNEM_vmlsldavax);
+
+  if (is_vmlsldav && el.type == NT_unsigned)
+    first_error (BAD_SIMD_TYPE);
+
+  /* A condition value above COND_ALWAYS selects the inside-VPT marking.  */
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                         ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_rrqq (el.type == NT_unsigned, el.size);
+}
+
+/* Assemble the vrmlaldavh/vrmlsldavh family (shape NS_RRQQ).
+   - The four vrmlsldavh mnemonics accept only N_S32 and merely report
+     MVE_BAD_SP through as_tsktsk when operand 1 is SP.
+   - vrmlaldavhx and vrmlaldavhax accept only N_S32; the remaining
+     mnemonics accept N_U32 or N_S32.  For all of these, SP as operand 1
+     is rejected with BAD_SP.
+   In every case operand 1 must be an odd-numbered register (BAD_EVEN) and
+   must not be PC (BAD_PC).  The encoder gets a flag telling whether the
+   checked type is unsigned, and a constant zero.  */
+static void
+do_mve_vrmlaldavh (void)
  {
-  /* Check we're on the correct architecture.  */
-  if (!mark_feature_used (&fpu_neon_ext_armv8))
-    inst.error =
-      _("instruction form not available on this architecture.");
-  else if (!mark_feature_used (&fpu_neon_ext_v8_1))
+  struct neon_type_el et;
+  if (inst.instruction == M_MNEM_vrmlsldavh
+     || inst.instruction == M_MNEM_vrmlsldavha
+     || inst.instruction == M_MNEM_vrmlsldavhx
+     || inst.instruction == M_MNEM_vrmlsldavhax)
      {
-      as_warn (_("this instruction implies use of ARMv8.1 AdvSIMD."));
-      record_feature_use (&fpu_neon_ext_v8_1);
+      et = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK, N_S32 | N_KEY);
+      if (inst.operands[1].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
      }
+  else
+    {
+      if (inst.instruction == M_MNEM_vrmlaldavhx
+         || inst.instruction == M_MNEM_vrmlaldavhax)
+       et = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK, N_S32 | N_KEY);
+      else
+       et = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK,
+                             N_U32 | N_S32 | N_KEY);
+      /* vrmlaldavh's encoding with SP as the second, odd, GPR operand may alias
+        with vmax/min instructions, making the use of SP in assembly really
+        nonsensical, so instead of issuing a warning like we do for other uses
+        of SP for the odd register operand we error out.  */
+      constraint (inst.operands[1].reg == REG_SP, BAD_SP);
+    }
+
+  /* Make sure we still check the second operand is an odd one and that PC is
+     disallowed.  This is because we are parsing for any GPR operand, to be
+     able to distinguish between giving a warning or an error for SP as
+     described above.  */
+  constraint ((inst.operands[1].reg % 2) != 1, BAD_EVEN);
+  constraint (inst.operands[1].reg == REG_PC, BAD_PC);
  
-  if (inst.operands[2].isscalar)
+  /* A condition value above COND_ALWAYS selects the inside-VPT marking.  */
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_rrqq (et.type == NT_unsigned, 0);
+}
+
+
+/* Assemble an MVE instruction of shape NS_RQ with an N_F_MVE key type.
+   SP or PC as operand 0 is reported through as_tsktsk but encoding still
+   proceeds.  mve_encode_rq receives a flag telling whether the element size
+   is 16, and the constant 64.  */
+
+static void
+do_mve_vmaxnmv (void)
+{
+  struct neon_type_el el;
+  enum neon_shape shape = neon_select_shape (NS_RQ, NS_NULL);
+
+  el = neon_check_type (2, shape, N_EQK, N_F_MVE | N_KEY);
+
+  /* A condition value above COND_ALWAYS selects the inside-VPT marking.  */
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                         ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  switch (inst.operands[0].reg)
+    {
+    case REG_SP:
+      as_tsktsk (MVE_BAD_SP);
+      break;
+    case REG_PC:
+      as_tsktsk (MVE_BAD_PC);
+      break;
+    default:
+      break;
+    }
+
+  mve_encode_rq (el.size == 16, 64);
+}
+
+/* Assemble an MVE instruction of shape NS_RQ.  The vmaxv and vminv
+   mnemonics accept an N_SU_MVE key type; every other mnemonic routed here
+   accepts only N_S8, N_S16 or N_S32.  SP or PC as operand 0 is reported
+   through as_tsktsk but encoding still proceeds.  */
+
+static void
+do_mve_vmaxv (void)
+{
+  struct neon_type_el el;
+  enum neon_shape shape = neon_select_shape (NS_RQ, NS_NULL);
+
+  if (inst.instruction != M_MNEM_vmaxv && inst.instruction != M_MNEM_vminv)
+    el = neon_check_type (2, shape, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+  else
+    el = neon_check_type (2, shape, N_EQK, N_SU_MVE | N_KEY);
+
+  /* A condition value above COND_ALWAYS selects the inside-VPT marking.  */
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                         ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  switch (inst.operands[0].reg)
+    {
+    case REG_SP:
+      as_tsktsk (MVE_BAD_SP);
+      break;
+    case REG_PC:
+      as_tsktsk (MVE_BAD_PC);
+      break;
+    default:
+      break;
+    }
+
+  mve_encode_rq (el.type == NT_unsigned, el.size);
+}
+
+
+/* Assemble an instruction that has both a Neon and an MVE form.
+   Without mve_ext the Neon path is taken: the architecture features are
+   checked/recorded and either the scalar form (NS_DDS/NS_QQS) or the
+   three-register form (NS_DDD/NS_QQQ) is encoded.  With mve_ext only the
+   NS_QQR shape is accepted and mve_encode_qqr does the encoding.  */
+static void
+do_neon_qrdmlah (void)
+{
+  /* Bail out when check_simd_pred_availability reports failure.  */
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+  /* Neon path.  */
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
      {
-      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (SCALAR, inst);
-      neon_mul_mac (et, neon_quad (rs));
+      /* Check we're on the correct architecture.  */
+      if (!mark_feature_used (&fpu_neon_ext_armv8))
+       inst.error
+         = _("instruction form not available on this architecture.");
+      else if (!mark_feature_used (&fpu_neon_ext_v8_1))
+       {
+         as_warn (_("this instruction implies use of ARMv8.1 AdvSIMD."));
+         record_feature_use (&fpu_neon_ext_v8_1);
+       }
+       /* Encoding is attempted regardless of the outcome of the
+          architecture check above.  */
+       if (inst.operands[2].isscalar)
+         {
+           enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+           struct neon_type_el et = neon_check_type (3, rs,
+             N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+           NEON_ENCODE (SCALAR, inst);
+           neon_mul_mac (et, neon_quad (rs));
+         }
+       else
+         {
+           enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+           struct neon_type_el et = neon_check_type (3, rs,
+             N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+           NEON_ENCODE (INTEGER, inst);
+           /* The U bit (rounding) comes from bit mask.  */
+           neon_three_same (neon_quad (rs), 0, et.size);
+         }
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+      /* MVE path: only the NS_QQR shape with an N_SU_MVE key type.  */
+      enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+      struct neon_type_el et
+       = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+
        NEON_ENCODE (INTEGER, inst);
-      /* The U bit (rounding) comes from bit mask.  */
-      neon_three_same (neon_quad (rs), 0, et.size);
+      mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
      }
  }
  
@@ -16546,8 +18235,8 @@ do_neon_abs_neg (void)
    rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
    et = neon_check_type (2, rs, N_EQK, N_S_32 | N_F_16_32 | N_KEY);
  
-  if (check_simd_pred_availability (et.type == NT_float,
-                                   NEON_CHECK_ARCH | NEON_CHECK_CC))
+  if (!check_simd_pred_availability (et.type == NT_float,
+                                    NEON_CHECK_ARCH | NEON_CHECK_CC))
      return;
  
    inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -16564,9 +18253,23 @@ do_neon_abs_neg (void)
  static void
  do_neon_sli (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+    }
+
+
    int imm = inst.operands[2].imm;
    constraint (imm < 0 || (unsigned)imm >= et.size,
               _("immediate out of range for insert"));
@@ -16576,9 +18279,22 @@ do_neon_sli (void)
  static void
  do_neon_sri (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+    }
+
    int imm = inst.operands[2].imm;
    constraint (imm < 1 || (unsigned)imm > et.size,
               _("immediate out of range for insert"));
@@ -16588,9 +18304,23 @@ do_neon_sri (void)
  static void
  do_neon_qshlu_imm (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK | N_UNS, N_S8 | N_S16 | N_S32 | N_S64 | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK | N_UNS,
+                           N_S8 | N_S16 | N_S32 | N_S64 | N_KEY);
+    }
+
    int imm = inst.operands[2].imm;
    constraint (imm < 0 || (unsigned)imm >= et.size,
               _("immediate out of range for shift"));
@@ -17047,7 +18777,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
               || flavour == neon_cvt_flavour_s32_f32
               || flavour == neon_cvt_flavour_u32_f32))
         {
-         if (check_simd_pred_availability (1, NEON_CHECK_CC | NEON_CHECK_ARCH))
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH))
             return;
         }
        else if (mode == neon_cvt_mode_n)
@@ -17134,8 +18865,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
               || flavour == neon_cvt_flavour_s32_f32
               || flavour == neon_cvt_flavour_u32_f32))
         {
-         if (check_simd_pred_availability (1,
-                                           NEON_CHECK_CC | NEON_CHECK_ARCH8))
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH8))
             return;
         }
        else if (mode == neon_cvt_mode_z
@@ -17148,8 +18879,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
                    || flavour == neon_cvt_flavour_s32_f32
                    || flavour == neon_cvt_flavour_u32_f32))
         {
-         if (check_simd_pred_availability (1,
-                                           NEON_CHECK_CC | NEON_CHECK_ARCH))
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH))
             return;
         }
        /* fall through.  */
@@ -17158,8 +18889,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
         {
  
           NEON_ENCODE (FLOAT, inst);
-         if (check_simd_pred_availability (1,
-                                           NEON_CHECK_CC | NEON_CHECK_ARCH8))
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH8))
             return;
  
           inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17319,7 +19050,7 @@ do_neon_cvttb_1 (bfd_boolean t)
    else if (rs == NS_QQ || rs == NS_QQI)
      {
        int single_to_half = 0;
-      if (check_simd_pred_availability (1, NEON_CHECK_ARCH))
+      if (!check_simd_pred_availability (TRUE, NEON_CHECK_ARCH))
         return;
  
        enum neon_cvt_flavour flavour = get_neon_cvt_flavour (rs);
@@ -17459,9 +19190,16 @@ neon_move_immediate (void)
  static void
  do_neon_mvn (void)
  {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
    if (inst.operands[1].isreg)
      {
-      enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+      enum neon_shape rs;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       rs = neon_select_shape (NS_QQ, NS_NULL);
+      else
+       rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
  
        NEON_ENCODE (INTEGER, inst);
        inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17477,6 +19215,13 @@ do_neon_mvn (void)
      }
  
    neon_dp_fixup (&inst);
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      constraint (!inst.operands[1].isreg && !inst.operands[0].isquad, BAD_FPU);
+      constraint ((inst.instruction & 0xd00) == 0xd00,
+                 _("immediate value out of range"));
+    }
  }
  
  /* Encode instructions of form:
@@ -17789,14 +19534,29 @@ do_neon_ext (void)
  static void
  do_neon_rev (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_8 | N_16 | N_32 | N_KEY);
+
    unsigned op = (inst.instruction >> 7) & 3;
    /* N (width of reversed regions) is encoded as part of the bitmask. We
       extract it here to check the elements to be reversed are smaller.
       Otherwise we'd get a reserved instruction.  */
    unsigned elsize = (op == 2) ? 16 : (op == 1) ? 32 : (op == 0) ? 64 : 0;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) && elsize == 64
+      && inst.operands[0].reg == inst.operands[1].reg)
+    as_tsktsk (_("Warning: 64-bit element size and same destination and source"
+                " operands makes instruction UNPREDICTABLE"));
+
    gas_assert (elsize != 0);
    constraint (et.size >= elsize,
               _("elements must be smaller than reversal region"));
@@ -17808,6 +19568,8 @@ do_neon_dup (void)
  {
    if (inst.operands[1].isscalar)
      {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1),
+                 BAD_FPU);
        enum neon_shape rs = neon_select_shape (NS_DS, NS_QS, NS_NULL);
        struct neon_type_el et = neon_check_type (2, rs,
         N_EQK, N_8 | N_16 | N_32 | N_KEY);
@@ -17835,6 +19597,23 @@ do_neon_dup (void)
        enum neon_shape rs = neon_select_shape (NS_DR, NS_QR, NS_NULL);
        struct neon_type_el et = neon_check_type (2, rs,
         N_8 | N_16 | N_32 | N_KEY, N_EQK);
+      if (rs == NS_QR)
+       {
+         if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH))
+           return;
+       }
+      else
+       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1),
+                   BAD_FPU);
+
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         if (inst.operands[1].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[1].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+       }
+
        /* Duplicate ARM register to lanes of vector.  */
        NEON_ENCODE (ARMREG, inst);
        switch (et.size)
@@ -17854,6 +19633,67 @@ do_neon_dup (void)
      }
  }
  
+/* Encode the four-operand MVE VMOV handled for the NS_RRSS and NS_SSRR
+   shapes.  When TOQ is non-zero the two vector-lane operands are operands
+   0 and 1 and the two general purpose registers are operands 2 and 3; when
+   TOQ is zero the pairs are the other way round.  Does nothing unless
+   mve_ext is present in cpu_variant.  */
+
+static void
+do_mve_mov (int toQ)
+{
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    return;
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = MVE_UNPREDICABLE_INSN;
+
+  /* Operand slots: each pair occupies two consecutive operands and the
+     pairs trade places according to the direction of the move.  */
+  const unsigned lane_base = toQ ? 0 : 2;
+  const unsigned gpr_base = toQ ? 2 : 0;
+  const unsigned Q0 = lane_base, Q1 = lane_base + 1;
+  const unsigned Rt = gpr_base, Rt2 = gpr_base + 1;
+
+  constraint (inst.operands[Q0].reg != inst.operands[Q1].reg + 2,
+              _("Index one must be [2,3] and index two must be two less than"
+                " index one."));
+  constraint (inst.operands[Rt].reg == inst.operands[Rt2].reg,
+              _("General purpose registers may not be the same"));
+  constraint (inst.operands[Rt].reg == REG_SP
+              || inst.operands[Rt2].reg == REG_SP,
+              BAD_SP);
+  constraint (inst.operands[Rt].reg == REG_PC
+              || inst.operands[Rt2].reg == REG_PC,
+              BAD_PC);
+
+  /* Start from the fixed opcode, then merge in every field.  Bit 20 holds
+     the direction; all other fields come from operands Q1, Rt and Rt2.  */
+  inst.instruction = 0xec000f00;
+  inst.instruction |= ((HI1 (inst.operands[Q1].reg / 32) << 23)
+                       | (!!toQ << 20)
+                       | (inst.operands[Rt2].reg << 16)
+                       | (LOW4 (inst.operands[Q1].reg / 32) << 13)
+                       | ((inst.operands[Q1].reg % 4) << 4)
+                       | inst.operands[Rt].reg);
+}
+
+/* Encode a two-operand MVE instruction of shape NS_QQ whose key type is
+   N_I16 or N_I32.  Does nothing unless mve_ext is present in cpu_variant.
+   The size field written at bit 18 is neon_logbits of the element size
+   minus one; the register numbers of operands 0 and 1 are split into a
+   high bit and a low nibble.  */
+
+static void
+do_mve_movn (void)
+{
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    return;
+
+  /* A condition value above COND_ALWAYS selects the inside-VPT marking.  */
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                         ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  struct neon_type_el el
+    = neon_check_type (2, NS_QQ, N_EQK, N_I16 | N_I32 | N_KEY);
+  const unsigned dst = inst.operands[0].reg;
+  const unsigned src = inst.operands[1].reg;
+
+  inst.instruction |= (neon_logbits (el.size) - 1) << 18;
+  inst.instruction |= HI1 (dst) << 22;
+  inst.instruction |= LOW4 (dst) << 12;
+  inst.instruction |= HI1 (src) << 5;
+  inst.instruction |= LOW4 (src);
+  inst.is_neon = 1;
+}
+
  /* VMOV has particularly many variations. It can be one of:
       0. VMOV<c><q> <Qd>, <Qm>
       1. VMOV<c><q> <Dd>, <Dm>
@@ -17883,6 +19723,10 @@ do_neon_dup (void)
     (Two ARM regs to two VFP singles.)
      15. VMOV <Sd>, <Se>, <Rn>, <Rm>
     (Two VFP singles to two ARM regs.)
+   16. VMOV<c> <Rt>, <Rt2>, <Qd[idx]>, <Qd[idx2]>
+   17. VMOV<c> <Qd[idx]>, <Qd[idx2]>, <Rt>, <Rt2>
+   18. VMOV<c>.<dt> <Rt>, <Qn[idx]>
+   19. VMOV<c>.<dt> <Qd[idx]>, <Rt>
  
     These cases can be disambiguated using neon_select_shape, except cases 1/9
     and 3/11 which depend on the operand type too.
@@ -17898,10 +19742,11 @@ do_neon_dup (void)
  static void
  do_neon_mov (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_RRFF, NS_FFRR, NS_DRR, NS_RRD,
-                                         NS_QQ, NS_DD, NS_QI, NS_DI, NS_SR,
-                                         NS_RS, NS_FF, NS_FI, NS_RF, NS_FR,
-                                         NS_HR, NS_RH, NS_HI, NS_NULL);
+  enum neon_shape rs = neon_select_shape (NS_RRSS, NS_SSRR, NS_RRFF, NS_FFRR,
+                                         NS_DRR, NS_RRD, NS_QQ, NS_DD, NS_QI,
+                                         NS_DI, NS_SR, NS_RS, NS_FF, NS_FI,
+                                         NS_RF, NS_FR, NS_HR, NS_RH, NS_HI,
+                                         NS_NULL);
    struct neon_type_el et;
    const char *ldconst = 0;
  
@@ -17920,7 +19765,8 @@ do_neon_mov (void)
  
      case NS_QQ:  /* case 0/1.  */
        {
-       if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+       if (!check_simd_pred_availability (FALSE,
+                                          NEON_CHECK_CC | NEON_CHECK_ARCH))
           return;
         /* The architecture manual I have doesn't explicitly state which
            value the U bit should have for register->register moves, but
@@ -17950,7 +19796,8 @@ do_neon_mov (void)
        /* fall through.  */
  
      case NS_QI:  /* case 2/3.  */
-      if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+      if (!check_simd_pred_availability (FALSE,
+                                        NEON_CHECK_CC | NEON_CHECK_ARCH))
         return;
        inst.instruction = 0x0800010;
        neon_move_immediate ();
@@ -17977,12 +19824,31 @@ do_neon_mov (void)
         et = neon_check_type (2, NS_NULL, N_8 | N_16 | N_32 | N_KEY, N_EQK);
         logsize = neon_logbits (et.size);
  
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1),
-                   _(BAD_FPU));
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1)
-                   && et.size != 32, _(BAD_FPU));
+       if (et.size != 32)
+         {
+           if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+               && vfp_or_neon_is_neon (NEON_CHECK_ARCH) == FAIL)
+             return;
+         }
+       else
+         {
+           constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1)
+                       && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                       _(BAD_FPU));
+         }
+
+       if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+         {
+           if (inst.operands[1].reg == REG_SP)
+             as_tsktsk (MVE_BAD_SP);
+           else if (inst.operands[1].reg == REG_PC)
+             as_tsktsk (MVE_BAD_PC);
+         }
+       unsigned size = inst.operands[0].isscalar == 1 ? 64 : 128;
+
         constraint (et.type == NT_invtype, _("bad type for scalar"));
-       constraint (x >= 64 / et.size, _("scalar index out of range"));
+       constraint (x >= size / et.size, _("scalar index out of range"));
+
  
         switch (et.size)
           {
@@ -17992,7 +19858,7 @@ do_neon_mov (void)
           default: ;
           }
  
-       bcdebits |= x << logsize;
+       bcdebits |= (x & ((1 << (3-logsize)) - 1)) << logsize;
  
         inst.instruction = 0xe000b10;
         do_vfp_cond_or_thumb ();
@@ -18000,12 +19866,14 @@ do_neon_mov (void)
         inst.instruction |= HI1 (dn) << 7;
         inst.instruction |= inst.operands[1].reg << 12;
         inst.instruction |= (bcdebits & 3) << 5;
-       inst.instruction |= (bcdebits >> 2) << 21;
+       inst.instruction |= ((bcdebits >> 2) & 3) << 21;
+       inst.instruction |= (x >> (3-logsize)) << 16;
        }
        break;
  
      case NS_DRR:  /* case 5 (fmdrr).  */
-      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2),
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
                   _(BAD_FPU));
  
        inst.instruction = 0xc400b10;
@@ -18037,12 +19905,32 @@ do_neon_mov (void)
                               N_EQK, N_S8 | N_S16 | N_U8 | N_U16 | N_32 | N_KEY);
         logsize = neon_logbits (et.size);
  
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1),
-                   _(BAD_FPU));
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1)
-                   && et.size != 32, _(BAD_FPU));
+       if (et.size != 32)
+         {
+           if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+               && vfp_or_neon_is_neon (NEON_CHECK_CC
+                                       | NEON_CHECK_ARCH) == FAIL)
+             return;
+         }
+       else
+         {
+           constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1)
+                       && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                       _(BAD_FPU));
+         }
+
+       if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+         {
+           if (inst.operands[0].reg == REG_SP)
+             as_tsktsk (MVE_BAD_SP);
+           else if (inst.operands[0].reg == REG_PC)
+             as_tsktsk (MVE_BAD_PC);
+         }
+
+       unsigned size = inst.operands[1].isscalar == 1 ? 64 : 128;
+
         constraint (et.type == NT_invtype, _("bad type for scalar"));
-       constraint (x >= 64 / et.size, _("scalar index out of range"));
+       constraint (x >= size / et.size, _("scalar index out of range"));
  
         switch (et.size)
           {
@@ -18052,7 +19940,7 @@ do_neon_mov (void)
           default: ;
           }
  
-       abcdebits |= x << logsize;
+       abcdebits |= (x & ((1 << (3-logsize)) - 1)) << logsize;
         inst.instruction = 0xe100b10;
         do_vfp_cond_or_thumb ();
         inst.instruction |= LOW4 (dn) << 16;
@@ -18060,11 +19948,13 @@ do_neon_mov (void)
         inst.instruction |= inst.operands[0].reg << 12;
         inst.instruction |= (abcdebits & 3) << 5;
         inst.instruction |= (abcdebits >> 2) << 21;
+       inst.instruction |= (x >> (3-logsize)) << 16;
        }
        break;
  
      case NS_RRD:  /* case 7 (fmrrd).  */
-      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2),
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
                   _(BAD_FPU));
  
        inst.instruction = 0xc500b10;
@@ -18131,11 +20021,21 @@ do_neon_mov (void)
         do_scalar_fp16_v82_encode ();
        break;
  
+    case NS_RRSS:
+      do_mve_mov (0);
+      break;
+    case NS_SSRR:
+      do_mve_mov (1);
+      break;
+
      /* The encoders for the fmrrs and fmsrr instructions expect three operands
         (one of which is a list), but we have parsed four.  Do some fiddling to
         make the operands what do_vfp_reg2_from_sp2 and do_vfp_sp2_from_reg2
         expect.  */
      case NS_RRFF:  /* case 14 (fmrrs).  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                 _(BAD_FPU));
        constraint (inst.operands[3].reg != inst.operands[2].reg + 1,
                   _("VFP registers must be adjacent"));
        inst.operands[2].imm = 2;
@@ -18144,6 +20044,9 @@ do_neon_mov (void)
        break;
  
      case NS_FFRR:  /* case 15 (fmsrr).  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                 _(BAD_FPU));
        constraint (inst.operands[1].reg != inst.operands[0].reg + 1,
                   _("VFP registers must be adjacent"));
        inst.operands[1] = inst.operands[2];
@@ -18163,11 +20066,58 @@ do_neon_mov (void)
      }
  }
  
+/* Encode the two-Q-register MVE form handled here, or hand the instruction
+   over to do_neon_mov when the operands are anything other than exactly
+   two quad registers.  In the hand-over case the opcode is cleared and the
+   condition is forced to 0xb before do_neon_mov runs.  */
+
+static void
+do_mve_movl (void)
+{
+  if (!inst.operands[0].present || !inst.operands[0].isquad
+      || !inst.operands[1].present || !inst.operands[1].isquad
+      || inst.operands[2].present)
+    {
+      inst.instruction = 0;
+      inst.cond = 0xb;
+      if (thumb_mode)
+        set_pred_insn_type (INSIDE_IT_INSN);
+      do_neon_mov ();
+      return;
+    }
+
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    return;
+
+  if (inst.cond != COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+
+  struct neon_type_el el
+    = neon_check_type (2, NS_QQ, N_EQK,
+                       N_S8 | N_U8 | N_S16 | N_U16 | N_KEY);
+  const unsigned dst = inst.operands[0].reg;
+  const unsigned src = inst.operands[1].reg;
+
+  /* Bit 28 flags an unsigned type; the size field at bit 19 is
+     neon_logbits of the element size plus one.  */
+  inst.instruction |= (el.type == NT_unsigned) << 28;
+  inst.instruction |= (neon_logbits (el.size) + 1) << 19;
+  inst.instruction |= HI1 (dst) << 22;
+  inst.instruction |= LOW4 (dst) << 12;
+  inst.instruction |= HI1 (src) << 5;
+  inst.instruction |= LOW4 (src);
+  inst.is_neon = 1;
+}
+
  static void
  do_neon_rshift_round_imm (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_SU_MVE | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+    }
    int imm = inst.operands[2].imm;
  
    /* imm == 0 case is encoded as VMOV for V{R}SHR.  */
@@ -18249,7 +20199,14 @@ do_neon_zip_uzp (void)
  static void
  do_neon_sat_abs_neg (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
    neon_two_same (neon_quad (rs), 1, et.size);
@@ -18278,7 +20235,15 @@ do_neon_recip_est (void)
  static void
  do_neon_cls (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+   rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+   rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
    neon_two_same (neon_quad (rs), 1, et.size);
@@ -18287,7 +20252,15 @@ do_neon_cls (void)
  static void
  do_neon_clz (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+   rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+   rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_I8 | N_I16 | N_I32 | N_KEY);
    neon_two_same (neon_quad (rs), 1, et.size);
@@ -18826,12 +20799,13 @@ do_vsel (void)
  static void
  do_vmaxnm (void)
  {
-  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    set_pred_insn_type (OUTSIDE_PRED_INSN);
  
    if (try_vfp_nsyn (3, do_vfp_nsyn_fpv8) == SUCCESS)
      return;
  
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL)
+  if (!check_simd_pred_availability (TRUE, NEON_CHECK_CC | NEON_CHECK_ARCH8))
      return;
  
    neon_dyadic_misc (NT_untyped, N_F_16_32, 0);
@@ -18895,12 +20869,12 @@ do_vrint_1 (enum neon_cvt_mode mode)
        if (et.type == NT_invtype)
         return;
  
-      set_pred_insn_type (OUTSIDE_PRED_INSN);
-      NEON_ENCODE (FLOAT, inst);
-
-      if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL)
+      if (!check_simd_pred_availability (TRUE,
+                                        NEON_CHECK_CC | NEON_CHECK_ARCH8))
         return;
  
+      NEON_ENCODE (FLOAT, inst);
+
        inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
        inst.instruction |= HI1 (inst.operands[0].reg) << 22;
        inst.instruction |= LOW4 (inst.operands[1].reg);
@@ -18989,16 +20963,24 @@ neon_scalar_for_vcmla (unsigned opnd, unsigned elsize)
  static void
  do_vcmla (void)
  {
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-             _(BAD_FPU));
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext)
+             && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8)
+                 || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU));
    constraint (inst.relocs[0].exp.X_op != O_constant,
               _("expression too complex"));
    unsigned rot = inst.relocs[0].exp.X_add_number;
    constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270,
               _("immediate out of range"));
    rot /= 90;
+
+  if (!check_simd_pred_availability (TRUE,
+                                    NEON_CHECK_ARCH8 | NEON_CHECK_CC))
+    return;
+
    if (inst.operands[2].isscalar)
      {
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+       first_error (_("invalid instruction shape"));
        enum neon_shape rs = neon_select_shape (NS_DDSI, NS_QQSI, NS_NULL);
        unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
                                        N_KEY | N_F16 | N_F32).size;
@@ -19017,9 +20999,19 @@ do_vcmla (void)
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+      enum neon_shape rs;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+       rs = neon_select_shape (NS_QQQI, NS_NULL);
+      else
+       rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+
        unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
                                        N_KEY | N_F16 | N_F32).size;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext) && size == 32
+         && (inst.operands[0].reg == inst.operands[1].reg
+             || inst.operands[0].reg == inst.operands[2].reg))
+       as_tsktsk (BAD_MVE_SRCDEST);
+
        neon_three_same (neon_quad (rs), 0, -1);
        inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
        inst.instruction |= 0xfc200800;
@@ -19031,20 +21023,60 @@ do_vcmla (void)
  static void
  do_vcadd (void)
  {
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-             _(BAD_FPU));
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+             && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8)
+                 || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU));
    constraint (inst.relocs[0].exp.X_op != O_constant,
               _("expression too complex"));
+
    unsigned rot = inst.relocs[0].exp.X_add_number;
    constraint (rot != 90 && rot != 270, _("immediate out of range"));
-  enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
-  unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
-                                  N_KEY | N_F16 | N_F32).size;
-  neon_three_same (neon_quad (rs), 0, -1);
-  inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
-  inst.instruction |= 0xfc800800;
-  inst.instruction |= (rot == 270) << 24;
-  inst.instruction |= (size == 32) << 20;
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_QQQI, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32 | N_I8
+                           | N_I16 | N_I32);
+      if (et.size == 32 && inst.operands[0].reg == inst.operands[2].reg)
+       as_tsktsk (_("Warning: 32-bit element size and same first and third "
+                    "operand makes instruction UNPREDICTABLE"));
+    }
+
+  if (et.type == NT_invtype)
+    return;
+
+  if (!check_simd_pred_availability (et.type == NT_float,
+                                    NEON_CHECK_ARCH8 | NEON_CHECK_CC))
+    return;
+
+  if (et.type == NT_float)
+    {
+      neon_three_same (neon_quad (rs), 0, -1);
+      inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
+      inst.instruction |= 0xfc800800;
+      inst.instruction |= (rot == 270) << 24;
+      inst.instruction |= (et.size == 32) << 20;
+    }
+  else
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      inst.instruction = 0xfe000f00;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= neon_logbits (et.size) << 20;
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= (rot == 270) << 12;
+      inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+      inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+      inst.instruction |= LOW4 (inst.operands[2].reg);
+      inst.is_neon = 1;
+    }
  }
  
  /* Dot Product instructions encoding support.  */
@@ -20790,6 +22822,10 @@ static const struct reg_entry reg_names[] =
    REGDEF(WR, 7,RN), REGDEF(SB, 9,RN), REGDEF(SL,10,RN), REGDEF(FP,11,RN),
    REGDEF(IP,12,RN), REGDEF(SP,13,RN), REGDEF(LR,14,RN), REGDEF(PC,15,RN),
  
+  /* The zero register (ZR) introduced in Armv8.1-M.  */
+  REGDEF(zr,15,ZR),
+  REGDEF(ZR,15,ZR),
+
    /* Coprocessor numbers.  */
    REGSET(p, CP), REGSET(P, CP),
  
@@ -20853,6 +22889,10 @@ static const struct reg_entry reg_names[] =
    REGDEF(mvfr0,7,VFC), REGDEF(mvfr1,6,VFC),
    REGDEF(MVFR0,7,VFC), REGDEF(MVFR1,6,VFC),
    REGDEF(mvfr2,5,VFC), REGDEF(MVFR2,5,VFC),
+  REGDEF(fpscr_nzcvqc,2,VFC), REGDEF(FPSCR_nzcvqc,2,VFC),
+  REGDEF(vpr,12,VFC), REGDEF(VPR,12,VFC),
+  REGDEF(fpcxt_ns,14,VFC), REGDEF(FPCXT_NS,14,VFC),
+  REGDEF(fpcxt_s,15,VFC), REGDEF(FPCXT_S,15,VFC),
  
    /* Maverick DSP coprocessor registers.  */
    REGSET(mvf,MVF),  REGSET(mvd,MVD),  REGSET(mvfx,MVFX),  REGSET(mvdx,MVDX),
@@ -21208,6 +23248,10 @@ static struct asm_barrier_opt barrier_opt_names[] =
  #define cCE(mnem,  op, nops, ops, ae)  \
    { mnem, OPS##nops ops, OT_csuffix, 0x##op, 0xe##op, ARM_VARIANT, ARM_VARIANT, do_##ae, do_##ae, 0 }
  
+/* Move instructions that are shared between coprocessor (VFP) and MVE.  */
+#define mcCE(mnem,  op, nops, ops, ae) \
+  { #mnem, OPS##nops ops, OT_csuffix, 0x##op, 0xe##op, ARM_VARIANT, THUMB_VARIANT, do_##ae, do_##ae, 0 }
+
  /* Legacy coprocessor instructions where conditional infix and conditional
     suffix are ambiguous.  For consistency this includes all FPA instructions,
     not just the potentially ambiguous ones.  */
@@ -21969,15 +24013,13 @@ static const struct asm_opcode insns[] =
    nUF(vselvs, _vselvs, 3, (RVSD, RVSD, RVSD),          vsel),
    nUF(vselge, _vselge, 3, (RVSD, RVSD, RVSD),          vsel),
    nUF(vselgt, _vselgt, 3, (RVSD, RVSD, RVSD),          vsel),
-  nUF(vmaxnm, _vmaxnm, 3, (RNSDQ, oRNSDQ, RNSDQ),      vmaxnm),
-  nUF(vminnm, _vminnm, 3, (RNSDQ, oRNSDQ, RNSDQ),      vmaxnm),
    nCE(vrintr, _vrintr, 2, (RNSDQ, oRNSDQ),             vrintr),
-  nCE(vrintz, _vrintr, 2, (RNSDQ, oRNSDQ),             vrintz),
-  nCE(vrintx, _vrintr, 2, (RNSDQ, oRNSDQ),             vrintx),
-  nUF(vrinta, _vrinta, 2, (RNSDQ, oRNSDQ),             vrinta),
-  nUF(vrintn, _vrinta, 2, (RNSDQ, oRNSDQ),             vrintn),
-  nUF(vrintp, _vrinta, 2, (RNSDQ, oRNSDQ),             vrintp),
-  nUF(vrintm, _vrinta, 2, (RNSDQ, oRNSDQ),             vrintm),
+  mnCE(vrintz, _vrintr, 2, (RNSDQMQ, oRNSDQMQ),                vrintz),
+  mnCE(vrintx, _vrintr, 2, (RNSDQMQ, oRNSDQMQ),                vrintx),
+  mnUF(vrinta, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrinta),
+  mnUF(vrintn, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrintn),
+  mnUF(vrintp, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrintp),
+  mnUF(vrintm, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrintm),
  
    /* Crypto v1 extensions.  */
  #undef  ARM_VARIANT
@@ -22023,8 +24065,6 @@ static const struct asm_opcode insns[] =
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT & arm_ext_v8_3
   NCE (vjcvt, eb90bc0, 2, (RVS, RVD), vjcvt),
- NUF (vcmla, 0, 4, (RNDQ, RNDQ, RNDQ_RNSC, EXPi), vcmla),
- NUF (vcadd, 0, 4, (RNDQ, RNDQ, RNDQ, EXPi), vcadd),
  
  #undef  ARM_VARIANT
  #define ARM_VARIANT   & fpu_neon_ext_dotprod
@@ -22480,14 +24520,14 @@ static const struct asm_opcode insns[] =
  
  #undef  ARM_VARIANT
  #define ARM_VARIANT  & fpu_vfp_ext_v1xd  /* VFP V1xD (single precision).  */
+#undef THUMB_VARIANT
+#define THUMB_VARIANT  & arm_ext_v6t2
+ mcCE(vmrs,    ef00a10, 2, (APSR_RR, RVC),   vmrs),
+ mcCE(vmsr,    ee00a10, 2, (RVC, RR),        vmsr),
+#undef THUMB_VARIANT
  
    /* Moves and type conversions.  */
- cCE("fcpys",  eb00a40, 2, (RVS, RVS),       vfp_sp_monadic),
- cCE("fmrs",   e100a10, 2, (RR, RVS),        vfp_reg_from_sp),
- cCE("fmsr",   e000a10, 2, (RVS, RR),        vfp_sp_from_reg),
   cCE("fmstat", ef1fa10, 0, (),               noargs),
- cCE("vmrs",   ef00a10, 2, (APSR_RR, RVC),   vmrs),
- cCE("vmsr",   ee00a10, 2, (RVC, RR),        vmsr),
   cCE("fsitos", eb80ac0, 2, (RVS, RVS),       vfp_sp_monadic),
   cCE("fuitos", eb80a40, 2, (RVS, RVS),       vfp_sp_monadic),
   cCE("ftosis", ebd0a40, 2, (RVS, RVS),       vfp_sp_monadic),
@@ -22556,7 +24596,6 @@ static const struct asm_opcode insns[] =
  #define ARM_VARIANT  & fpu_vfp_ext_v1 /* VFP V1 (Double precision).  */
  
    /* Moves and type conversions.  */
- cCE("fcpyd",  eb00b40, 2, (RVD, RVD),       vfp_dp_rd_rm),
   cCE("fcvtds", eb70ac0, 2, (RVD, RVS),       vfp_dp_sp_cvt),
   cCE("fcvtsd", eb70bc0, 2, (RVS, RVD),       vfp_sp_dp_cvt),
   cCE("fmdhr",  e200b10, 2, (RVD, RR),        vfp_dp_rn_rd),
@@ -22592,14 +24631,6 @@ static const struct asm_opcode insns[] =
   cCE("fcmped", eb40bc0, 2, (RVD, RVD),       vfp_dp_rd_rm),
   cCE("fcmpezd",        eb50bc0, 1, (RVD),            vfp_dp_rd),
  
-#undef  ARM_VARIANT
-#define ARM_VARIANT  & fpu_vfp_ext_v2
-
- cCE("fmsrr",  c400a10, 3, (VRSLST, RR, RR), vfp_sp2_from_reg2),
- cCE("fmrrs",  c500a10, 3, (RR, RR, VRSLST), vfp_reg2_from_sp2),
- cCE("fmdrr",  c400b10, 3, (RVD, RR, RR),    vfp_dp_rm_rd_rn),
- cCE("fmrrd",  c500b10, 3, (RR, RR, RVD),    vfp_dp_rd_rn_rm),
-
  /* Instructions which may belong to either the Neon or VFP instruction sets.
     Individual encoder functions perform additional architecture checks.  */
  #undef  ARM_VARIANT
@@ -22613,15 +24644,11 @@ static const struct asm_opcode insns[] =
   nCE(vnmul,     _vnmul,   3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
   nCE(vnmla,     _vnmla,   3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
   nCE(vnmls,     _vnmls,   3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
- nCE(vcmp,      _vcmp,    2, (RVSD, RSVD_FI0),    vfp_nsyn_cmp),
- nCE(vcmpe,     _vcmpe,   2, (RVSD, RSVD_FI0),    vfp_nsyn_cmp),
   NCE(vpush,     0,       1, (VRSDLST),          vfp_nsyn_push),
   NCE(vpop,      0,       1, (VRSDLST),          vfp_nsyn_pop),
   NCE(vcvtz,     0,       2, (RVSD, RVSD),       vfp_nsyn_cvtz),
  
    /* Mnemonics shared by Neon and VFP.  */
- nCEF(vmul,     _vmul,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mul),
- nCEF(vmla,     _vmla,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),
   nCEF(vmls,     _vmls,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),
  
   NCE(vldm,      c900b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
@@ -22638,7 +24665,6 @@ static const struct asm_opcode insns[] =
  
  
    /* NOTE: All VMOV encoding is special-cased!  */
- NCE(vmov,      0,       1, (VMOV), neon_mov),
   NCE(vmovq,     0,       1, (VMOV), neon_mov),
  
  #undef  THUMB_VARIANT
@@ -22672,38 +24698,24 @@ static const struct asm_opcode insns[] =
    /* integer ops, valid types S8 S16 S32 U8 U16 U32.  */
   NUF(vaba,      0000710, 3, (RNDQ, RNDQ,  RNDQ), neon_dyadic_i_su),
   NUF(vabaq,     0000710, 3, (RNQ,  RNQ,   RNQ),  neon_dyadic_i_su),
- NUF(vhadd,     0000000, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
   NUF(vhaddq,    0000000, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vrhadd,    0000100, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
   NUF(vrhaddq,   0000100, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vhsub,     0000200, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
   NUF(vhsubq,    0000200, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
    /* integer ops, valid types S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqadd,     0000010, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
   NUF(vqaddq,    0000010, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vqsub,     0000210, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
   NUF(vqsubq,    0000210, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vrshl,     0000500, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
   NUF(vrshlq,    0000500, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),
- NUF(vqrshl,    0000510, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
   NUF(vqrshlq,   0000510, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),
    /* If not immediate, fall back to neon_dyadic_i64_su.
-     shl_imm should accept I8 I16 I32 I64,
-     qshl_imm should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
- nUF(vshl,      _vshl,    3, (RNDQ, oRNDQ, RNDQ_I63b), neon_shl_imm),
- nUF(vshlq,     _vshl,    3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl_imm),
- nUF(vqshl,     _vqshl,   3, (RNDQ, oRNDQ, RNDQ_I63b), neon_qshl_imm),
- nUF(vqshlq,    _vqshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl_imm),
+     shl should accept I8 I16 I32 I64,
+     qshl should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
+ nUF(vshlq,     _vshl,    3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl),
+ nUF(vqshlq,    _vqshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl),
    /* Logic ops, types optional & ignored.  */
- nUF(vand,      _vand,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vandq,     _vand,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vbic,      _vbic,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vbicq,     _vbic,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorr,      _vorr,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vorrq,     _vorr,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorn,      _vorn,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vornq,     _vorn,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(veor,      _veor,    3, (RNDQ, oRNDQ, RNDQ),      neon_logic),
   nUF(veorq,     _veor,    3, (RNQ,  oRNQ,  RNQ),       neon_logic),
    /* Bitfield ops, untyped.  */
   NUF(vbsl,      1100110, 3, (RNDQ, RNDQ, RNDQ), neon_bitfield),
@@ -22714,9 +24726,7 @@ static const struct asm_opcode insns[] =
   NUF(vbifq,     1300110, 3, (RNQ,  RNQ,  RNQ),  neon_bitfield),
    /* Int and float variants, types S8 S16 S32 U8 U16 U32 F16 F32.  */
   nUF(vabdq,     _vabd,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmax,      _vmax,    3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
   nUF(vmaxq,     _vmax,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmin,      _vmin,    3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
   nUF(vminq,     _vmin,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
    /* Comparisons. Types S8 S16 S32 U8 U16 U32 F32. Non-immediate versions fall
       back to neon_dyadic_if_su.  */
@@ -22747,9 +24757,7 @@ static const struct asm_opcode insns[] =
    /* VMUL takes I8 I16 I32 F32 P8.  */
   nUF(vmulq,     _vmul,     3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_mul),
    /* VQD{R}MULH takes S16 S32.  */
- nUF(vqdmulh,   _vqdmulh,  3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
   nUF(vqdmulhq,  _vqdmulh,  3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
- nUF(vqrdmulh,  _vqrdmulh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
   nUF(vqrdmulhq, _vqrdmulh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
   NUF(vacge,     0000e10,  3, (RNDQ, oRNDQ, RNDQ), neon_fcmp_absolute),
   NUF(vacgeq,    0000e10,  3, (RNQ,  oRNQ,  RNQ),  neon_fcmp_absolute),
@@ -22764,7 +24772,6 @@ static const struct asm_opcode insns[] =
   NUF(vrsqrts,   0200f10,  3, (RNDQ, oRNDQ, RNDQ), neon_step),
   NUF(vrsqrtsq,  0200f10,  3, (RNQ,  oRNQ,  RNQ),  neon_step),
   /* ARM v8.1 extension.  */
- nUF (vqrdmlah,  _vqrdmlah, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
   nUF (vqrdmlahq, _vqrdmlah, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
   nUF (vqrdmlsh,  _vqrdmlsh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
   nUF (vqrdmlshq, _vqrdmlsh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
@@ -22776,21 +24783,16 @@ static const struct asm_opcode insns[] =
    /* Data processing with two registers and a shift amount.  */
    /* Right shifts, and variants with rounding.
       Types accepted S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vshr,      0800010, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
   NUF(vshrq,     0800010, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
- NUF(vrshr,     0800210, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
   NUF(vrshrq,    0800210, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
   NUF(vsra,      0800110, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
   NUF(vsraq,     0800110, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
   NUF(vrsra,     0800310, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
   NUF(vrsraq,    0800310, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
    /* Shift and insert. Sizes accepted 8 16 32 64.  */
- NUF(vsli,      1800510, 3, (RNDQ, oRNDQ, I63), neon_sli),
   NUF(vsliq,     1800510, 3, (RNQ,  oRNQ,  I63), neon_sli),
- NUF(vsri,      1800410, 3, (RNDQ, oRNDQ, I64), neon_sri),
   NUF(vsriq,     1800410, 3, (RNQ,  oRNQ,  I64), neon_sri),
    /* QSHL{U} immediate accepts S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqshlu,    1800610, 3, (RNDQ, oRNDQ, I63), neon_qshlu_imm),
   NUF(vqshluq,   1800610, 3, (RNQ,  oRNQ,  I63), neon_qshlu_imm),
    /* Right shift immediate, saturating & narrowing, with rounding variants.
       Types accepted S16 S32 S64 U16 U32 U64.  */
@@ -22807,7 +24809,6 @@ static const struct asm_opcode insns[] =
    /* CVT with optional immediate for fixed-point variant.  */
   nUF(vcvtq,     _vcvt,    3, (RNQ, RNQ, oI32b), neon_cvt),
  
- nUF(vmvn,      _vmvn,    2, (RNDQ, RNDQ_Ibig), neon_mvn),
   nUF(vmvnq,     _vmvn,    2, (RNQ,  RNDQ_Ibig), neon_mvn),
  
    /* Data processing, three registers of different lengths.  */
@@ -22839,14 +24840,10 @@ static const struct asm_opcode insns[] =
  
    /* Two registers, miscellaneous.  */
    /* Reverse. Sizes 8 16 32 (must be < size in opcode).  */
- NUF(vrev64,    1b00000, 2, (RNDQ, RNDQ),     neon_rev),
   NUF(vrev64q,   1b00000, 2, (RNQ,  RNQ),      neon_rev),
- NUF(vrev32,    1b00080, 2, (RNDQ, RNDQ),     neon_rev),
   NUF(vrev32q,   1b00080, 2, (RNQ,  RNQ),      neon_rev),
- NUF(vrev16,    1b00100, 2, (RNDQ, RNDQ),     neon_rev),
   NUF(vrev16q,   1b00100, 2, (RNQ,  RNQ),      neon_rev),
    /* Vector replicate. Sizes 8 16 32.  */
- nCE(vdup,      _vdup,    2, (RNDQ, RR_RNSC),  neon_dup),
   nCE(vdupq,     _vdup,    2, (RNQ,  RR_RNSC),  neon_dup),
    /* VMOVL. Types S8 S16 S32 U8 U16 U32.  */
   NUF(vmovl,     0800a10, 2, (RNQ, RND),       neon_movl),
@@ -22862,9 +24859,7 @@ static const struct asm_opcode insns[] =
   NUF(vuzp,      1b20100, 2, (RNDQ, RNDQ),     neon_zip_uzp),
   NUF(vuzpq,     1b20100, 2, (RNQ,  RNQ),      neon_zip_uzp),
    /* VQABS / VQNEG. Types S8 S16 S32.  */
- NUF(vqabs,     1b00700, 2, (RNDQ, RNDQ),     neon_sat_abs_neg),
   NUF(vqabsq,    1b00700, 2, (RNQ,  RNQ),      neon_sat_abs_neg),
- NUF(vqneg,     1b00780, 2, (RNDQ, RNDQ),     neon_sat_abs_neg),
   NUF(vqnegq,    1b00780, 2, (RNQ,  RNQ),      neon_sat_abs_neg),
    /* Pairwise, lengthening. Types S8 S16 S32 U8 U16 U32.  */
   NUF(vpadal,    1b00600, 2, (RNDQ, RNDQ),     neon_pair_long),
@@ -22877,10 +24872,8 @@ static const struct asm_opcode insns[] =
   NUF(vrsqrte,   1b30480, 2, (RNDQ, RNDQ),     neon_recip_est),
   NUF(vrsqrteq,  1b30480, 2, (RNQ,  RNQ),      neon_recip_est),
    /* VCLS. Types S8 S16 S32.  */
- NUF(vcls,      1b00400, 2, (RNDQ, RNDQ),     neon_cls),
   NUF(vclsq,     1b00400, 2, (RNQ,  RNQ),      neon_cls),
    /* VCLZ. Types I8 I16 I32.  */
- NUF(vclz,      1b00480, 2, (RNDQ, RNDQ),     neon_clz),
   NUF(vclzq,     1b00480, 2, (RNQ,  RNQ),      neon_clz),
    /* VCNT. Size 8.  */
   NUF(vcnt,      1b00500, 2, (RNDQ, RNDQ),     neon_cnt),
@@ -22944,11 +24937,12 @@ static const struct asm_opcode insns[] =
  #define ARM_VARIANT    & fpu_vfp_ext_fma
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT  & fpu_vfp_ext_fma
- /* Mnemonics shared by Neon and VFP.  These are included in the
+ /* Mnemonics shared by Neon, VFP and MVE.  These are included in the
      VFP FMA variant; NEON and VFP FMA always includes the NEON
      FMA instructions.  */
- nCEF(vfma,     _vfma,    3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
- nCEF(vfms,     _vfms,    3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
+ mnCEF(vfma,     _vfma,    3, (RNSDQMQ, oRNSDQMQ, RNSDQMQR), neon_fmac),
+ mnCEF(vfms,     _vfms,    3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),  neon_fmac),
+
   /* ffmas/ffmad/ffmss/ffmsd are dummy mnemonics to satisfy gas;
      the v form should always be used.  */
   cCE("ffmas",  ea00a00, 3, (RVS, RVS, RVS),  vfp_sp_dyadic),
@@ -23316,6 +25310,16 @@ static const struct asm_opcode insns[] =
   /* Armv8.1-M Mainline instructions.  */
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT & arm_ext_v8_1m_main
+ toU("cinc",  _cinc,  3, (RRnpcsp, RR_ZR, COND),       t_cond),
+ toU("cinv",  _cinv,  3, (RRnpcsp, RR_ZR, COND),       t_cond),
+ toU("cneg",  _cneg,  3, (RRnpcsp, RR_ZR, COND),       t_cond),
+ toU("csel",  _csel,  4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+ toU("csetm", _csetm, 2, (RRnpcsp, COND),              t_cond),
+ toU("cset",  _cset,  2, (RRnpcsp, COND),              t_cond),
+ toU("csinc", _csinc, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+ toU("csinv", _csinv, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+ toU("csneg", _csneg, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+
   toC("bf",     _bf,    2, (EXPs, EXPs),             t_branch_future),
   toU("bfcsel", _bfcsel,        4, (EXPs, EXPs, EXPs, COND), t_branch_future),
   toC("bfx",    _bfx,   2, (EXPs, RRnpcsp),          t_branch_future),
@@ -23331,6 +25335,38 @@ static const struct asm_opcode insns[] =
  
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT & mve_ext
+ ToC("lsll",   ea50010d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("lsrl",   ea50011f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("asrl",   ea50012d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("uqrshll",        ea51010d, 3, (RRe, RRo, RRnpcsp),     mve_scalar_shift),
+ ToC("sqrshrl",        ea51012d, 3, (RRe, RRo, RRnpcsp),     mve_scalar_shift),
+ ToC("uqshll", ea51010f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("urshrl", ea51011f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("srshrl", ea51012f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("sqshll", ea51013f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("uqrshl", ea500f0d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("sqrshr", ea500f2d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("uqshl",  ea500f0f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("urshr",  ea500f1f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("srshr",  ea500f2f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("sqshl",  ea500f3f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+
+ ToC("vpt",    ee410f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptt",   ee018f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpte",   ee418f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttt",  ee014f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptte",  ee01cf00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptet",  ee41cf00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptee",  ee414f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptttt", ee012f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttte", ee016f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttet", ee01ef00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttee", ee01af00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptett", ee41af00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptete", ee41ef00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpteet", ee416f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpteee", ee412f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+
   ToC("vpst",   fe710f4d, 0, (), mve_vpt),
   ToC("vpstt",  fe318f4d, 0, (), mve_vpt),
   ToC("vpste",  fe718f4d, 0, (), mve_vpt),
@@ -23348,6 +25384,11 @@ static const struct asm_opcode insns[] =
   ToC("vpsteee",        fe712f4d, 0, (), mve_vpt),
  
   /* MVE and MVE FP only.  */
+ mToC("vhcadd",        ee000f00,   4, (RMQ, RMQ, RMQ, EXPi),             mve_vhcadd),
+ mCEF(vadc,    _vadc,      3, (RMQ, RMQ, RMQ),                   mve_vadc),
+ mCEF(vadci,   _vadci,     3, (RMQ, RMQ, RMQ),                   mve_vadc),
+ mToC("vsbc",  fe300f00,   3, (RMQ, RMQ, RMQ),                   mve_vsbc),
+ mToC("vsbci", fe301f00,   3, (RMQ, RMQ, RMQ),                   mve_vsbc),
   mCEF(vmullb,  _vmullb,    3, (RMQ, RMQ, RMQ),                   mve_vmull),
   mCEF(vabav,   _vabav,     3, (RRnpcsp, RMQ, RMQ),               mve_vabav),
   mCEF(vmladav,   _vmladav,     3, (RRe, RMQ, RMQ),             mve_vmladav),
@@ -23382,10 +25423,118 @@ static const struct asm_opcode insns[] =
   mCEF(vldrw,   _vldrw,     2, (RMQ, ADDRMVE),                  mve_vstr_vldr),
   mCEF(vldrd,   _vldrd,     2, (RMQ, ADDRMVE),                  mve_vstr_vldr),
  
+ mCEF(vmovnt,  _vmovnt,    2, (RMQ, RMQ),                        mve_movn),
+ mCEF(vmovnb,  _vmovnb,    2, (RMQ, RMQ),                        mve_movn),
+ mCEF(vbrsr,   _vbrsr,     3, (RMQ, RMQ, RR),                    mve_vbrsr),
+ mCEF(vaddlv,  _vaddlv,    3, (RRe, RRo, RMQ),                   mve_vaddlv),
+ mCEF(vaddlva, _vaddlva,   3, (RRe, RRo, RMQ),                   mve_vaddlv),
+ mCEF(vaddv,   _vaddv,     2, (RRe, RMQ),                        mve_vaddv),
+ mCEF(vaddva,  _vaddva,    2, (RRe, RMQ),                        mve_vaddv),
+ mCEF(vddup,   _vddup,     3, (RMQ, RRe, EXPi),                  mve_viddup),
+ mCEF(vdwdup,  _vdwdup,    4, (RMQ, RRe, RR, EXPi),              mve_viddup),
+ mCEF(vidup,   _vidup,     3, (RMQ, RRe, EXPi),                  mve_viddup),
+ mCEF(viwdup,  _viwdup,    4, (RMQ, RRe, RR, EXPi),              mve_viddup),
+ mToC("vmaxa", ee330e81,   2, (RMQ, RMQ),                        mve_vmaxa_vmina),
+ mToC("vmina", ee331e81,   2, (RMQ, RMQ),                        mve_vmaxa_vmina),
+ mCEF(vmaxv,   _vmaxv,   2, (RR, RMQ),                           mve_vmaxv),
+ mCEF(vmaxav,  _vmaxav,  2, (RR, RMQ),                           mve_vmaxv),
+ mCEF(vminv,   _vminv,   2, (RR, RMQ),                           mve_vmaxv),
+ mCEF(vminav,  _vminav,  2, (RR, RMQ),                           mve_vmaxv),
+
+ mCEF(vmlaldav,          _vmlaldav,    4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlaldava,  _vmlaldava,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlaldavx,  _vmlaldavx,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlaldavax, _vmlaldavax, 4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlalv,    _vmlaldav,    4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlalva,   _vmlaldava,   4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldav,          _vmlsldav,    4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldava,  _vmlsldava,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldavx,  _vmlsldavx,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldavax, _vmlsldavax, 4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mToC("vrmlaldavh", ee800f00,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlaldavha",ee800f20,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlaldavhx,  _vrmlaldavhx,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlaldavhax, _vrmlaldavhax, 4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlalvh",   ee800f00,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlalvha",  ee800f20,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavh,   _vrmlsldavh,   4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavha,  _vrmlsldavha,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavhx,  _vrmlsldavhx,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavhax, _vrmlsldavhax, 4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+
+ mToC("vmlas",   ee011e40,     3, (RMQ, RMQ, RR),              mve_vmlas),
+ mToC("vmulh",   ee010e01,     3, (RMQ, RMQ, RMQ),             mve_vmulh),
+ mToC("vrmulh",          ee011e01,     3, (RMQ, RMQ, RMQ),             mve_vmulh),
+ mToC("vpnot",   fe310f4d,     0, (),                          mve_vpnot),
+ mToC("vpsel",   fe310f01,     3, (RMQ, RMQ, RMQ),             mve_vpsel),
+
+ mToC("vqdmladh",  ee000e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmladhx", ee001e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmladh", ee000e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmladhx",ee001e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmlsdh",  fe000e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmlsdhx", fe001e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmlsdh", fe000e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmlsdhx",fe001e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmlah",   ee000e60,   3, (RMQ, RMQ, RR),              mve_vqdmlah),
+ mToC("vqdmlash",  ee001e60,   3, (RMQ, RMQ, RR),              mve_vqdmlah),
+ mToC("vqrdmlash", ee001e40,   3, (RMQ, RMQ, RR),              mve_vqdmlah),
+ mToC("vqdmullt",  ee301f00,   3, (RMQ, RMQ, RMQRR),           mve_vqdmull),
+ mToC("vqdmullb",  ee300f00,   3, (RMQ, RMQ, RMQRR),           mve_vqdmull),
+ mCEF(vqmovnt,   _vqmovnt,     2, (RMQ, RMQ),                  mve_vqmovn),
+ mCEF(vqmovnb,   _vqmovnb,     2, (RMQ, RMQ),                  mve_vqmovn),
+ mCEF(vqmovunt,          _vqmovunt,    2, (RMQ, RMQ),                  mve_vqmovn),
+ mCEF(vqmovunb,          _vqmovunb,    2, (RMQ, RMQ),                  mve_vqmovn),
+
+ mCEF(vshrnt,    _vshrnt,      3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vshrnb,    _vshrnb,      3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vrshrnt,   _vrshrnt,     3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vrshrnb,   _vrshrnb,     3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrnt,   _vqrshrnt,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrnb,   _vqrshrnb,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrunt,          _vqrshrunt,   3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrunb,          _vqrshrunb,   3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrnt,          _vqrshrnt,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrnb,          _vqrshrnb,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrunt,  _vqrshrunt,  3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrunb,  _vqrshrunb,  3, (RMQ, RMQ, I32z),    mve_vshrn),
+
+ mToC("vshlc",     eea00fc0,      3, (RMQ, RR, I32z),      mve_vshlc),
+ mToC("vshllt",            ee201e00,      3, (RMQ, RMQ, I32),      mve_vshll),
+ mToC("vshllb",            ee200e00,      3, (RMQ, RMQ, I32),      mve_vshll),
+
+ toU("dlstp",  _dlstp, 2, (LR, RR),      t_loloop),
+ toU("wlstp",  _wlstp, 3, (LR, RR, EXP), t_loloop),
+ toU("letp",   _letp,  2, (LR, EXP),     t_loloop),
+ toU("lctp",   _lctp,  0, (),            t_loloop),
+
+#undef THUMB_VARIANT
+#define THUMB_VARIANT & mve_fp_ext
+ mToC("vcmul", ee300e00,   4, (RMQ, RMQ, RMQ, EXPi),             mve_vcmul),
+ mToC("vfmas", ee311e40,   3, (RMQ, RMQ, RR),                    mve_vfmas),
+ mToC("vmaxnma", ee3f0e81, 2, (RMQ, RMQ),                        mve_vmaxnma_vminnma),
+ mToC("vminnma", ee3f1e81, 2, (RMQ, RMQ),                        mve_vmaxnma_vminnma),
+ mToC("vmaxnmv", eeee0f00, 2, (RR, RMQ),                         mve_vmaxnmv),
+ mToC("vmaxnmav",eeec0f00, 2, (RR, RMQ),                         mve_vmaxnmv),
+ mToC("vminnmv", eeee0f80, 2, (RR, RMQ),                         mve_vmaxnmv),
+ mToC("vminnmav",eeec0f80, 2, (RR, RMQ),                         mve_vmaxnmv),
+
  #undef  ARM_VARIANT
-#define ARM_VARIANT    & fpu_vfp_ext_v1xd
+#define ARM_VARIANT  & fpu_vfp_ext_v1
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT  & arm_ext_v6t2
+ mnCEF(vmla,     _vmla,    3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mac_maybe_scalar),
+ mnCEF(vmul,     _vmul,    3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mul),
+
+ mcCE(fcpyd,   eb00b40, 2, (RVD, RVD),       vfp_dp_rd_rm),
+
+#undef  ARM_VARIANT
+#define ARM_VARIANT  & fpu_vfp_ext_v1xd
+
+ MNCE(vmov,   0,       1, (VMOV),            neon_mov),
+ mcCE(fmrs,    e100a10, 2, (RR, RVS),        vfp_reg_from_sp),
+ mcCE(fmsr,    e000a10, 2, (RVS, RR),        vfp_sp_from_reg),
+ mcCE(fcpys,   eb00a40, 2, (RVS, RVS),       vfp_sp_monadic),
  
   mCEF(vmullt, _vmullt, 3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ),  mve_vmull),
   mnCEF(vadd,  _vadd,   3, (RNSDQMQ, oRNSDQMQ, RNSDQMQR),       neon_addsub_if_i),
@@ -23394,19 +25543,75 @@ static const struct asm_opcode insns[] =
   MNCEF(vabs,  1b10300, 2, (RNSDQMQ, RNSDQMQ),  neon_abs_neg),
   MNCEF(vneg,  1b10380, 2, (RNSDQMQ, RNSDQMQ),  neon_abs_neg),
  
+ mCEF(vmovlt, _vmovlt, 1, (VMOV),              mve_movl),
+ mCEF(vmovlb, _vmovlb, 1, (VMOV),              mve_movl),
+
+ mnCE(vcmp,      _vcmp,    3, (RVSD_COND, RSVDMQ_FI0, oRMQRZ),    vfp_nsyn_cmp),
+ mnCE(vcmpe,     _vcmpe,   3, (RVSD_COND, RSVDMQ_FI0, oRMQRZ),    vfp_nsyn_cmp),
+
+#undef  ARM_VARIANT
+#define ARM_VARIANT  & fpu_vfp_ext_v2
+
+ mcCE(fmsrr,   c400a10, 3, (VRSLST, RR, RR), vfp_sp2_from_reg2),
+ mcCE(fmrrs,   c500a10, 3, (RR, RR, VRSLST), vfp_reg2_from_sp2),
+ mcCE(fmdrr,   c400b10, 3, (RVD, RR, RR),    vfp_dp_rm_rd_rn),
+ mcCE(fmrrd,   c500b10, 3, (RR, RR, RVD),    vfp_dp_rd_rn_rm),
+
  #undef  ARM_VARIANT
  #define ARM_VARIANT    & fpu_vfp_ext_armv8xd
   mnUF(vcvta,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),         neon_cvta),
   mnUF(vcvtp,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),         neon_cvtp),
   mnUF(vcvtn,  _vcvta,  3, (RNSDQMQ, oRNSDQMQ, oI32z),  neon_cvtn),
   mnUF(vcvtm,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),         neon_cvtm),
+ mnUF(vmaxnm, _vmaxnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),        vmaxnm),
+ mnUF(vminnm, _vminnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),        vmaxnm),
  
  #undef ARM_VARIANT
  #define ARM_VARIANT & fpu_neon_ext_v1
- mnUF(vabd,      _vabd,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ mnUF(vabd,      _vabd,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
   mnUF(vabdl,     _vabdl,         3, (RNQMQ, RNDMQ, RNDMQ),   neon_dyadic_long),
   mnUF(vaddl,     _vaddl,         3, (RNQMQ, RNDMQ, RNDMQR),  neon_dyadic_long),
   mnUF(vsubl,     _vsubl,         3, (RNQMQ, RNDMQ, RNDMQR),  neon_dyadic_long),
+ mnUF(vand,      _vand,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vbic,      _vbic,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorr,      _vorr,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorn,      _vorn,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(veor,      _veor,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_logic),
+ MNUF(vcls,      1b00400,        2, (RNDQMQ, RNDQMQ),               neon_cls),
+ MNUF(vclz,      1b00480,        2, (RNDQMQ, RNDQMQ),               neon_clz),
+ mnCE(vdup,      _vdup,                  2, (RNDQMQ, RR_RNSC),              neon_dup),
+ MNUF(vhadd,     00000000,       3, (RNDQMQ, oRNDQMQ, RNDQMQR),  neon_dyadic_i_su),
+ MNUF(vrhadd,    00000100,       3, (RNDQMQ, oRNDQMQ, RNDQMQ),   neon_dyadic_i_su),
+ MNUF(vhsub,     00000200,       3, (RNDQMQ, oRNDQMQ, RNDQMQR),  neon_dyadic_i_su),
+ mnUF(vmin,      _vmin,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ mnUF(vmax,      _vmax,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ MNUF(vqadd,     0000010,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_dyadic_i64_su),
+ MNUF(vqsub,     0000210,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_dyadic_i64_su),
+ mnUF(vmvn,      _vmvn,    2, (RNDQMQ, RNDQMQ_Ibig), neon_mvn),
+ MNUF(vqabs,     1b00700,  2, (RNDQMQ, RNDQMQ),     neon_sat_abs_neg),
+ MNUF(vqneg,     1b00780,  2, (RNDQMQ, RNDQMQ),     neon_sat_abs_neg),
+ mnUF(vqrdmlah,  _vqrdmlah,3, (RNDQMQ, oRNDQMQ, RNDQ_RNSC_RR), neon_qrdmlah),
+ mnUF(vqdmulh,   _vqdmulh, 3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ mnUF(vqrdmulh,  _vqrdmulh,3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ MNUF(vqrshl,    0000510,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_rshl),
+ MNUF(vrshl,     0000500,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_rshl),
+ MNUF(vshr,      0800010,  3, (RNDQMQ, oRNDQMQ, I64z), neon_rshift_round_imm),
+ MNUF(vrshr,     0800210,  3, (RNDQMQ, oRNDQMQ, I64z), neon_rshift_round_imm),
+ MNUF(vsli,      1800510,  3, (RNDQMQ, oRNDQMQ, I63),  neon_sli),
+ MNUF(vsri,      1800410,  3, (RNDQMQ, oRNDQMQ, I64z), neon_sri),
+ MNUF(vrev64,    1b00000,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ MNUF(vrev32,    1b00080,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ MNUF(vrev16,    1b00100,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ mnUF(vshl,     _vshl,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_shl),
+ mnUF(vqshl,     _vqshl,   3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_qshl),
+ MNUF(vqshlu,    1800610,  3, (RNDQMQ, oRNDQMQ, I63),           neon_qshlu_imm),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT & arm_ext_v8_3
+#undef THUMB_VARIANT
+#define        THUMB_VARIANT & arm_ext_v6t2_v8m
+ MNUF (vcadd, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ, EXPi), vcadd),
+ MNUF (vcmla, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ_RNSC, EXPi), vcmla),
  };
  #undef ARM_VARIANT
  #undef THUMB_VARIANT
@@ -25559,11 +27764,12 @@ md_apply_fix (fixS *  fixP,
        break;
  
      case BFD_RELOC_ARM_SMC:
-      if (((unsigned long) value) > 0xffff)
+      if (((unsigned long) value) > 0xf)
         as_bad_where (fixP->fx_file, fixP->fx_line,
                       _("invalid smc expression"));
+
        newval = md_chars_to_number (buf, INSN_SIZE);
-      newval |= (value & 0xf) | ((value & 0xfff0) << 4);
+      newval |= (value & 0xf);
        md_number_to_chars (buf, newval, INSN_SIZE);
        break;
  
@@ -26704,9 +28910,10 @@ md_apply_fix (fixS *   fixP,
         }
  
        bfd_vma insn = get_thumb32_insn (buf);
-      /* le lr, <label> or le <label> */
+      /* "le lr, <label>", "le <label>" or "letp lr, <label>".  */
        if (((insn & 0xffffffff) == 0xf00fc001)
-         || ((insn & 0xffffffff) == 0xf02fc001))
+         || ((insn & 0xffffffff) == 0xf02fc001)
+         || ((insn & 0xffffffff) == 0xf01fc001))
         value = -value;
  
        if (v8_1_branch_value_check (value, 12, FALSE) == FAIL)