x86: Display default x86-specific options for "as --help"
[binutils-gdb.git] / gas / config / tc-i386.c
index f9dccdbd9e91ccf3a7809fe478bab34af14cd313..ddbffb95b113b705d410f5859ba745711cb506eb 100644 (file)
@@ -81,9 +81,6 @@
 #define SHORT_MNEM_SUFFIX 's'
 #define LONG_MNEM_SUFFIX  'l'
 #define QWORD_MNEM_SUFFIX  'q'
-#define XMMWORD_MNEM_SUFFIX  'x'
-#define YMMWORD_MNEM_SUFFIX 'y'
-#define ZMMWORD_MNEM_SUFFIX 'z'
 /* Intel Syntax.  Use a non-ascii letter since since it never appears
    in instructions.  */
 #define LONG_DOUBLE_MNEM_SUFFIX '\1'
@@ -228,11 +225,14 @@ static struct Mask_Operation mask_op;
    broadcast factor.  */
 struct Broadcast_Operation
 {
-  /* Type of broadcast: no broadcast, {1to8}, or {1to16}.  */
+  /* Type of broadcast: {1to2}, {1to4}, {1to8}, or {1to16}.  */
   int type;
 
   /* Index of broadcasted operand.  */
   int operand;
+
+  /* Number of bytes to broadcast.  */
+  int bytes;
 };
 
 static struct Broadcast_Operation broadcast_op;
@@ -265,7 +265,6 @@ enum i386_error
     number_of_operands_mismatch,
     invalid_instruction_suffix,
     bad_imm4,
-    old_gcc_only,
     unsupported_with_intel_mnemonic,
     unsupported_syntax,
     unsupported,
@@ -273,7 +272,6 @@ enum i386_error
     invalid_vector_register_set,
     unsupported_vector_index_register,
     unsupported_broadcast,
-    broadcast_not_on_src_operand,
     broadcast_needed,
     unsupported_masking,
     mask_not_on_destination,
@@ -311,6 +309,7 @@ struct _i386_insn
     /* Flags for operands.  */
     unsigned int flags[MAX_OPERANDS];
 #define Operand_PCrel 1
+#define Operand_Mem   2
 
     /* Relocation type for operand */
     enum bfd_reloc_code_real reloc[MAX_OPERANDS];
@@ -436,7 +435,6 @@ const char extra_symbol_chars[] = "*%-([{}"
         && !defined (TE_GNU)                           \
         && !defined (TE_LINUX)                         \
         && !defined (TE_NACL)                          \
-        && !defined (TE_NETWARE)                       \
         && !defined (TE_FreeBSD)                       \
         && !defined (TE_DragonFly)                     \
         && !defined (TE_NetBSD)))
@@ -565,9 +563,6 @@ static int intel64;
    0 if att mnemonic.  */
 static int intel_mnemonic = !SYSV386_COMPAT;
 
-/* 1 if support old (<= 2.8.1) versions of gcc.  */
-static int old_gcc = OLDGCC_COMPAT;
-
 /* 1 if pseudo registers are permitted.  */
 static int allow_pseudo_reg = 0;
 
@@ -851,6 +846,8 @@ static const arch_entry cpu_arch[] =
     CPU_BDVER4_FLAGS, 0 },
   { STRING_COMMA_LEN ("znver1"), PROCESSOR_ZNVER,
     CPU_ZNVER1_FLAGS, 0 },
+  { STRING_COMMA_LEN ("znver2"), PROCESSOR_ZNVER,
+    CPU_ZNVER2_FLAGS, 0 },
   { STRING_COMMA_LEN ("btver1"), PROCESSOR_BT,
     CPU_BTVER1_FLAGS, 0 },
   { STRING_COMMA_LEN ("btver2"), PROCESSOR_BT,
@@ -1033,6 +1030,14 @@ static const arch_entry cpu_arch[] =
     CPU_WBNOINVD_FLAGS, 0 },
   { STRING_COMMA_LEN (".pconfig"), PROCESSOR_UNKNOWN,
     CPU_PCONFIG_FLAGS, 0 },
+  { STRING_COMMA_LEN (".waitpkg"), PROCESSOR_UNKNOWN,
+    CPU_WAITPKG_FLAGS, 0 },
+  { STRING_COMMA_LEN (".cldemote"), PROCESSOR_UNKNOWN,
+    CPU_CLDEMOTE_FLAGS, 0 },
+  { STRING_COMMA_LEN (".movdiri"), PROCESSOR_UNKNOWN,
+    CPU_MOVDIRI_FLAGS, 0 },
+  { STRING_COMMA_LEN (".movdir64b"), PROCESSOR_UNKNOWN,
+    CPU_MOVDIR64B_FLAGS, 0 },
 };
 
 static const noarch_entry cpu_noarch[] =
@@ -1068,6 +1073,8 @@ static const noarch_entry cpu_noarch[] =
   { STRING_COMMA_LEN ("noavx512_bitalg"), CPU_ANY_AVX512_BITALG_FLAGS },
   { STRING_COMMA_LEN ("noibt"), CPU_ANY_IBT_FLAGS },
   { STRING_COMMA_LEN ("noshstk"), CPU_ANY_SHSTK_FLAGS },
+  { STRING_COMMA_LEN ("nomovdiri"), CPU_ANY_MOVDIRI_FLAGS },
+  { STRING_COMMA_LEN ("nomovdir64b"), CPU_ANY_MOVDIR64B_FLAGS },
 };
 
 #ifdef I386COFF
@@ -1179,63 +1186,27 @@ static const unsigned char f32_3[] =
   {0x8d,0x76,0x00};                    /* leal 0(%esi),%esi    */
 static const unsigned char f32_4[] =
   {0x8d,0x74,0x26,0x00};               /* leal 0(%esi,1),%esi  */
-static const unsigned char f32_5[] =
-  {0x90,                               /* nop                  */
-   0x8d,0x74,0x26,0x00};               /* leal 0(%esi,1),%esi  */
 static const unsigned char f32_6[] =
   {0x8d,0xb6,0x00,0x00,0x00,0x00};     /* leal 0L(%esi),%esi   */
 static const unsigned char f32_7[] =
   {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};        /* leal 0L(%esi,1),%esi */
-static const unsigned char f32_8[] =
-  {0x90,                               /* nop                  */
-   0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};        /* leal 0L(%esi,1),%esi */
-static const unsigned char f32_9[] =
-  {0x89,0xf6,                          /* movl %esi,%esi       */
-   0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};        /* leal 0L(%edi,1),%edi */
-static const unsigned char f32_10[] =
-  {0x8d,0x76,0x00,                     /* leal 0(%esi),%esi    */
-   0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};        /* leal 0L(%edi,1),%edi */
-static const unsigned char f32_11[] =
-  {0x8d,0x74,0x26,0x00,                        /* leal 0(%esi,1),%esi  */
-   0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};        /* leal 0L(%edi,1),%edi */
-static const unsigned char f32_12[] =
-  {0x8d,0xb6,0x00,0x00,0x00,0x00,      /* leal 0L(%esi),%esi   */
-   0x8d,0xbf,0x00,0x00,0x00,0x00};     /* leal 0L(%edi),%edi   */
-static const unsigned char f32_13[] =
-  {0x8d,0xb6,0x00,0x00,0x00,0x00,      /* leal 0L(%esi),%esi   */
-   0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};        /* leal 0L(%edi,1),%edi */
-static const unsigned char f32_14[] =
-  {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00, /* leal 0L(%esi,1),%esi */
-   0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};        /* leal 0L(%edi,1),%edi */
 static const unsigned char f16_3[] =
-  {0x8d,0x74,0x00};                    /* lea 0(%esi),%esi     */
+  {0x8d,0x74,0x00};                    /* lea 0(%si),%si       */
 static const unsigned char f16_4[] =
-  {0x8d,0xb4,0x00,0x00};               /* lea 0w(%si),%si      */
-static const unsigned char f16_5[] =
-  {0x90,                               /* nop                  */
-   0x8d,0xb4,0x00,0x00};               /* lea 0w(%si),%si      */
-static const unsigned char f16_6[] =
-  {0x89,0xf6,                          /* mov %si,%si          */
-   0x8d,0xbd,0x00,0x00};               /* lea 0w(%di),%di      */
-static const unsigned char f16_7[] =
-  {0x8d,0x74,0x00,                     /* lea 0(%si),%si       */
-   0x8d,0xbd,0x00,0x00};               /* lea 0w(%di),%di      */
-static const unsigned char f16_8[] =
-  {0x8d,0xb4,0x00,0x00,                        /* lea 0w(%si),%si      */
-   0x8d,0xbd,0x00,0x00};               /* lea 0w(%di),%di      */
-static const unsigned char jump_31[] =
-  {0xeb,0x1d,0x90,0x90,0x90,0x90,0x90, /* jmp .+31; lotsa nops */
-   0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,
-   0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,
-   0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
+  {0x8d,0xb4,0x00,0x00};               /* lea 0W(%si),%si      */
+static const unsigned char jump_disp8[] =
+  {0xeb};                              /* jmp disp8           */
+static const unsigned char jump32_disp32[] =
+  {0xe9};                              /* jmp disp32          */
+static const unsigned char jump16_disp32[] =
+  {0x66,0xe9};                         /* jmp disp32          */
 /* 32-bit NOPs patterns.  */
 static const unsigned char *const f32_patt[] = {
-  f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
-  f32_9, f32_10, f32_11, f32_12, f32_13, f32_14
+  f32_1, f32_2, f32_3, f32_4, NULL, f32_6, f32_7
 };
 /* 16-bit NOPs patterns.  */
 static const unsigned char *const f16_patt[] = {
-  f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8
+  f32_1, f32_2, f16_3, f16_4
 };
 /* nopl (%[re]ax) */
 static const unsigned char alt_3[] =
@@ -1261,18 +1232,13 @@ static const unsigned char alt_9[] =
 /* nopw %cs:0L(%[re]ax,%[re]ax,1) */
 static const unsigned char alt_10[] =
   {0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
+/* data16 nopw %cs:0L(%eax,%eax,1) */
+static const unsigned char alt_11[] =
+  {0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
 /* 32-bit and 64-bit NOPs patterns.  */
 static const unsigned char *const alt_patt[] = {
   f32_1, f32_2, alt_3, alt_4, alt_5, alt_6, alt_7, alt_8,
-  alt_9, alt_10
-};
-/* 64-bit only: nopw %cs:0L(%eax,%eax,1) */
-static const unsigned char alt64_11[] =
-  {0x67,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
-/* 64-bit NOPs patterns.  */
-static const unsigned char *const alt64_patt[] = {
-  f32_1, f32_2, alt_3, alt_4, alt_5, alt_6, alt_7, alt_8,
-  alt_9, alt_10, alt64_11
+  alt_9, alt_10, alt_11
 };
 
 /* Genenerate COUNT bytes of NOPs to WHERE from PATT with the maximum
@@ -1283,63 +1249,73 @@ i386_output_nops (char *where, const unsigned char *const *patt,
                  int count, int max_single_nop_size)
 
 {
-  while (count > max_single_nop_size)
+  /* Place the longer NOP first.  */
+  int last;
+  int offset;
+  const unsigned char *nops =  patt[max_single_nop_size - 1];
+
+  /* Use the smaller one if the requsted one isn't available.  */
+  if (nops == NULL)
     {
-      count -= max_single_nop_size;
-      memcpy (where + count, patt[max_single_nop_size - 1],
-             max_single_nop_size);
+      max_single_nop_size--;
+      nops = patt[max_single_nop_size - 1];
     }
 
-  if (count)
-    memcpy (where, patt[count - 1], count);
+  last = count % max_single_nop_size;
+
+  count -= last;
+  for (offset = 0; offset < count; offset += max_single_nop_size)
+    memcpy (where + offset, nops, max_single_nop_size);
+
+  if (last)
+    {
+      nops = patt[last - 1];
+      if (nops == NULL)
+       {
+         /* Use the smaller one plus one-byte NOP if the needed one
+            isn't available.  */
+         last--;
+         nops = patt[last - 1];
+         memcpy (where + offset, nops, last);
+         where[offset + last] = *patt[0];
+       }
+      else
+       memcpy (where + offset, nops, last);
+    }
 }
 
+static INLINE int
+fits_in_imm7 (offsetT num)
+{
+  return (num & 0x7f) == num;
+}
+
+static INLINE int
+fits_in_imm31 (offsetT num)
+{
+  return (num & 0x7fffffff) == num;
+}
 
 /* Genenerate COUNT bytes of NOPs to WHERE with the maximum size of a
    single NOP instruction LIMIT.  */
 
 void
-i386_generate_nops (fragS *f, char *where, offsetT count, int limit)
+i386_generate_nops (fragS *fragP, char *where, offsetT count, int limit)
 {
-  /* Output NOPs for .nop directive.  */
+  const unsigned char *const *patt = NULL;
   int max_single_nop_size;
-  const unsigned char *const *patt;
+  /* Maximum number of NOPs before switching to jump over NOPs.  */
+  int max_number_of_nops;
 
-  if (flag_code == CODE_16BIT)
-    {
-      patt = f16_patt;
-      max_single_nop_size = sizeof (f16_patt) / sizeof (f16_patt[0]);
-    }
-  else if (flag_code == CODE_64BIT)
+  switch (fragP->fr_type)
     {
-      patt = alt64_patt;
-      max_single_nop_size = sizeof (alt64_patt) / sizeof (alt64_patt[0]);
-    }
-  else
-    {
-      patt = alt_patt;
-      max_single_nop_size = sizeof (alt_patt) / sizeof (alt_patt[0]);
-    }
-  if (limit == 0)
-    limit = max_single_nop_size;
-  else if (limit > max_single_nop_size)
-    {
-      as_bad_where (f->fr_file, f->fr_line,
-                   _("invalide single nop size: %d (expect within [0, %d])"),
-                   limit, max_single_nop_size);
+    case rs_fill_nop:
+    case rs_align_code:
+      break;
+    default:
       return;
     }
 
-  i386_output_nops (where, patt, count, limit);
-}
-
-void
-i386_align_code (fragS *fragP, int count)
-{
-  /* Only align for at least a positive non-zero boundary. */
-  if (count <= 0 || count > MAX_MEM_FOR_RS_ALIGN_CODE)
-    return;
-
   /* We need to decide which NOP sequence to use for 32bit and
      64bit. When -mtune= is used:
 
@@ -1356,21 +1332,13 @@ i386_align_code (fragS *fragP, int count)
 
   if (flag_code == CODE_16BIT)
     {
-      if (count > 8)
-       {
-         memcpy (fragP->fr_literal + fragP->fr_fix,
-                 jump_31, count);
-         /* Adjust jump offset.  */
-         fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
-       }
-      else
-       memcpy (fragP->fr_literal + fragP->fr_fix,
-               f16_patt[count - 1], count);
+      patt = f16_patt;
+      max_single_nop_size = sizeof (f16_patt) / sizeof (f16_patt[0]);
+      /* Limit number of NOPs to 2 in 16-bit mode.  */
+      max_number_of_nops = 2;
     }
   else
     {
-      const unsigned char *const *patt = NULL;
-
       if (fragP->tc_frag_data.isa == PROCESSOR_UNKNOWN)
        {
          /* PROCESSOR_UNKNOWN means that all ISAs may be used.  */
@@ -1461,38 +1429,79 @@ i386_align_code (fragS *fragP, int count)
 
       if (patt == f32_patt)
        {
-         /* If the padding is less than 15 bytes, we use the normal
-            ones.  Otherwise, we use a jump instruction and adjust
-            its offset.   */
-         int limit;
+         max_single_nop_size = sizeof (f32_patt) / sizeof (f32_patt[0]);
+         /* Limit number of NOPs to 2 for older processors.  */
+         max_number_of_nops = 2;
+       }
+      else
+       {
+         max_single_nop_size = sizeof (alt_patt) / sizeof (alt_patt[0]);
+         /* Limit number of NOPs to 7 for newer processors.  */
+         max_number_of_nops = 7;
+       }
+    }
 
-         /* For 64bit, the limit is 3 bytes.  */
-         if (flag_code == CODE_64BIT
-             && fragP->tc_frag_data.isa_flags.bitfield.cpulm)
-           limit = 3;
-         else
-           limit = 15;
-         if (count < limit)
-           memcpy (fragP->fr_literal + fragP->fr_fix,
-                   patt[count - 1], count);
-         else
-           {
-             memcpy (fragP->fr_literal + fragP->fr_fix,
-                     jump_31, count);
-             /* Adjust jump offset.  */
-             fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
-           }
+  if (limit == 0)
+    limit = max_single_nop_size;
+
+  if (fragP->fr_type == rs_fill_nop)
+    {
+      /* Output NOPs for .nop directive.  */
+      if (limit > max_single_nop_size)
+       {
+         as_bad_where (fragP->fr_file, fragP->fr_line,
+                       _("invalid single nop size: %d "
+                         "(expect within [0, %d])"),
+                       limit, max_single_nop_size);
+         return;
+       }
+    }
+  else
+    fragP->fr_var = count;
+
+  if ((count / max_single_nop_size) > max_number_of_nops)
+    {
+      /* Generate jump over NOPs.  */
+      offsetT disp = count - 2;
+      if (fits_in_imm7 (disp))
+       {
+         /* Use "jmp disp8" if possible.  */
+         count = disp;
+         where[0] = jump_disp8[0];
+         where[1] = count;
+         where += 2;
        }
       else
        {
-         /* Maximum length of an instruction is 10 byte.  If the
-            padding is greater than 10 bytes and we don't use jump,
-            we have to break it into smaller pieces.  */
-         i386_output_nops (fragP->fr_literal + fragP->fr_fix,
-                           patt, count, 10);
+         unsigned int size_of_jump;
+
+         if (flag_code == CODE_16BIT)
+           {
+             where[0] = jump16_disp32[0];
+             where[1] = jump16_disp32[1];
+             size_of_jump = 2;
+           }
+         else
+           {
+             where[0] = jump32_disp32[0];
+             size_of_jump = 1;
+           }
+
+         count -= size_of_jump + 4;
+         if (!fits_in_imm31 (count))
+           {
+             as_bad_where (fragP->fr_file, fragP->fr_line,
+                           _("jump over nop padding out of range"));
+             return;
+           }
+
+         md_number_to_chars (where + size_of_jump, count, 4);
+         where += size_of_jump + 4;
        }
     }
-  fragP->fr_var = count;
+
+  /* Generate multiple NOPs.  */
+  i386_output_nops (where, patt, count, limit);
 }
 
 static INLINE int
@@ -1685,15 +1694,9 @@ cpu_flags_and_not (i386_cpu_flags x, i386_cpu_flags y)
 
 #define CPU_FLAGS_ARCH_MATCH           0x1
 #define CPU_FLAGS_64BIT_MATCH          0x2
-#define CPU_FLAGS_AES_MATCH            0x4
-#define CPU_FLAGS_PCLMUL_MATCH         0x8
-#define CPU_FLAGS_AVX_MATCH           0x10
 
-#define CPU_FLAGS_32BIT_MATCH \
-  (CPU_FLAGS_ARCH_MATCH | CPU_FLAGS_AES_MATCH \
-   | CPU_FLAGS_PCLMUL_MATCH | CPU_FLAGS_AVX_MATCH)
 #define CPU_FLAGS_PERFECT_MATCH \
-  (CPU_FLAGS_32BIT_MATCH | CPU_FLAGS_64BIT_MATCH)
+  (CPU_FLAGS_ARCH_MATCH | CPU_FLAGS_64BIT_MATCH)
 
 /* Return CPU flags match bits. */
 
@@ -1709,55 +1712,42 @@ cpu_flags_match (const insn_template *t)
   if (cpu_flags_all_zero (&x))
     {
       /* This instruction is available on all archs.  */
-      match |= CPU_FLAGS_32BIT_MATCH;
+      match |= CPU_FLAGS_ARCH_MATCH;
     }
   else
     {
       /* This instruction is available only on some archs.  */
       i386_cpu_flags cpu = cpu_arch_flags;
 
+      /* AVX512VL is no standalone feature - match it and then strip it.  */
+      if (x.bitfield.cpuavx512vl && !cpu.bitfield.cpuavx512vl)
+       return match;
+      x.bitfield.cpuavx512vl = 0;
+
       cpu = cpu_flags_and (x, cpu);
       if (!cpu_flags_all_zero (&cpu))
        {
          if (x.bitfield.cpuavx)
            {
-             /* We only need to check AES/PCLMUL/SSE2AVX with AVX.  */
-             if (cpu.bitfield.cpuavx)
-               {
-                 /* Check SSE2AVX.  */
-                 if (!t->opcode_modifier.sse2avx|| sse2avx)
-                   {
-                     match |= (CPU_FLAGS_ARCH_MATCH
-                               | CPU_FLAGS_AVX_MATCH);
-                     /* Check AES.  */
-                     if (!x.bitfield.cpuaes || cpu.bitfield.cpuaes)
-                       match |= CPU_FLAGS_AES_MATCH;
-                     /* Check PCLMUL.  */
-                     if (!x.bitfield.cpupclmul
-                         || cpu.bitfield.cpupclmul)
-                       match |= CPU_FLAGS_PCLMUL_MATCH;
-                   }
-               }
-             else
+             /* We need to check a few extra flags with AVX.  */
+             if (cpu.bitfield.cpuavx
+                 && (!t->opcode_modifier.sse2avx || sse2avx)
+                 && (!x.bitfield.cpuaes || cpu.bitfield.cpuaes)
+                 && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
+                 && (!x.bitfield.cpupclmul || cpu.bitfield.cpupclmul))
                match |= CPU_FLAGS_ARCH_MATCH;
            }
-         else if (x.bitfield.cpuavx512vl)
+         else if (x.bitfield.cpuavx512f)
            {
-             /* Match AVX512VL.  */
-             if (cpu.bitfield.cpuavx512vl)
-               {
-                 /* Need another match.  */
-                 cpu.bitfield.cpuavx512vl = 0;
-                 if (!cpu_flags_all_zero (&cpu))
-                   match |= CPU_FLAGS_32BIT_MATCH;
-                 else
-                   match |= CPU_FLAGS_ARCH_MATCH;
-               }
-             else
+             /* We need to check a few extra flags with AVX512F.  */
+             if (cpu.bitfield.cpuavx512f
+                 && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
+                 && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
+                 && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
                match |= CPU_FLAGS_ARCH_MATCH;
            }
          else
-           match |= CPU_FLAGS_32BIT_MATCH;
+           match |= CPU_FLAGS_ARCH_MATCH;
        }
     }
   return match;
@@ -1783,6 +1773,26 @@ operand_type_and (i386_operand_type x, i386_operand_type y)
   return x;
 }
 
+static INLINE i386_operand_type
+operand_type_and_not (i386_operand_type x, i386_operand_type y)
+{
+  switch (ARRAY_SIZE (x.array))
+    {
+    case 3:
+      x.array [2] &= ~y.array [2];
+      /* Fall through.  */
+    case 2:
+      x.array [1] &= ~y.array [1];
+      /* Fall through.  */
+    case 1:
+      x.array [0] &= ~y.array [0];
+      break;
+    default:
+      abort ();
+    }
+  return x;
+}
+
 static INLINE i386_operand_type
 operand_type_or (i386_operand_type x, i386_operand_type y)
 {
@@ -1825,11 +1835,6 @@ operand_type_xor (i386_operand_type x, i386_operand_type y)
 
 static const i386_operand_type acc32 = OPERAND_TYPE_ACC32;
 static const i386_operand_type acc64 = OPERAND_TYPE_ACC64;
-static const i386_operand_type control = OPERAND_TYPE_CONTROL;
-static const i386_operand_type inoutportreg
-  = OPERAND_TYPE_INOUTPORTREG;
-static const i386_operand_type reg16_inoutportreg
-  = OPERAND_TYPE_REG16_INOUTPORTREG;
 static const i386_operand_type disp16 = OPERAND_TYPE_DISP16;
 static const i386_operand_type disp32 = OPERAND_TYPE_DISP32;
 static const i386_operand_type disp32s = OPERAND_TYPE_DISP32S;
@@ -1895,71 +1900,81 @@ operand_type_check (i386_operand_type t, enum operand_type c)
   return 0;
 }
 
-/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit size
+   between operand GIVEN and opeand WANTED for instruction template T.  */
 
 static INLINE int
-match_reg_size (const insn_template *t, unsigned int j)
+match_operand_size (const insn_template *t, unsigned int wanted,
+                   unsigned int given)
 {
-  return !((i.types[j].bitfield.byte
-           && !t->operand_types[j].bitfield.byte)
-          || (i.types[j].bitfield.word
-              && !t->operand_types[j].bitfield.word)
-          || (i.types[j].bitfield.dword
-              && !t->operand_types[j].bitfield.dword)
-          || (i.types[j].bitfield.qword
-              && !t->operand_types[j].bitfield.qword)
-          || (i.types[j].bitfield.tbyte
-              && !t->operand_types[j].bitfield.tbyte));
+  return !((i.types[given].bitfield.byte
+           && !t->operand_types[wanted].bitfield.byte)
+          || (i.types[given].bitfield.word
+              && !t->operand_types[wanted].bitfield.word)
+          || (i.types[given].bitfield.dword
+              && !t->operand_types[wanted].bitfield.dword)
+          || (i.types[given].bitfield.qword
+              && !t->operand_types[wanted].bitfield.qword)
+          || (i.types[given].bitfield.tbyte
+              && !t->operand_types[wanted].bitfield.tbyte));
 }
 
-/* Return 1 if there is no conflict in SIMD register on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in SIMD register between operand
+   GIVEN and opeand WANTED for instruction template T.  */
 
 static INLINE int
-match_simd_size (const insn_template *t, unsigned int j)
+match_simd_size (const insn_template *t, unsigned int wanted,
+                unsigned int given)
 {
-  return !((i.types[j].bitfield.xmmword
-           && !t->operand_types[j].bitfield.xmmword)
-          || (i.types[j].bitfield.ymmword
-              && !t->operand_types[j].bitfield.ymmword)
-          || (i.types[j].bitfield.zmmword
-              && !t->operand_types[j].bitfield.zmmword));
+  return !((i.types[given].bitfield.xmmword
+           && !t->operand_types[wanted].bitfield.xmmword)
+          || (i.types[given].bitfield.ymmword
+              && !t->operand_types[wanted].bitfield.ymmword)
+          || (i.types[given].bitfield.zmmword
+              && !t->operand_types[wanted].bitfield.zmmword));
 }
 
-/* Return 1 if there is no conflict in any size on operand J for
-   instruction template T.  */
+/* Return 1 if there is no conflict in any size between operand GIVEN
+   and opeand WANTED for instruction template T.  */
 
 static INLINE int
-match_mem_size (const insn_template *t, unsigned int j)
+match_mem_size (const insn_template *t, unsigned int wanted,
+               unsigned int given)
 {
-  return (match_reg_size (t, j)
-         && !((i.types[j].bitfield.unspecified
+  return (match_operand_size (t, wanted, given)
+         && !((i.types[given].bitfield.unspecified
                && !i.broadcast
-               && !t->operand_types[j].bitfield.unspecified)
-              || (i.types[j].bitfield.fword
-                  && !t->operand_types[j].bitfield.fword)
+               && !t->operand_types[wanted].bitfield.unspecified)
+              || (i.types[given].bitfield.fword
+                  && !t->operand_types[wanted].bitfield.fword)
               /* For scalar opcode templates to allow register and memory
                  operands at the same time, some special casing is needed
-                 here.  */
-              || ((t->operand_types[j].bitfield.regsimd
+                 here.  Also for v{,p}broadcast*, {,v}pmov{s,z}*, and
+                 down-conversion vpmov*.  */
+              || ((t->operand_types[wanted].bitfield.regsimd
                    && !t->opcode_modifier.broadcast
-                   && (t->operand_types[j].bitfield.dword
-                       || t->operand_types[j].bitfield.qword))
-                  ? (i.types[j].bitfield.xmmword
-                     || i.types[j].bitfield.ymmword
-                     || i.types[j].bitfield.zmmword)
-                  : !match_simd_size(t, j))));
+                   && (t->operand_types[wanted].bitfield.byte
+                       || t->operand_types[wanted].bitfield.word
+                       || t->operand_types[wanted].bitfield.dword
+                       || t->operand_types[wanted].bitfield.qword))
+                  ? (i.types[given].bitfield.xmmword
+                     || i.types[given].bitfield.ymmword
+                     || i.types[given].bitfield.zmmword)
+                  : !match_simd_size(t, wanted, given))));
 }
 
-/* Return 1 if there is no size conflict on any operands for
-   instruction template T.  */
+/* Return value has MATCH_STRAIGHT set if there is no size conflict on any
+   operands for instruction template T, and it has MATCH_REVERSE set if there
+   is no size conflict on any operands for the template with operands reversed
+   (and the template allows for reversing in the first place).  */
 
-static INLINE int
+#define MATCH_STRAIGHT 1
+#define MATCH_REVERSE  2
+
+static INLINE unsigned int
 operand_size_match (const insn_template *t)
 {
-  unsigned int j;
-  int match = 1;
+  unsigned int j, match = MATCH_STRAIGHT;
 
   /* Don't check jump instructions.  */
   if (t->opcode_modifier.jump
@@ -1976,59 +1991,56 @@ operand_size_match (const insn_template *t)
        continue;
 
       if (t->operand_types[j].bitfield.reg
-         && !match_reg_size (t, j))
+         && !match_operand_size (t, j, j))
        {
          match = 0;
          break;
        }
 
       if (t->operand_types[j].bitfield.regsimd
-         && !match_simd_size (t, j))
+         && !match_simd_size (t, j, j))
        {
          match = 0;
          break;
        }
 
       if (t->operand_types[j].bitfield.acc
-         && (!match_reg_size (t, j) || !match_simd_size (t, j)))
+         && (!match_operand_size (t, j, j) || !match_simd_size (t, j, j)))
        {
          match = 0;
          break;
        }
 
-      if (i.types[j].bitfield.mem && !match_mem_size (t, j))
+      if ((i.flags[j] & Operand_Mem) && !match_mem_size (t, j, j))
        {
          match = 0;
          break;
        }
     }
 
-  if (match)
-    return match;
-  else if (!t->opcode_modifier.d && !t->opcode_modifier.floatd)
+  if (!t->opcode_modifier.d)
     {
 mismatch:
-      i.error = operand_size_mismatch;
-      return 0;
+      if (!match)
+       i.error = operand_size_mismatch;
+      return match;
     }
 
   /* Check reverse.  */
   gas_assert (i.operands == 2);
 
-  match = 1;
   for (j = 0; j < 2; j++)
     {
       if ((t->operand_types[j].bitfield.reg
           || t->operand_types[j].bitfield.acc)
-         && !match_reg_size (t, j ? 0 : 1))
+         && !match_operand_size (t, j, !j))
        goto mismatch;
 
-      if (i.types[j].bitfield.mem
-         && !match_mem_size (t, j ? 0 : 1))
+      if ((i.flags[!j] & Operand_Mem) && !match_mem_size (t, j, !j))
        goto mismatch;
     }
 
-  return match;
+  return match | MATCH_REVERSE;
 }
 
 static INLINE int
@@ -2204,18 +2216,6 @@ fits_in_imm4 (offsetT num)
   return (num & 0xf) == num;
 }
 
-static INLINE int
-fits_in_imm7 (offsetT num)
-{
-  return (num & 0x7f) == num;
-}
-
-static INLINE int
-fits_in_imm31 (offsetT num)
-{
-  return (num & 0x7fffffff) == num;
-}
-
 static i386_operand_type
 smallest_imm_type (offsetT num)
 {
@@ -2333,8 +2333,9 @@ add_prefix (unsigned int prefix)
       && flag_code == CODE_64BIT)
     {
       if ((i.prefix[REX_PREFIX] & prefix & REX_W)
-         || ((i.prefix[REX_PREFIX] & (REX_R | REX_X | REX_B))
-             && (prefix & (REX_R | REX_X | REX_B))))
+         || (i.prefix[REX_PREFIX] & prefix & REX_R)
+         || (i.prefix[REX_PREFIX] & prefix & REX_X)
+         || (i.prefix[REX_PREFIX] & prefix & REX_B))
        ret = PREFIX_EXIST;
       q = REX_PREFIX;
     }
@@ -2638,6 +2639,10 @@ set_cpu_arch (int dummy ATTRIBUTE_UNUSED)
                  cpu_arch_flags = flags;
                  cpu_arch_isa_flags = flags;
                }
+             else
+               cpu_arch_isa_flags
+                 = cpu_flags_or (cpu_arch_isa_flags,
+                                 cpu_arch[j].flags);
              (void) restore_line_pointer (e);
              demand_empty_rest_of_line ();
              return;
@@ -3362,8 +3367,10 @@ build_vex_prefix (const insn_template *t)
     {
       unsigned int op;
 
+      /* Determine vector length from the last multi-length vector
+        operand.  */
       vector_length = 0;
-      for (op = 0; op < t->operands; ++op)
+      for (op = t->operands; op--;)
        if (t->operand_types[op].bitfield.xmmword
            && t->operand_types[op].bitfield.ymmword
            && i.types[op].bitfield.ymmword)
@@ -3463,6 +3470,21 @@ build_vex_prefix (const insn_template *t)
     }
 }
 
+static INLINE bfd_boolean
+is_evex_encoding (const insn_template *t)
+{
+  return t->opcode_modifier.evex || t->opcode_modifier.disp8memshift
+        || t->opcode_modifier.broadcast || t->opcode_modifier.masking
+        || t->opcode_modifier.staticrounding || t->opcode_modifier.sae;
+}
+
+static INLINE bfd_boolean
+is_any_vex_encoding (const insn_template *t)
+{
+  return t->opcode_modifier.vex || t->opcode_modifier.vexopcode
+        || is_evex_encoding (t);
+}
+
 /* Build the EVEX prefix.  */
 
 static void
@@ -3597,6 +3619,58 @@ build_evex_prefix (void)
       /* Encode the vector length.  */
       unsigned int vec_length;
 
+      if (!i.tm.opcode_modifier.evex
+         || i.tm.opcode_modifier.evex == EVEXDYN)
+       {
+         unsigned int op;
+
+         /* Determine vector length from the last multi-length vector
+            operand.  */
+         vec_length = 0;
+         for (op = i.operands; op--;)
+           if (i.tm.operand_types[op].bitfield.xmmword
+               + i.tm.operand_types[op].bitfield.ymmword
+               + i.tm.operand_types[op].bitfield.zmmword > 1)
+             {
+               if (i.types[op].bitfield.zmmword)
+                 {
+                   i.tm.opcode_modifier.evex = EVEX512;
+                   break;
+                 }
+               else if (i.types[op].bitfield.ymmword)
+                 {
+                   i.tm.opcode_modifier.evex = EVEX256;
+                   break;
+                 }
+               else if (i.types[op].bitfield.xmmword)
+                 {
+                   i.tm.opcode_modifier.evex = EVEX128;
+                   break;
+                 }
+               else if (i.broadcast && (int) op == i.broadcast->operand)
+                 {
+                   switch (i.broadcast->bytes)
+                     {
+                       case 64:
+                         i.tm.opcode_modifier.evex = EVEX512;
+                         break;
+                       case 32:
+                         i.tm.opcode_modifier.evex = EVEX256;
+                         break;
+                       case 16:
+                         i.tm.opcode_modifier.evex = EVEX128;
+                         break;
+                       default:
+                         abort ();
+                     }
+                   break;
+                 }
+             }
+
+         if (op >= MAX_OPERANDS)
+           abort ();
+       }
+
       switch (i.tm.opcode_modifier.evex)
        {
        case EVEXLIG: /* LL' is ignored */
@@ -3693,8 +3767,7 @@ bad_register_operand:
 
   gas_assert (i.imm_operands <= 1
              && (i.operands <= 2
-                 || ((i.tm.opcode_modifier.vex
-                      || i.tm.opcode_modifier.evex)
+                 || (is_any_vex_encoding (&i.tm)
                      && i.operands <= 4)));
 
   exp = &im_expressions[i.imm_operands++];
@@ -3785,7 +3858,8 @@ optimize_encoding (void)
        }
     }
   else if (flag_code == CODE_64BIT
-          && ((i.reg_operands == 1
+          && ((i.types[1].bitfield.qword
+               && i.reg_operands == 1
                && i.imm_operands == 1
                && i.op[0].imms->X_op == O_constant
                && ((i.tm.base_opcode == 0xb0
@@ -3800,12 +3874,16 @@ optimize_encoding (void)
                            || ((i.tm.base_opcode == 0xf6
                                 || i.tm.base_opcode == 0xc6)
                                && i.tm.extension_opcode == 0x0)))))
-              || (i.reg_operands == 2
-                  && i.op[0].regs == i.op[1].regs
-                  && ((i.tm.base_opcode == 0x30
-                       || i.tm.base_opcode == 0x28)
-                      && i.tm.extension_opcode == None)))
-          && i.types[1].bitfield.qword)
+              || (i.types[0].bitfield.qword
+                  && ((i.reg_operands == 2
+                       && i.op[0].regs == i.op[1].regs
+                       && ((i.tm.base_opcode == 0x30
+                            || i.tm.base_opcode == 0x28)
+                           && i.tm.extension_opcode == None))
+                      || (i.reg_operands == 1
+                          && i.operands == 1
+                          && i.tm.base_opcode == 0x30
+                          && i.tm.extension_opcode == None)))))
     {
       /* Optimize: -O:
           andq $imm31, %r64   -> andl $imm31, %r32
@@ -3847,20 +3925,33 @@ optimize_encoding (void)
           && i.op[0].regs == i.op[1].regs
           && !i.types[2].bitfield.xmmword
           && (i.tm.opcode_modifier.vex
-              || (!i.mask
+              || ((!i.mask || i.mask->zeroing)
                   && !i.rounding
-                  && i.tm.opcode_modifier.evex
-                  && cpu_arch_flags.bitfield.cpuavx512vl))
+                  && is_evex_encoding (&i.tm)
+                  && (i.vec_encoding != vex_encoding_evex
+                      || i.tm.cpu_flags.bitfield.cpuavx512vl
+                      || (i.tm.operand_types[2].bitfield.zmmword
+                          && i.types[2].bitfield.ymmword)
+                      || cpu_arch_isa_flags.bitfield.cpuavx512vl)))
           && ((i.tm.base_opcode == 0x55
                || i.tm.base_opcode == 0x6655
                || i.tm.base_opcode == 0x66df
                || i.tm.base_opcode == 0x57
                || i.tm.base_opcode == 0x6657
-               || i.tm.base_opcode == 0x66ef)
+               || i.tm.base_opcode == 0x66ef
+               || i.tm.base_opcode == 0x66f8
+               || i.tm.base_opcode == 0x66f9
+               || i.tm.base_opcode == 0x66fa
+               || i.tm.base_opcode == 0x66fb
+               || i.tm.base_opcode == 0x42
+               || i.tm.base_opcode == 0x6642
+               || i.tm.base_opcode == 0x47
+               || i.tm.base_opcode == 0x6647)
               && i.tm.extension_opcode == None))
     {
       /* Optimize: -O2:
-          VOP, one of vandnps, vandnpd, vxorps and vxorpd:
+          VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,
+          vpsubq and vpsubw:
             EVEX VOP %zmmM, %zmmM, %zmmN
               -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
               -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
@@ -3886,16 +3977,16 @@ optimize_encoding (void)
             EVEX VOP %ymmM, %ymmM, %ymmN
               -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
               -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+          VOP, one of kxord and kxorq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kxorw %kM, %kM, %kN
+          VOP, one of kandnd and kandnq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kandnw %kM, %kM, %kN
        */
-      if (i.tm.opcode_modifier.evex)
+      if (is_evex_encoding (&i.tm))
        {
-         /* If only lower 16 vector registers are used, we can use
-            VEX encoding.  */
-         for (j = 0; j < 3; j++)
-           if (register_number (i.op[j].regs) > 15)
-             break;
-
-         if (j < 3)
+         if (i.vec_encoding == vex_encoding_evex)
            i.tm.opcode_modifier.evex = EVEX128;
          else
            {
@@ -3904,6 +3995,11 @@ optimize_encoding (void)
              i.tm.opcode_modifier.evex = 0;
            }
        }
+      else if (i.tm.operand_types[0].bitfield.regmask)
+       {
+         i.tm.base_opcode &= 0xff;
+         i.tm.opcode_modifier.vexw = VEXW0;
+       }
       else
        i.tm.opcode_modifier.vex = VEX128;
 
@@ -3993,12 +4089,16 @@ md_assemble (char *line)
 
   if (sse_check != check_none
       && !i.tm.opcode_modifier.noavx
+      && !i.tm.cpu_flags.bitfield.cpuavx
       && (i.tm.cpu_flags.bitfield.cpusse
          || i.tm.cpu_flags.bitfield.cpusse2
          || i.tm.cpu_flags.bitfield.cpusse3
          || i.tm.cpu_flags.bitfield.cpussse3
          || i.tm.cpu_flags.bitfield.cpusse4_1
-         || i.tm.cpu_flags.bitfield.cpusse4_2))
+         || i.tm.cpu_flags.bitfield.cpusse4_2
+         || i.tm.cpu_flags.bitfield.cpupclmul
+         || i.tm.cpu_flags.bitfield.cpuaes
+         || i.tm.cpu_flags.bitfield.cpugfni))
     {
       (sse_check == check_warning
        ? as_warn
@@ -4045,6 +4145,13 @@ md_assemble (char *line)
       return;
     }
 
+  /* Check for data size prefix on VEX/XOP/EVEX encoded insns.  */
+  if (i.prefix[DATA_PREFIX] && is_any_vex_encoding (&i.tm))
+    {
+      as_bad (_("data size prefix invalid with `%s'"), i.tm.name);
+      return;
+    }
+
   /* Check if HLE prefix is OK.  */
   if (i.hle_prefix && !check_hle ())
     return;
@@ -4068,10 +4175,16 @@ md_assemble (char *line)
     }
 
   /* Insert BND prefix.  */
-  if (add_bnd_prefix
-      && i.tm.opcode_modifier.bndprefixok
-      && !i.prefix[BND_PREFIX])
-    add_prefix (BND_PREFIX_OPCODE);
+  if (add_bnd_prefix && i.tm.opcode_modifier.bndprefixok)
+    {
+      if (!i.prefix[BND_PREFIX])
+       add_prefix (BND_PREFIX_OPCODE);
+      else if (i.prefix[BND_PREFIX] != BND_PREFIX_OPCODE)
+       {
+         as_warn (_("replacing `rep'/`repe' prefix by `bnd'"));
+         i.prefix[BND_PREFIX] = BND_PREFIX_OPCODE;
+       }
+    }
 
   /* Check string instruction segment overrides.  */
   if (i.tm.opcode_modifier.isstring && i.mem_operands != 0)
@@ -4125,7 +4238,7 @@ md_assemble (char *line)
       as_warn (_("translating to `%sp'"), i.tm.name);
     }
 
-  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.evex)
+  if (is_any_vex_encoding (&i.tm))
     {
       if (flag_code == CODE_16BIT)
        {
@@ -4487,34 +4600,26 @@ check_suffix:
     {
       supported |= cpu_flags_match (t);
       if (supported == CPU_FLAGS_PERFECT_MATCH)
-       goto skip;
-    }
+       {
+         if (!cpu_arch_flags.bitfield.cpui386 && (flag_code != CODE_16BIT))
+           as_warn (_("use .code16 to ensure correct addressing mode"));
 
-  if (!(supported & CPU_FLAGS_64BIT_MATCH))
-    {
-      as_bad (flag_code == CODE_64BIT
-             ? _("`%s' is not supported in 64-bit mode")
-             : _("`%s' is only supported in 64-bit mode"),
-             current_templates->start->name);
-      return NULL;
-    }
-  if (supported != CPU_FLAGS_PERFECT_MATCH)
-    {
-      as_bad (_("`%s' is not supported on `%s%s'"),
-             current_templates->start->name,
-             cpu_arch_name ? cpu_arch_name : default_arch,
-             cpu_sub_arch_name ? cpu_sub_arch_name : "");
-      return NULL;
+         return l;
+       }
     }
 
-skip:
-  if (!cpu_arch_flags.bitfield.cpui386
-          && (flag_code != CODE_16BIT))
-    {
-      as_warn (_("use .code16 to ensure correct addressing mode"));
-    }
+  if (!(supported & CPU_FLAGS_64BIT_MATCH))
+    as_bad (flag_code == CODE_64BIT
+           ? _("`%s' is not supported in 64-bit mode")
+           : _("`%s' is only supported in 64-bit mode"),
+           current_templates->start->name);
+  else
+    as_bad (_("`%s' is not supported on `%s%s'"),
+           current_templates->start->name,
+           cpu_arch_name ? cpu_arch_name : default_arch,
+           cpu_sub_arch_name ? cpu_sub_arch_name : "");
 
-  return l;
+  return NULL;
 }
 
 static char *
@@ -4596,6 +4701,13 @@ parse_operands (char *l, const char *mnemonic)
          /* Now parse operand adding info to 'i' as we go along.  */
          END_STRING_AND_SAVE (l);
 
+         if (i.mem_operands > 1)
+           {
+             as_bad (_("too many memory references for `%s'"),
+                     mnemonic);
+             return 0;
+           }
+
          if (intel_syntax)
            operand_ok =
              i386_intel_operand (token_start,
@@ -4641,14 +4753,21 @@ swap_2_operands (int xchg1, int xchg2)
 {
   union i386_op temp_op;
   i386_operand_type temp_type;
+  unsigned int temp_flags;
   enum bfd_reloc_code_real temp_reloc;
 
   temp_type = i.types[xchg2];
   i.types[xchg2] = i.types[xchg1];
   i.types[xchg1] = temp_type;
+
+  temp_flags = i.flags[xchg2];
+  i.flags[xchg2] = i.flags[xchg1];
+  i.flags[xchg1] = temp_flags;
+
   temp_op = i.op[xchg2];
   i.op[xchg2] = i.op[xchg1];
   i.op[xchg1] = temp_op;
+
   temp_reloc = i.reloc[xchg2];
   i.reloc[xchg2] = i.reloc[xchg1];
   i.reloc[xchg1] = temp_reloc;
@@ -4931,12 +5050,52 @@ optimize_disp (void)
       }
 }
 
+/* Return 1 if there is a match in broadcast bytes between operand
+   GIVEN and instruction template T.   */
+
+static INLINE int
+match_broadcast_size (const insn_template *t, unsigned int given)
+{
+  return ((t->opcode_modifier.broadcast == BYTE_BROADCAST
+          && i.types[given].bitfield.byte)
+         || (t->opcode_modifier.broadcast == WORD_BROADCAST
+             && i.types[given].bitfield.word)
+         || (t->opcode_modifier.broadcast == DWORD_BROADCAST
+             && i.types[given].bitfield.dword)
+         || (t->opcode_modifier.broadcast == QWORD_BROADCAST
+             && i.types[given].bitfield.qword));
+}
+
 /* Check if operands are valid for the instruction.  */
 
 static int
 check_VecOperands (const insn_template *t)
 {
   unsigned int op;
+  i386_cpu_flags cpu;
+  static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
+
+  /* Templates allowing for ZMMword as well as YMMword and/or XMMword for
+     any one operand are implicity requiring AVX512VL support if the actual
+     operand size is YMMword or XMMword.  Since this function runs after
+     template matching, there's no need to check for YMMword/XMMword in
+     the template.  */
+  cpu = cpu_flags_and (t->cpu_flags, avx512);
+  if (!cpu_flags_all_zero (&cpu)
+      && !t->cpu_flags.bitfield.cpuavx512vl
+      && !cpu_arch_flags.bitfield.cpuavx512vl)
+    {
+      for (op = 0; op < t->operands; ++op)
+       {
+         if (t->operand_types[op].bitfield.zmmword
+             && (i.types[op].bitfield.ymmword
+                 || i.types[op].bitfield.xmmword))
+           {
+             i.error = unsupported;
+             return 1;
+           }
+       }
+    }
 
   /* Without VSIB byte, we can't have a vector register for index.  */
   if (!t->opcode_modifier.vecsib
@@ -5022,42 +5181,67 @@ check_VecOperands (const insn_template *t)
      to the memory operand.  */
   if (i.broadcast)
     {
-      int broadcasted_opnd_size;
+      i386_operand_type type, overlap;
 
       /* Check if specified broadcast is supported in this instruction,
-        and it's applied to memory operand of DWORD or QWORD type,
-        depending on VecESize.  */
-      if (i.broadcast->type != t->opcode_modifier.broadcast
-         || !i.types[i.broadcast->operand].bitfield.mem
-         || (t->opcode_modifier.vecesize == 0
-             && !i.types[i.broadcast->operand].bitfield.dword
-             && !i.types[i.broadcast->operand].bitfield.unspecified)
-         || (t->opcode_modifier.vecesize == 1
-             && !i.types[i.broadcast->operand].bitfield.qword
-             && !i.types[i.broadcast->operand].bitfield.unspecified))
-       goto bad_broadcast;
-
-      broadcasted_opnd_size = t->opcode_modifier.vecesize ? 64 : 32;
-      if (i.broadcast->type == BROADCAST_1TO16)
-       broadcasted_opnd_size <<= 4; /* Broadcast 1to16.  */
-      else if (i.broadcast->type == BROADCAST_1TO8)
-       broadcasted_opnd_size <<= 3; /* Broadcast 1to8.  */
-      else if (i.broadcast->type == BROADCAST_1TO4)
-       broadcasted_opnd_size <<= 2; /* Broadcast 1to4.  */
-      else if (i.broadcast->type == BROADCAST_1TO2)
-       broadcasted_opnd_size <<= 1; /* Broadcast 1to2.  */
-      else
-       goto bad_broadcast;
-
-      if ((broadcasted_opnd_size == 256
-          && !t->operand_types[i.broadcast->operand].bitfield.ymmword)
-         || (broadcasted_opnd_size == 512
-             && !t->operand_types[i.broadcast->operand].bitfield.zmmword))
+        and its broadcast bytes match the memory operand.  */
+      op = i.broadcast->operand;
+      if (!t->opcode_modifier.broadcast
+         || !(i.flags[op] & Operand_Mem)
+         || (!i.types[op].bitfield.unspecified
+             && !match_broadcast_size (t, op)))
        {
        bad_broadcast:
          i.error = unsupported_broadcast;
          return 1;
        }
+
+      i.broadcast->bytes = ((1 << (t->opcode_modifier.broadcast - 1))
+                           * i.broadcast->type);
+      operand_type_set (&type, 0);
+      switch (i.broadcast->bytes)
+       {
+       case 2:
+         type.bitfield.word = 1;
+         break;
+       case 4:
+         type.bitfield.dword = 1;
+         break;
+       case 8:
+         type.bitfield.qword = 1;
+         break;
+       case 16:
+         type.bitfield.xmmword = 1;
+         break;
+       case 32:
+         type.bitfield.ymmword = 1;
+         break;
+       case 64:
+         type.bitfield.zmmword = 1;
+         break;
+       default:
+         goto bad_broadcast;
+       }
+
+      overlap = operand_type_and (type, t->operand_types[op]);
+      if (operand_type_all_zero (&overlap))
+         goto bad_broadcast;
+
+      if (t->opcode_modifier.checkregsize)
+       {
+         unsigned int j;
+
+         type.bitfield.baseindex = 1;
+         for (j = 0; j < i.operands; ++j)
+           {
+             if (j != op
+                 && !operand_type_register_match(i.types[j],
+                                                 t->operand_types[j],
+                                                 type,
+                                                 t->operand_types[op]))
+               goto bad_broadcast;
+           }
+       }
     }
   /* If broadcast is supported in this instruction, we need to check if
      operand of one-element size isn't specified without broadcast.  */
@@ -5069,24 +5253,49 @@ check_VecOperands (const insn_template *t)
          break;
       gas_assert (op < i.operands);
       /* Check size of the memory operand.  */
-      if ((t->opcode_modifier.vecesize == 0
-          && i.types[op].bitfield.dword)
-         || (t->opcode_modifier.vecesize == 1
-             && i.types[op].bitfield.qword))
+      if (match_broadcast_size (t, op))
        {
          i.error = broadcast_needed;
          return 1;
        }
     }
+  else
+    op = MAX_OPERANDS - 1; /* Avoid uninitialized variable warning.  */
 
   /* Check if requested masking is supported.  */
-  if (i.mask
-      && (!t->opcode_modifier.masking
-         || (i.mask->zeroing
-             && t->opcode_modifier.masking == MERGING_MASKING)))
+  if (i.mask)
     {
-      i.error = unsupported_masking;
-      return 1;
+      switch (t->opcode_modifier.masking)
+       {
+       case BOTH_MASKING:
+         break;
+       case MERGING_MASKING:
+         if (i.mask->zeroing)
+           {
+       case 0:
+             i.error = unsupported_masking;
+             return 1;
+           }
+         break;
+       case DYNAMIC_MASKING:
+         /* Memory destinations allow only merging masking.  */
+         if (i.mask->zeroing && i.mem_operands)
+           {
+             /* Find memory operand.  */
+             for (op = 0; op < i.operands; op++)
+               if (i.flags[op] & Operand_Mem)
+                 break;
+             gas_assert (op < i.operands);
+             if (op == i.operands - 1)
+               {
+                 i.error = unsupported_masking;
+                 return 1;
+               }
+           }
+         break;
+       default:
+         abort ();
+       }
     }
 
   /* Check if masking is applied to dest operand.  */
@@ -5124,9 +5333,51 @@ check_VecOperands (const insn_template *t)
       && i.disp_encoding != disp_encoding_32bit)
     {
       if (i.broadcast)
-       i.memshift = t->opcode_modifier.vecesize ? 3 : 2;
-      else
+       i.memshift = t->opcode_modifier.broadcast - 1;
+      else if (t->opcode_modifier.disp8memshift != DISP8_SHIFT_VL)
        i.memshift = t->opcode_modifier.disp8memshift;
+      else
+       {
+         const i386_operand_type *type = NULL;
+
+         i.memshift = 0;
+         for (op = 0; op < i.operands; op++)
+           if (operand_type_check (i.types[op], anymem))
+             {
+               if (t->opcode_modifier.evex == EVEXLIG)
+                 i.memshift = 2 + (i.suffix == QWORD_MNEM_SUFFIX);
+               else if (t->operand_types[op].bitfield.xmmword
+                        + t->operand_types[op].bitfield.ymmword
+                        + t->operand_types[op].bitfield.zmmword <= 1)
+                 type = &t->operand_types[op];
+               else if (!i.types[op].bitfield.unspecified)
+                 type = &i.types[op];
+             }
+           else if (i.types[op].bitfield.regsimd
+                    && t->opcode_modifier.evex != EVEXLIG)
+             {
+               if (i.types[op].bitfield.zmmword)
+                 i.memshift = 6;
+               else if (i.types[op].bitfield.ymmword && i.memshift < 5)
+                 i.memshift = 5;
+               else if (i.types[op].bitfield.xmmword && i.memshift < 4)
+                 i.memshift = 4;
+             }
+
+         if (type)
+           {
+             if (type->bitfield.zmmword)
+               i.memshift = 6;
+             else if (type->bitfield.ymmword)
+               i.memshift = 5;
+             else if (type->bitfield.xmmword)
+               i.memshift = 4;
+           }
+
+         /* For the check in fits_in_disp8().  */
+         if (i.memshift == 0)
+           i.memshift = -1;
+       }
 
       for (op = 0; op < i.operands; op++)
        if (operand_type_check (i.types[op], disp)
@@ -5155,7 +5406,7 @@ VEX_check_operands (const insn_template *t)
   if (i.vec_encoding == vex_encoding_evex)
     {
       /* This instruction must be encoded with EVEX prefix.  */
-      if (!t->opcode_modifier.evex)
+      if (!is_evex_encoding (t))
        {
          i.error = unsupported;
          return 1;
@@ -5203,7 +5454,7 @@ match_template (char mnem_suffix)
   i386_operand_type operand_types [MAX_OPERANDS];
   int addr_prefix_disp;
   unsigned int j;
-  unsigned int found_cpu_match;
+  unsigned int found_cpu_match, size_match;
   unsigned int check_register;
   enum i386_error specific_error = 0;
 
@@ -5215,7 +5466,9 @@ match_template (char mnem_suffix)
   addr_prefix_disp = -1;
 
   memset (&suffix_check, 0, sizeof (suffix_check));
-  if (i.suffix == BYTE_MNEM_SUFFIX)
+  if (intel_syntax && i.broadcast)
+    /* nothing */;
+  else if (i.suffix == BYTE_MNEM_SUFFIX)
     suffix_check.no_bsuf = 1;
   else if (i.suffix == WORD_MNEM_SUFFIX)
     suffix_check.no_wsuf = 1;
@@ -5258,11 +5511,6 @@ match_template (char mnem_suffix)
       if (!found_cpu_match)
        continue;
 
-      /* Check old gcc support. */
-      i.error = old_gcc_only;
-      if (!old_gcc && t->opcode_modifier.oldgcc)
-       continue;
-
       /* Check AT&T mnemonic.   */
       i.error = unsupported_with_intel_mnemonic;
       if (intel_mnemonic && t->opcode_modifier.attmnemonic)
@@ -5295,7 +5543,8 @@ match_template (char mnem_suffix)
          || (t->opcode_modifier.no_ldsuf && mnemsuf_check.no_ldsuf))
        continue;
 
-      if (!operand_size_match (t))
+      size_match = operand_size_match (t);
+      if (!size_match)
        continue;
 
       for (j = 0; j < MAX_OPERANDS; j++)
@@ -5306,6 +5555,7 @@ match_template (char mnem_suffix)
          && flag_code != CODE_64BIT
          && (intel_syntax
              ? (!t->opcode_modifier.ignoresize
+                && !t->opcode_modifier.broadcast
                 && !intel_float_operand (t->name))
              : intel_float_operand (t->name) != 2)
          && ((!operand_types[0].bitfield.regmmx
@@ -5388,7 +5638,15 @@ match_template (char mnem_suffix)
        continue;
 
       /* We check register size if needed.  */
-      check_register = t->opcode_modifier.checkregsize;
+      if (t->opcode_modifier.checkregsize)
+       {
+         check_register = (1 << t->operands) - 1;
+         if (i.broadcast)
+           check_register &= ~(1 << i.broadcast->operand);
+       }
+      else
+       check_register = 0;
+
       overlap0 = operand_type_and (i.types[0], operand_types[0]);
       switch (t->operands)
        {
@@ -5406,6 +5664,16 @@ match_template (char mnem_suffix)
              && operand_type_equal (&i.types [0], &acc32)
              && operand_type_equal (&i.types [1], &acc32))
            continue;
+         /* xrelease mov %eax, <disp> is another special case. It must not
+            match the accumulator-only encoding of mov.  */
+         if (flag_code != CODE_64BIT
+             && i.hle_prefix
+             && t->base_opcode == 0xa0
+             && i.types[0].bitfield.acc
+             && operand_type_check (i.types[1], anymem))
+           continue;
+         if (!(size_match & MATCH_STRAIGHT))
+           goto check_reverse;
          /* If we want store form, we reverse direction of operands.  */
          if (i.dir_encoding == dir_encoding_store
              && t->opcode_modifier.d)
@@ -5424,17 +5692,19 @@ match_template (char mnem_suffix)
          overlap1 = operand_type_and (i.types[1], operand_types[1]);
          if (!operand_type_match (overlap0, i.types[0])
              || !operand_type_match (overlap1, i.types[1])
-             || (check_register
+             || ((check_register & 3) == 3
                  && !operand_type_register_match (i.types[0],
                                                   operand_types[0],
                                                   i.types[1],
                                                   operand_types[1])))
            {
              /* Check if other direction is valid ...  */
-             if (!t->opcode_modifier.d && !t->opcode_modifier.floatd)
+             if (!t->opcode_modifier.d)
                continue;
 
 check_reverse:
+             if (!(size_match & MATCH_REVERSE))
+               continue;
              /* Try reversing direction of operands.  */
              overlap0 = operand_type_and (i.types[0], operand_types[1]);
              overlap1 = operand_type_and (i.types[1], operand_types[0]);
@@ -5449,14 +5719,14 @@ check_reverse:
                  /* Does not match either direction.  */
                  continue;
                }
-             /* found_reverse_match holds which of D or FloatDR
+             /* found_reverse_match holds which of D or FloatR
                 we've found.  */
-             if (t->opcode_modifier.d)
-               found_reverse_match = Opcode_D;
-             else if (t->opcode_modifier.floatd)
+             if (!t->opcode_modifier.d)
+               found_reverse_match = 0;
+             else if (operand_types[0].bitfield.tbyte)
                found_reverse_match = Opcode_FloatD;
              else
-               found_reverse_match = 0;
+               found_reverse_match = Opcode_D;
              if (t->opcode_modifier.floatr)
                found_reverse_match |= Opcode_FloatR;
            }
@@ -5491,24 +5761,32 @@ check_reverse:
                  /* Fall through.  */
                case 4:
                  if (!operand_type_match (overlap3, i.types[3])
-                     || (check_register
+                     || ((check_register & 0xa) == 0xa
+                         && !operand_type_register_match (i.types[1],
+                                                           operand_types[1],
+                                                           i.types[3],
+                                                           operand_types[3]))
+                     || ((check_register & 0xc) == 0xc
                          && !operand_type_register_match (i.types[2],
-                                                          operand_types[2],
-                                                          i.types[3],
-                                                          operand_types[3])))
+                                                           operand_types[2],
+                                                           i.types[3],
+                                                           operand_types[3])))
                    continue;
                  /* Fall through.  */
                case 3:
                  /* Here we make use of the fact that there are no
-                    reverse match 3 operand instructions, and all 3
-                    operand instructions only need to be checked for
-                    register consistency between operands 2 and 3.  */
+                    reverse match 3 operand instructions.  */
                  if (!operand_type_match (overlap2, i.types[2])
-                     || (check_register
+                     || ((check_register & 5) == 5
+                         && !operand_type_register_match (i.types[0],
+                                                           operand_types[0],
+                                                           i.types[2],
+                                                           operand_types[2]))
+                     || ((check_register & 6) == 6
                          && !operand_type_register_match (i.types[1],
-                                                          operand_types[1],
-                                                          i.types[2],
-                                                          operand_types[2])))
+                                                           operand_types[1],
+                                                           i.types[2],
+                                                           operand_types[2])))
                    continue;
                  break;
                }
@@ -5559,9 +5837,6 @@ check_reverse:
        case bad_imm4:
          err_msg = _("constant doesn't fit in 4 bits");
          break;
-       case old_gcc_only:
-         err_msg = _("only supported with old gcc");
-         break;
        case unsupported_with_intel_mnemonic:
          err_msg = _("unsupported with Intel mnemonic");
          break;
@@ -5584,9 +5859,6 @@ check_reverse:
        case unsupported_broadcast:
          err_msg = _("unsupported broadcast");
          break;
-       case broadcast_not_on_src_operand:
-         err_msg = _("broadcast not on source memory operand");
-         break;
        case broadcast_needed:
          err_msg = _("broadcast is needed for operand of such type");
          break;
@@ -5744,26 +6016,19 @@ process_suffix (void)
                if (!i.tm.operand_types[op].bitfield.inoutportreg
                    && !i.tm.operand_types[op].bitfield.shiftcount)
                  {
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.byte)
-                     {
-                       i.suffix = BYTE_MNEM_SUFFIX;
-                       break;
-                     }
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.word)
-                     {
-                       i.suffix = WORD_MNEM_SUFFIX;
-                       break;
-                     }
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.dword)
-                     {
-                       i.suffix = LONG_MNEM_SUFFIX;
-                       break;
-                     }
-                   if (i.types[op].bitfield.reg && i.types[op].bitfield.qword)
-                     {
-                       i.suffix = QWORD_MNEM_SUFFIX;
-                       break;
-                     }
+                   if (!i.types[op].bitfield.reg)
+                     continue;
+                   if (i.types[op].bitfield.byte)
+                     i.suffix = BYTE_MNEM_SUFFIX;
+                   else if (i.types[op].bitfield.word)
+                     i.suffix = WORD_MNEM_SUFFIX;
+                   else if (i.types[op].bitfield.dword)
+                     i.suffix = LONG_MNEM_SUFFIX;
+                   else if (i.types[op].bitfield.qword)
+                     i.suffix = QWORD_MNEM_SUFFIX;
+                   else
+                     continue;
+                   break;
                  }
            }
        }
@@ -5780,7 +6045,9 @@ process_suffix (void)
        {
          if (intel_syntax
              && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_lsuf)
+             && i.tm.opcode_modifier.no_lsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
            i.suffix = 0;
          else if (!check_long_reg ())
            return 0;
@@ -5789,7 +6056,9 @@ process_suffix (void)
        {
          if (intel_syntax
              && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_qsuf)
+             && i.tm.opcode_modifier.no_qsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
            i.suffix = 0;
          else if (!check_qword_reg ())
            return 0;
@@ -5803,13 +6072,6 @@ process_suffix (void)
          else if (!check_word_reg ())
            return 0;
        }
-      else if (i.suffix == XMMWORD_MNEM_SUFFIX
-              || i.suffix == YMMWORD_MNEM_SUFFIX
-              || i.suffix == ZMMWORD_MNEM_SUFFIX)
-       {
-         /* Skip if the instruction has x/y/z suffix.  match_template
-            should check if it is a valid suffix.  */
-       }
       else if (intel_syntax && i.tm.opcode_modifier.ignoresize)
        /* Do nothing if the instruction is going to ignore the prefix.  */
        ;
@@ -5890,15 +6152,19 @@ process_suffix (void)
        }
     }
 
-  /* Change the opcode based on the operand size given by i.suffix;
-     We don't need to change things for byte insns.  */
-
-  if (i.suffix
-      && i.suffix != BYTE_MNEM_SUFFIX
-      && i.suffix != XMMWORD_MNEM_SUFFIX
-      && i.suffix != YMMWORD_MNEM_SUFFIX
-      && i.suffix != ZMMWORD_MNEM_SUFFIX)
+  /* Change the opcode based on the operand size given by i.suffix.  */
+  switch (i.suffix)
     {
+    /* Size floating point instruction.  */
+    case LONG_MNEM_SUFFIX:
+      if (i.tm.opcode_modifier.floatmf)
+       {
+         i.tm.base_opcode ^= 4;
+         break;
+       }
+    /* fall through */
+    case WORD_MNEM_SUFFIX:
+    case QWORD_MNEM_SUFFIX:
       /* It's not a byte, select word/dword operation.  */
       if (i.tm.opcode_modifier.w)
        {
@@ -5907,25 +6173,32 @@ process_suffix (void)
          else
            i.tm.base_opcode |= 1;
        }
-
+    /* fall through */
+    case SHORT_MNEM_SUFFIX:
       /* Now select between word & dword operations via the operand
         size prefix, except for instructions that will ignore this
         prefix anyway.  */
-      if (i.tm.opcode_modifier.addrprefixop0)
+      if (i.reg_operands > 0
+         && i.types[0].bitfield.reg
+         && i.tm.opcode_modifier.addrprefixopreg
+         && (i.tm.opcode_modifier.immext
+             || i.operands == 1))
        {
          /* The address size override prefix changes the size of the
             first operand.  */
          if ((flag_code == CODE_32BIT
-              && i.op->regs[0].reg_type.bitfield.word)
+              && i.op[0].regs->reg_type.bitfield.word)
              || (flag_code != CODE_32BIT
-                 && i.op->regs[0].reg_type.bitfield.dword))
+                 && i.op[0].regs->reg_type.bitfield.dword))
            if (!add_prefix (ADDR_PREFIX_OPCODE))
              return 0;
        }
       else if (i.suffix != QWORD_MNEM_SUFFIX
-              && i.suffix != LONG_DOUBLE_MNEM_SUFFIX
               && !i.tm.opcode_modifier.ignoresize
               && !i.tm.opcode_modifier.floatmf
+              && !i.tm.opcode_modifier.vex
+              && !i.tm.opcode_modifier.vexopcode
+              && !is_evex_encoding (&i.tm)
               && ((i.suffix == LONG_MNEM_SUFFIX) == (flag_code == CODE_16BIT)
                   || (flag_code == CODE_64BIT
                       && i.tm.opcode_modifier.jumpbyte)))
@@ -5942,27 +6215,52 @@ process_suffix (void)
       /* Set mode64 for an operand.  */
       if (i.suffix == QWORD_MNEM_SUFFIX
          && flag_code == CODE_64BIT
-         && !i.tm.opcode_modifier.norex64)
-       {
+         && !i.tm.opcode_modifier.norex64
          /* Special case for xchg %rax,%rax.  It is NOP and doesn't
-            need rex64.  cmpxchg8b is also a special case. */
-         if (! (i.operands == 2
-                && i.tm.base_opcode == 0x90
-                && i.tm.extension_opcode == None
-                && operand_type_equal (&i.types [0], &acc64)
-                && operand_type_equal (&i.types [1], &acc64))
-             && ! (i.operands == 1
-                   && i.tm.base_opcode == 0xfc7
-                   && i.tm.extension_opcode == 1
-                   && !operand_type_check (i.types [0], reg)
-                   && operand_type_check (i.types [0], anymem)))
-           i.rex |= REX_W;
-       }
-
-      /* Size floating point instruction.  */
-      if (i.suffix == LONG_MNEM_SUFFIX)
-       if (i.tm.opcode_modifier.floatmf)
-         i.tm.base_opcode ^= 4;
+            need rex64. */
+         && ! (i.operands == 2
+               && i.tm.base_opcode == 0x90
+               && i.tm.extension_opcode == None
+               && operand_type_equal (&i.types [0], &acc64)
+               && operand_type_equal (&i.types [1], &acc64)))
+       i.rex |= REX_W;
+
+      break;
+    }
+
+  if (i.reg_operands != 0
+      && i.operands > 1
+      && i.tm.opcode_modifier.addrprefixopreg
+      && !i.tm.opcode_modifier.immext)
+    {
+      /* Check invalid register operand when the address size override
+        prefix changes the size of register operands.  */
+      unsigned int op;
+      enum { need_word, need_dword, need_qword } need;
+
+      if (flag_code == CODE_32BIT)
+       need = i.prefix[ADDR_PREFIX] ? need_word : need_dword;
+      else
+       {
+         if (i.prefix[ADDR_PREFIX])
+           need = need_dword;
+         else
+           need = flag_code == CODE_64BIT ? need_qword : need_word;
+       }
+
+      for (op = 0; op < i.operands; op++)
+       if (i.types[op].bitfield.reg
+           && ((need == need_word
+                && !i.op[op].regs->reg_type.bitfield.word)
+               || (need == need_dword
+                   && !i.op[op].regs->reg_type.bitfield.dword)
+               || (need == need_qword
+                   && !i.op[op].regs->reg_type.bitfield.qword)))
+         {
+           as_bad (_("invalid register operand size for `%s'"),
+                   i.tm.name);
+           return 0;
+         }
     }
 
   return 1;
@@ -6399,20 +6697,21 @@ duplicate:
     }
   else if (i.tm.opcode_modifier.implicitquadgroup)
     {
+      unsigned int regnum, first_reg_in_group, last_reg_in_group;
+
       /* The second operand must be {x,y,z}mmN, where N is a multiple of 4. */
       gas_assert (i.operands >= 2 && i.types[1].bitfield.regsimd);
-      unsigned int regnum = register_number (i.op[1].regs);
-      unsigned int first_reg_in_group = regnum & ~3;
-      unsigned int last_reg_in_group = first_reg_in_group + 3;
-      if (regnum != first_reg_in_group) {
-        as_warn (_("the second source register `%s%s' implicitly denotes"
-            " `%s%.3s%d' to `%s%.3s%d' source group in `%s'"),
-            register_prefix, i.op[1].regs->reg_name,
-            register_prefix, i.op[1].regs->reg_name, first_reg_in_group,
-            register_prefix, i.op[1].regs->reg_name, last_reg_in_group,
-            i.tm.name);
-      }
-       }
+      regnum = register_number (i.op[1].regs);
+      first_reg_in_group = regnum & ~3;
+      last_reg_in_group = first_reg_in_group + 3;
+      if (regnum != first_reg_in_group)
+       as_warn (_("source register `%s%s' implicitly denotes"
+                  " `%s%.3s%u' to `%s%.3s%u' source group in `%s'"),
+                register_prefix, i.op[1].regs->reg_name,
+                register_prefix, i.op[1].regs->reg_name, first_reg_in_group,
+                register_prefix, i.op[1].regs->reg_name, last_reg_in_group,
+                i.tm.name);
+    }
   else if (i.tm.opcode_modifier.regkludge)
     {
       /* The imul $imm, %reg instruction is converted into
@@ -6528,118 +6827,82 @@ build_modrm_byte (void)
   unsigned int source, dest;
   int vex_3_sources;
 
-  /* The first operand of instructions with VEX prefix and 3 sources
-     must be VEX_Imm4.  */
   vex_3_sources = i.tm.opcode_modifier.vexsources == VEX3SOURCES;
   if (vex_3_sources)
     {
       unsigned int nds, reg_slot;
       expressionS *exp;
 
-      if (i.tm.opcode_modifier.veximmext
-          && i.tm.opcode_modifier.immext)
-        {
-          dest = i.operands - 2;
-          gas_assert (dest == 3);
-        }
-      else
-        dest = i.operands - 1;
+      dest = i.operands - 1;
       nds = dest - 1;
 
       /* There are 2 kinds of instructions:
-         1. 5 operands: 4 register operands or 3 register operands
-         plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
-         VexW0 or VexW1.  The destination must be either XMM, YMM or
+        1. 5 operands: 4 register operands or 3 register operands
+        plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
+        VexW0 or VexW1.  The destination must be either XMM, YMM or
         ZMM register.
-         2. 4 operands: 4 register operands or 3 register operands
-         plus 1 memory operand, VexXDS, and VexImmExt  */
+        2. 4 operands: 4 register operands or 3 register operands
+        plus 1 memory operand, with VexXDS.  */
       gas_assert ((i.reg_operands == 4
-                   || (i.reg_operands == 3 && i.mem_operands == 1))
-                  && i.tm.opcode_modifier.vexvvvv == VEXXDS
-                  && (i.tm.opcode_modifier.veximmext
-                      || (i.imm_operands == 1
-                          && i.types[0].bitfield.vec_imm4
-                          && (i.tm.opcode_modifier.vexw == VEXW0
-                              || i.tm.opcode_modifier.vexw == VEXW1)
-                          && i.tm.operand_types[dest].bitfield.regsimd)));
+                  || (i.reg_operands == 3 && i.mem_operands == 1))
+                 && i.tm.opcode_modifier.vexvvvv == VEXXDS
+                 && i.tm.opcode_modifier.vexw
+                 && i.tm.operand_types[dest].bitfield.regsimd);
+
+      /* If VexW1 is set, the first non-immediate operand is the source and
+        the second non-immediate one is encoded in the immediate operand.  */
+      if (i.tm.opcode_modifier.vexw == VEXW1)
+       {
+         source = i.imm_operands;
+         reg_slot = i.imm_operands + 1;
+       }
+      else
+       {
+         source = i.imm_operands + 1;
+         reg_slot = i.imm_operands;
+       }
 
       if (i.imm_operands == 0)
-        {
-          /* When there is no immediate operand, generate an 8bit
-             immediate operand to encode the first operand.  */
-          exp = &im_expressions[i.imm_operands++];
-          i.op[i.operands].imms = exp;
-          i.types[i.operands] = imm8;
-          i.operands++;
-          /* If VexW1 is set, the first operand is the source and
-             the second operand is encoded in the immediate operand.  */
-          if (i.tm.opcode_modifier.vexw == VEXW1)
-            {
-              source = 0;
-              reg_slot = 1;
-            }
-          else
-            {
-              source = 1;
-              reg_slot = 0;
-            }
-
-          /* FMA swaps REG and NDS.  */
-          if (i.tm.cpu_flags.bitfield.cpufma)
-            {
-              unsigned int tmp;
-              tmp = reg_slot;
-              reg_slot = nds;
-              nds = tmp;
-            }
-
-          gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
-          exp->X_op = O_constant;
-          exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
+       {
+         /* When there is no immediate operand, generate an 8bit
+            immediate operand to encode the first operand.  */
+         exp = &im_expressions[i.imm_operands++];
+         i.op[i.operands].imms = exp;
+         i.types[i.operands] = imm8;
+         i.operands++;
+
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
+         exp->X_op = O_constant;
+         exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
          gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
        }
       else
-        {
-          unsigned int imm_slot;
-
-          if (i.tm.opcode_modifier.vexw == VEXW0)
-            {
-              /* If VexW0 is set, the third operand is the source and
-                 the second operand is encoded in the immediate
-                 operand.  */
-              source = 2;
-              reg_slot = 1;
-            }
-          else
-            {
-              /* VexW1 is set, the second operand is the source and
-                 the third operand is encoded in the immediate
-                 operand.  */
-              source = 1;
-              reg_slot = 2;
-            }
-
-          if (i.tm.opcode_modifier.immext)
-            {
-              /* When ImmExt is set, the immediate byte is the last
-                 operand.  */
-              imm_slot = i.operands - 1;
-              source--;
-              reg_slot--;
-            }
-          else
-            {
-              imm_slot = 0;
-
-              /* Turn on Imm8 so that output_imm will generate it.  */
-              i.types[imm_slot].bitfield.imm8 = 1;
-            }
-
-          gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
-          i.op[imm_slot].imms->X_add_number
-              |= register_number (i.op[reg_slot].regs) << 4;
+       {
+         unsigned int imm_slot;
+
+         gas_assert (i.imm_operands == 1 && i.types[0].bitfield.vec_imm4);
+
+         if (i.tm.opcode_modifier.immext)
+           {
+             /* When ImmExt is set, the immediate byte is the last
+                operand.  */
+             imm_slot = i.operands - 1;
+             source--;
+             reg_slot--;
+           }
+         else
+           {
+             imm_slot = 0;
+
+             /* Turn on Imm8 so that output_imm will generate it.  */
+             i.types[imm_slot].bitfield.imm8 = 1;
+           }
+
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
+         i.op[imm_slot].imms->X_add_number
+             |= register_number (i.op[reg_slot].regs) << 4;
          gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
-        }
+       }
 
       gas_assert (i.tm.operand_types[nds].bitfield.regsimd);
       i.vex.register_specifier = i.op[nds].regs;
@@ -6707,7 +6970,7 @@ build_modrm_byte (void)
            }
          break;
        case 5:
-         if (i.tm.opcode_modifier.evex)
+         if (is_evex_encoding (&i.tm))
            {
              /* For EVEX instructions, when there are 5 operands, the
                 first one must be immediate operand.  If the second one
@@ -6807,12 +7070,11 @@ build_modrm_byte (void)
          if ((i.op[source].regs->reg_flags & RegVRex) != 0)
            i.vrex |= REX_R;
        }
-      if (flag_code != CODE_64BIT && (i.rex & (REX_R | REX_B)))
+      if (flag_code != CODE_64BIT && (i.rex & REX_R))
        {
-         if (!i.types[0].bitfield.control
-             && !i.types[1].bitfield.control)
+         if (!i.types[i.tm.operand_types[0].bitfield.regmem].bitfield.control)
            abort ();
-         i.rex &= ~(REX_R | REX_B);
+         i.rex &= ~REX_R;
          add_prefix (LOCK_PREFIX_OPCODE);
        }
     }
@@ -6832,8 +7094,7 @@ build_modrm_byte (void)
 
          if (i.tm.opcode_modifier.vecsib)
            {
-             if (i.index_reg->reg_num == RegEiz
-                 || i.index_reg->reg_num == RegRiz)
+             if (i.index_reg->reg_num == RegIZ)
                abort ();
 
              i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
@@ -6872,6 +7133,8 @@ build_modrm_byte (void)
                fake_zero_displacement = 1;
              if (i.index_reg == 0)
                {
+                 i386_operand_type newdisp;
+
                  gas_assert (!i.tm.opcode_modifier.vecsib);
                  /* Operand is just <disp>  */
                  if (flag_code == CODE_64BIT)
@@ -6883,26 +7146,26 @@ build_modrm_byte (void)
                      i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
                      i.sib.base = NO_BASE_REGISTER;
                      i.sib.index = NO_INDEX_REGISTER;
-                     i.types[op] = ((i.prefix[ADDR_PREFIX] == 0)
-                                    ? disp32s : disp32);
+                     newdisp = (!i.prefix[ADDR_PREFIX] ? disp32s : disp32);
                    }
                  else if ((flag_code == CODE_16BIT)
                           ^ (i.prefix[ADDR_PREFIX] != 0))
                    {
                      i.rm.regmem = NO_BASE_REGISTER_16;
-                     i.types[op] = disp16;
+                     newdisp = disp16;
                    }
                  else
                    {
                      i.rm.regmem = NO_BASE_REGISTER;
-                     i.types[op] = disp32;
+                     newdisp = disp32;
                    }
+                 i.types[op] = operand_type_and_not (i.types[op], anydisp);
+                 i.types[op] = operand_type_or (i.types[op], newdisp);
                }
              else if (!i.tm.opcode_modifier.vecsib)
                {
                  /* !i.base_reg && i.index_reg  */
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                    i.sib.index = NO_INDEX_REGISTER;
                  else
                    i.sib.index = i.index_reg->reg_num;
@@ -6928,8 +7191,7 @@ build_modrm_byte (void)
                }
            }
          /* RIP addressing for 64bit mode.  */
-         else if (i.base_reg->reg_num == RegRip ||
-                  i.base_reg->reg_num == RegEip)
+         else if (i.base_reg->reg_num == RegIP)
            {
              gas_assert (!i.tm.opcode_modifier.vecsib);
              i.rm.regmem = NO_BASE_REGISTER;
@@ -6978,14 +7240,18 @@ build_modrm_byte (void)
              if (flag_code == CODE_64BIT
                  && operand_type_check (i.types[op], disp))
                {
-                 i386_operand_type temp;
-                 operand_type_set (&temp, 0);
-                 temp.bitfield.disp8 = i.types[op].bitfield.disp8;
-                 i.types[op] = temp;
+                 i.types[op].bitfield.disp16 = 0;
+                 i.types[op].bitfield.disp64 = 0;
                  if (i.prefix[ADDR_PREFIX] == 0)
-                   i.types[op].bitfield.disp32s = 1;
+                   {
+                     i.types[op].bitfield.disp32 = 0;
+                     i.types[op].bitfield.disp32s = 1;
+                   }
                  else
-                   i.types[op].bitfield.disp32 = 1;
+                   {
+                     i.types[op].bitfield.disp32 = 1;
+                     i.types[op].bitfield.disp32s = 0;
+                   }
                }
 
              if (!i.tm.opcode_modifier.vecsib)
@@ -7017,8 +7283,7 @@ build_modrm_byte (void)
                }
              else if (!i.tm.opcode_modifier.vecsib)
                {
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                    i.sib.index = NO_INDEX_REGISTER;
                  else
                    i.sib.index = i.index_reg->reg_num;
@@ -7178,9 +7443,10 @@ build_modrm_byte (void)
                }
              else
                {
-                 /* There are only 2 operands.  */
-                 gas_assert (op < 2 && i.operands == 2);
-                 vex_reg = 1;
+                 /* There are only 2 non-immediate operands.  */
+                 gas_assert (op < i.imm_operands + 2
+                             && i.operands == i.imm_operands + 2);
+                 vex_reg = i.imm_operands + 1;
                }
            }
          else
@@ -7576,22 +7842,16 @@ output_insn (void)
              if (i.tm.base_opcode & 0xff000000)
                {
                  prefix = (i.tm.base_opcode >> 24) & 0xff;
-                 goto check_prefix;
+                 add_prefix (prefix);
                }
              break;
            case 2:
              if ((i.tm.base_opcode & 0xff0000) != 0)
                {
                  prefix = (i.tm.base_opcode >> 16) & 0xff;
-                 if (i.tm.cpu_flags.bitfield.cpupadlock)
-                   {
-check_prefix:
-                     if (prefix != REPE_PREFIX_OPCODE
-                         || (i.prefix[REP_PREFIX]
-                             != REPE_PREFIX_OPCODE))
-                       add_prefix (prefix);
-                   }
-                 else
+                 if (!i.tm.cpu_flags.bitfield.cpupadlock
+                     || prefix != REPE_PREFIX_OPCODE
+                     || (i.prefix[REP_PREFIX] != REPE_PREFIX_OPCODE))
                    add_prefix (prefix);
                }
              break;
@@ -7763,7 +8023,8 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
              int size = disp_size (n);
              offsetT val = i.op[n].disps->X_add_number;
 
-             val = offset_in_range (val >> i.memshift, size);
+             val = offset_in_range (val >> (size == 1 ? i.memshift : 0),
+                                    size);
              p = frag_more (size);
              md_number_to_chars (p, val, size);
            }
@@ -7868,8 +8129,7 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
                    {
                      fixP->fx_tcbit = i.rex != 0;
                      if (i.base_reg
-                         && (i.base_reg->reg_num == RegRip
-                             || i.base_reg->reg_num == RegEip))
+                         && (i.base_reg->reg_num == RegIP))
                      fixP->fx_tcbit2 = 1;
                    }
                  else
@@ -8419,15 +8679,15 @@ check_VecOperations (char *op_string, char *op_end)
 
              op_string += 3;
              if (*op_string == '8')
-               bcst_type = BROADCAST_1TO8;
+               bcst_type = 8;
              else if (*op_string == '4')
-               bcst_type = BROADCAST_1TO4;
+               bcst_type = 4;
              else if (*op_string == '2')
-               bcst_type = BROADCAST_1TO2;
+               bcst_type = 2;
              else if (*op_string == '1'
                       && *(op_string+1) == '6')
                {
-                 bcst_type = BROADCAST_1TO16;
+                 bcst_type = 16;
                  op_string++;
                }
              else
@@ -8439,6 +8699,7 @@ check_VecOperations (char *op_string, char *op_end)
 
              broadcast_op.type = bcst_type;
              broadcast_op.operand = this_operand;
+             broadcast_op.bytes = 0;
              i.broadcast = &broadcast_op;
            }
          /* Check masking operation.  */
@@ -8519,6 +8780,12 @@ check_VecOperations (char *op_string, char *op_end)
              return NULL;
            }
          op_string++;
+
+         /* Strip whitespace since the addition of pseudo prefixes
+            changed how the scrubber treats '{'.  */
+         if (is_space_char (*op_string))
+           ++op_string;
+
          continue;
        }
     unknown_vec_op:
@@ -8963,9 +9230,7 @@ i386_addressing_mode (void)
 
          if (addr_reg)
            {
-             if (addr_reg->reg_num == RegEip
-                 || addr_reg->reg_num == RegEiz
-                 || addr_reg->reg_type.bitfield.dword)
+             if (addr_reg->reg_type.bitfield.dword)
                addr_mode = CODE_32BIT;
              else if (flag_code != CODE_64BIT
                       && addr_reg->reg_type.bitfield.word)
@@ -9075,21 +9340,18 @@ bad_address:
        {
          /* 32-bit/64-bit checks.  */
          if ((i.base_reg
-              && (addr_mode == CODE_64BIT
-                  ? !i.base_reg->reg_type.bitfield.qword
-                  : !i.base_reg->reg_type.bitfield.dword)
-              && (i.index_reg
-                  || (i.base_reg->reg_num
-                      != (addr_mode == CODE_64BIT ? RegRip : RegEip))))
+              && ((addr_mode == CODE_64BIT
+                   ? !i.base_reg->reg_type.bitfield.qword
+                   : !i.base_reg->reg_type.bitfield.dword)
+                  || (i.index_reg && i.base_reg->reg_num == RegIP)
+                  || i.base_reg->reg_num == RegIZ))
              || (i.index_reg
                  && !i.index_reg->reg_type.bitfield.xmmword
                  && !i.index_reg->reg_type.bitfield.ymmword
                  && !i.index_reg->reg_type.bitfield.zmmword
                  && ((addr_mode == CODE_64BIT
-                      ? !(i.index_reg->reg_type.bitfield.qword
-                          || i.index_reg->reg_num == RegRiz)
-                      : !(i.index_reg->reg_type.bitfield.dword
-                          || i.index_reg->reg_num == RegEiz))
+                      ? !i.index_reg->reg_type.bitfield.qword
+                      : !i.index_reg->reg_type.bitfield.dword)
                      || !i.index_reg->reg_type.bitfield.baseindex)))
            goto bad_address;
 
@@ -9098,7 +9360,7 @@ bad_address:
              || (current_templates->start->base_opcode & ~1) == 0x0f1a)
            {
              /* They cannot use RIP-relative addressing. */
-             if (i.base_reg && i.base_reg->reg_num == RegRip)
+             if (i.base_reg && i.base_reg->reg_num == RegIP)
                {
                  as_bad (_("`%s' cannot be used here"), operand_string);
                  return 0;
@@ -9537,20 +9799,19 @@ i386_att_operand (char *operand_string)
 
       /* Special case for (%dx) while doing input/output op.  */
       if (i.base_reg
-         && operand_type_equal (&i.base_reg->reg_type,
-                                &reg16_inoutportreg)
+         && i.base_reg->reg_type.bitfield.inoutportreg
          && i.index_reg == 0
          && i.log2_scale_factor == 0
          && i.seg[i.mem_operands] == 0
          && !operand_type_check (i.types[this_operand], disp))
        {
-         i.types[this_operand] = inoutportreg;
+         i.types[this_operand] = i.base_reg->reg_type;
          return 1;
        }
 
       if (i386_index_check (operand_string) == 0)
        return 0;
-      i.types[this_operand].bitfield.mem = 1;
+      i.flags[this_operand] |= Operand_Mem;
       if (i.mem_operands == 0)
        i.memop1_string = xstrdup (operand_string);
       i.mem_operands++;
@@ -10092,6 +10353,11 @@ parse_real_register (char *reg_string, char **end_op)
   /* Handle floating point regs, allowing spaces in the (i) part.  */
   if (r == i386_regtab /* %st is first entry of table  */)
     {
+      if (!cpu_arch_flags.bitfield.cpu8087
+         && !cpu_arch_flags.bitfield.cpu287
+         && !cpu_arch_flags.bitfield.cpu387)
+       return (const reg_entry *) NULL;
+
       if (is_space_char (*s))
        ++s;
       if (*s == '(')
@@ -10132,50 +10398,44 @@ parse_real_register (char *reg_string, char **end_op)
       && !cpu_arch_flags.bitfield.cpui386)
     return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.tbyte
-      && !cpu_arch_flags.bitfield.cpu8087
-      && !cpu_arch_flags.bitfield.cpu287
-      && !cpu_arch_flags.bitfield.cpu387)
-    return (const reg_entry *) NULL;
-
-  if (r->reg_type.bitfield.regmmx && !cpu_arch_flags.bitfield.cpuregmmx)
+  if (r->reg_type.bitfield.regmmx && !cpu_arch_flags.bitfield.cpummx)
     return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.xmmword && !cpu_arch_flags.bitfield.cpuregxmm)
-    return (const reg_entry *) NULL;
+  if (!cpu_arch_flags.bitfield.cpuavx512f)
+    {
+      if (r->reg_type.bitfield.zmmword || r->reg_type.bitfield.regmask)
+       return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.ymmword && !cpu_arch_flags.bitfield.cpuregymm)
-    return (const reg_entry *) NULL;
+      if (!cpu_arch_flags.bitfield.cpuavx)
+       {
+         if (r->reg_type.bitfield.ymmword)
+           return (const reg_entry *) NULL;
 
-  if (r->reg_type.bitfield.zmmword && !cpu_arch_flags.bitfield.cpuregzmm)
-    return (const reg_entry *) NULL;
+         if (!cpu_arch_flags.bitfield.cpusse && r->reg_type.bitfield.xmmword)
+           return (const reg_entry *) NULL;
+       }
+    }
 
-  if (r->reg_type.bitfield.regmask
-      && !cpu_arch_flags.bitfield.cpuregmask)
+  if (r->reg_type.bitfield.regbnd && !cpu_arch_flags.bitfield.cpumpx)
     return (const reg_entry *) NULL;
 
   /* Don't allow fake index register unless allow_index_reg isn't 0. */
-  if (!allow_index_reg
-      && (r->reg_num == RegEiz || r->reg_num == RegRiz))
+  if (!allow_index_reg && r->reg_num == RegIZ)
     return (const reg_entry *) NULL;
 
-  /* Upper 16 vector register is only available with VREX in 64bit
-     mode.  */
-  if ((r->reg_flags & RegVRex))
+  /* Upper 16 vector registers are only available with VREX in 64bit
+     mode, and require EVEX encoding.  */
+  if (r->reg_flags & RegVRex)
     {
-      if (i.vec_encoding == vex_encoding_default)
-       i.vec_encoding = vex_encoding_evex;
-
-      if (!cpu_arch_flags.bitfield.cpuvrex
-         || i.vec_encoding != vex_encoding_evex
+      if (!cpu_arch_flags.bitfield.cpuavx512f
          || flag_code != CODE_64BIT)
        return (const reg_entry *) NULL;
+
+      i.vec_encoding = vex_encoding_evex;
     }
 
-  if (((r->reg_flags & (RegRex64 | RegRex))
-       || r->reg_type.bitfield.qword)
-      && (!cpu_arch_flags.bitfield.cpulm
-         || !operand_type_equal (&r->reg_type, &control))
+  if (((r->reg_flags & (RegRex64 | RegRex)) || r->reg_type.bitfield.qword)
+      && (!cpu_arch_flags.bitfield.cpulm || !r->reg_type.bitfield.control)
       && flag_code != CODE_64BIT)
     return (const reg_entry *) NULL;
 
@@ -10299,7 +10559,7 @@ const char *md_shortopts = "qnO::";
 #define OPTION_MSYNTAX (OPTION_MD_BASE + 6)
 #define OPTION_MINDEX_REG (OPTION_MD_BASE + 7)
 #define OPTION_MNAKED_REG (OPTION_MD_BASE + 8)
-#define OPTION_MOLD_GCC (OPTION_MD_BASE + 9)
+#define OPTION_MRELAX_RELOCATIONS (OPTION_MD_BASE + 9)
 #define OPTION_MSSE2AVX (OPTION_MD_BASE + 10)
 #define OPTION_MSSE_CHECK (OPTION_MD_BASE + 11)
 #define OPTION_MOPERAND_CHECK (OPTION_MD_BASE + 12)
@@ -10315,7 +10575,6 @@ const char *md_shortopts = "qnO::";
 #define OPTION_MAMD64 (OPTION_MD_BASE + 22)
 #define OPTION_MINTEL64 (OPTION_MD_BASE + 23)
 #define OPTION_MFENCE_AS_LOCK_ADD (OPTION_MD_BASE + 24)
-#define OPTION_MRELAX_RELOCATIONS (OPTION_MD_BASE + 25)
 
 struct option md_longopts[] =
 {
@@ -10335,7 +10594,6 @@ struct option md_longopts[] =
   {"msyntax", required_argument, NULL, OPTION_MSYNTAX},
   {"mindex-reg", no_argument, NULL, OPTION_MINDEX_REG},
   {"mnaked-reg", no_argument, NULL, OPTION_MNAKED_REG},
-  {"mold-gcc", no_argument, NULL, OPTION_MOLD_GCC},
   {"msse2avx", no_argument, NULL, OPTION_MSSE2AVX},
   {"msse-check", required_argument, NULL, OPTION_MSSE_CHECK},
   {"moperand-check", required_argument, NULL, OPTION_MOPERAND_CHECK},
@@ -10520,6 +10778,10 @@ md_parse_option (int c, const char *arg)
                      cpu_arch_flags = flags;
                      cpu_arch_isa_flags = flags;
                    }
+                 else
+                   cpu_arch_isa_flags
+                     = cpu_flags_or (cpu_arch_isa_flags,
+                                     cpu_arch[j].flags);
                  break;
                }
            }
@@ -10607,10 +10869,6 @@ md_parse_option (int c, const char *arg)
       allow_naked_reg = 1;
       break;
 
-    case OPTION_MOLD_GCC:
-      old_gcc = 1;
-      break;
-
     case OPTION_MSSE2AVX:
       sse2avx = 1;
       break;
@@ -10894,35 +11152,43 @@ md_show_usage (FILE *stream)
   fprintf (stream, _("\
   -msse2avx               encode SSE instructions with VEX prefix\n"));
   fprintf (stream, _("\
-  -msse-check=[none|error|warning]\n\
+  -msse-check=[none|error|warning] (default: warning)\n\
                           check SSE instructions\n"));
   fprintf (stream, _("\
-  -moperand-check=[none|error|warning]\n\
+  -moperand-check=[none|error|warning] (default: warning)\n\
                           check operand combinations for validity\n"));
   fprintf (stream, _("\
-  -mavxscalar=[128|256]   encode scalar AVX instructions with specific vector\n\
+  -mavxscalar=[128|256] (default: 128)\n\
+                          encode scalar AVX instructions with specific vector\n\
                            length\n"));
   fprintf (stream, _("\
-  -mevexlig=[128|256|512] encode scalar EVEX instructions with specific vector\n\
+  -mevexlig=[128|256|512] (default: 128)\n\
+                          encode scalar EVEX instructions with specific vector\n\
                            length\n"));
   fprintf (stream, _("\
-  -mevexwig=[0|1]         encode EVEX instructions with specific EVEX.W value\n\
+  -mevexwig=[0|1] (default: 0)\n\
+                          encode EVEX instructions with specific EVEX.W value\n\
                            for EVEX.W bit ignored instructions\n"));
   fprintf (stream, _("\
-  -mevexrcig=[rne|rd|ru|rz]\n\
+  -mevexrcig=[rne|rd|ru|rz] (default: rne)\n\
                           encode EVEX instructions with specific EVEX.RC value\n\
                            for SAE-only ignored instructions\n"));
   fprintf (stream, _("\
-  -mmnemonic=[att|intel]  use AT&T/Intel mnemonic\n"));
+  -mmnemonic=[att|intel] "));
+  if (SYSV386_COMPAT)
+    fprintf (stream, _("(default: att)\n"));
+  else
+    fprintf (stream, _("(default: intel)\n"));
+  fprintf (stream, _("\
+                          use AT&T/Intel mnemonic\n"));
   fprintf (stream, _("\
-  -msyntax=[att|intel]    use AT&T/Intel syntax\n"));
+  -msyntax=[att|intel] (default: att)\n\
+                          use AT&T/Intel syntax\n"));
   fprintf (stream, _("\
   -mindex-reg             support pseudo index registers\n"));
   fprintf (stream, _("\
   -mnaked-reg             don't require `%%' prefix for registers\n"));
   fprintf (stream, _("\
-  -mold-gcc               support old (<= 2.8.1) versions of gcc\n"));
-  fprintf (stream, _("\
   -madd-bnd-prefix        add BND prefix for all valid branches\n"));
   fprintf (stream, _("\
   -mshared                disable branch optimization for shared code\n"));
@@ -10931,17 +11197,22 @@ md_show_usage (FILE *stream)
   -mbig-obj               generate big object files\n"));
 #endif
   fprintf (stream, _("\
-  -momit-lock-prefix=[no|yes]\n\
+  -momit-lock-prefix=[no|yes] (default: no)\n\
                           strip all lock prefixes\n"));
   fprintf (stream, _("\
-  -mfence-as-lock-add=[no|yes]\n\
+  -mfence-as-lock-add=[no|yes] (default: no)\n\
                           encode lfence, mfence and sfence as\n\
                            lock addl $0x0, (%%{re}sp)\n"));
   fprintf (stream, _("\
-  -mrelax-relocations=[no|yes]\n\
+  -mrelax-relocations=[no|yes] "));
+  if (DEFAULT_GENERATE_X86_RELAX_RELOCATIONS)
+    fprintf (stream, _("(default: yes)\n"));
+  else
+    fprintf (stream, _("(default: no)\n"));
+  fprintf (stream, _("\
                           generate relax relocations\n"));
   fprintf (stream, _("\
-  -mamd64                 accept only AMD64 ISA\n"));
+  -mamd64                 accept only AMD64 ISA [default]\n"));
   fprintf (stream, _("\
   -mintel64               accept only Intel64 ISA\n"));
 }