x86: fold a few of the "alternative" NOP patterns

[binutils-gdb.git] / gas / config / tc-i386.c
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c

index ca3626e30698e1a01e884580be91b27813225efd..57ae6c522a715ffe5e3a0db852d1900f7238723d 100644 (file)
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -131,6 +131,7 @@ typedef struct
    unsigned int len:8;          /* arch string length */
    bool skip:1;                 /* show_arch should skip this. */
    enum processor_type type;    /* arch type */
+  enum { vsz_none, vsz_set, vsz_reset } vsz; /* vector size control */
    i386_cpu_flags enable;               /* cpu feature enable flags */
    i386_cpu_flags disable;      /* cpu feature disable flags */
  }
@@ -164,7 +165,7 @@ static const char *parse_insn (const char *, char *, bool);
  static char *parse_operands (char *, const char *);
  static void swap_operands (void);
  static void swap_2_operands (unsigned int, unsigned int);
-static enum flag_code i386_addressing_mode (void);
+static enum i386_flag_code i386_addressing_mode (void);
  static void optimize_imm (void);
  static bool optimize_disp (const insn_template *t);
  static const insn_template *match_template (char);
@@ -254,6 +255,7 @@ enum i386_error
      no_default_mask,
      unsupported_rc_sae,
      invalid_register_operand,
+    internal_error,
    };
  
  struct _i386_insn
@@ -435,6 +437,7 @@ struct _i386_insn
         vex_encoding_vex,
         vex_encoding_vex3,
         vex_encoding_evex,
+       vex_encoding_evex512,
         vex_encoding_error
        } vec_encoding;
  
@@ -541,13 +544,13 @@ static char register_chars[256];
  static char operand_chars[256];
  
  /* Lexical macros.  */
-#define is_mnemonic_char(x) (mnemonic_chars[(unsigned char) x])
  #define is_operand_char(x) (operand_chars[(unsigned char) x])
  #define is_register_char(x) (register_chars[(unsigned char) x])
  #define is_space_char(x) ((x) == ' ')
  
-/* All non-digit non-letter characters that may occur in an operand.  */
-static char operand_special_chars[] = "%$-+(,)*._~/<>|&^!=:[@]";
+/* All non-digit non-letter characters that may occur in an operand and
+   which aren't already in extra_symbol_chars[].  */
+static const char operand_special_chars[] = "$+,)._~/<>|&^!=:@]";
  
  /* md_assemble() always leaves the strings it's passed unaltered.  To
     effect this we maintain a stack of saved characters that we've smashed
@@ -576,15 +579,8 @@ static int this_operand = -1;
  /* Are we processing a .insn directive?  */
  #define dot_insn() (i.tm.mnem_off == MN__insn)
  
-/* We support four different modes.  FLAG_CODE variable is used to distinguish
-   these.  */
-
-enum flag_code {
-       CODE_32BIT,
-       CODE_16BIT,
-       CODE_64BIT };
-
-static enum flag_code flag_code;
+enum i386_flag_code i386_flag_code;
+#define flag_code i386_flag_code /* Permit to continue using original name.  */
  static unsigned int object_64bit;
  static unsigned int disallow_64bit_reloc;
  static int use_rela_relocations = 0;
@@ -806,7 +802,7 @@ static const char *cpu_arch_name = NULL;
  static char *cpu_sub_arch_name = NULL;
  
  /* CPU feature flags.  */
-static i386_cpu_flags cpu_arch_flags = CPU_UNKNOWN_FLAGS;
+i386_cpu_flags cpu_arch_flags = CPU_UNKNOWN_FLAGS;
  
  /* If we have selected a cpu we are generating instructions for.  */
  static int cpu_arch_tune_set = 0;
@@ -814,9 +810,6 @@ static int cpu_arch_tune_set = 0;
  /* Cpu we are generating instructions for.  */
  enum processor_type cpu_arch_tune = PROCESSOR_UNKNOWN;
  
-/* CPU feature flags of cpu we are generating instructions for.  */
-static i386_cpu_flags cpu_arch_tune_flags;
-
  /* CPU instruction set architecture used.  */
  enum processor_type cpu_arch_isa = PROCESSOR_UNKNOWN;
  
@@ -841,6 +834,10 @@ static unsigned int sse2avx;
  /* Encode aligned vector move as unaligned vector move.  */
  static unsigned int use_unaligned_vector_move;
  
+/* Maximum permitted vector size.  */
+#define VSZ_DEFAULT VSZ512
+static unsigned int vector_size = VSZ_DEFAULT;
+
  /* Encode scalar AVX instructions with specific vector length.  */
  static enum
    {
@@ -969,11 +966,14 @@ const relax_typeS md_relax_table[] =
  };
  
  #define ARCH(n, t, f, s) \
-  { STRING_COMMA_LEN (#n), s, PROCESSOR_ ## t, CPU_ ## f ## _FLAGS, \
+  { STRING_COMMA_LEN (#n), s, PROCESSOR_ ## t, vsz_none, CPU_ ## f ## _FLAGS, \
      CPU_NONE_FLAGS }
  #define SUBARCH(n, e, d, s) \
-  { STRING_COMMA_LEN (#n), s, PROCESSOR_NONE, CPU_ ## e ## _FLAGS, \
+  { STRING_COMMA_LEN (#n), s, PROCESSOR_NONE, vsz_none, CPU_ ## e ## _FLAGS, \
      CPU_ ## d ## _FLAGS }
+#define VECARCH(n, e, d, v) \
+  { STRING_COMMA_LEN (#n), false, PROCESSOR_NONE, vsz_ ## v, \
+    CPU_ ## e ## _FLAGS, CPU_ ## d ## _FLAGS }
  
  static const arch_entry cpu_arch[] =
  {
@@ -987,8 +987,8 @@ static const arch_entry cpu_arch[] =
    ARCH (i386, I386, 386, false),
    ARCH (i486, I486, 486, false),
    ARCH (i586, PENTIUM, 586, false),
-  ARCH (i686, PENTIUMPRO, 686, false),
    ARCH (pentium, PENTIUM, 586, false),
+  ARCH (i686, I686, 686, false),
    ARCH (pentiumpro, PENTIUMPRO, PENTIUMPRO, false),
    ARCH (pentiumii, PENTIUMPRO, P2, false),
    ARCH (pentiumiii, PENTIUMPRO, P3, false),
@@ -1035,15 +1035,15 @@ static const arch_entry cpu_arch[] =
    SUBARCH (sse4.1, SSE4_1, ANY_SSE4_1, false),
    SUBARCH (sse4.2, SSE4_2, ANY_SSE4_2, false),
    SUBARCH (sse4, SSE4_2, ANY_SSE4_1, false),
-  SUBARCH (avx, AVX, ANY_AVX, false),
-  SUBARCH (avx2, AVX2, ANY_AVX2, false),
-  SUBARCH (avx512f, AVX512F, ANY_AVX512F, false),
-  SUBARCH (avx512cd, AVX512CD, ANY_AVX512CD, false),
-  SUBARCH (avx512er, AVX512ER, ANY_AVX512ER, false),
-  SUBARCH (avx512pf, AVX512PF, ANY_AVX512PF, false),
-  SUBARCH (avx512dq, AVX512DQ, ANY_AVX512DQ, false),
-  SUBARCH (avx512bw, AVX512BW, ANY_AVX512BW, false),
-  SUBARCH (avx512vl, AVX512VL, ANY_AVX512VL, false),
+  VECARCH (avx, AVX, ANY_AVX, reset),
+  VECARCH (avx2, AVX2, ANY_AVX2, reset),
+  VECARCH (avx512f, AVX512F, ANY_AVX512F, reset),
+  VECARCH (avx512cd, AVX512CD, ANY_AVX512CD, reset),
+  VECARCH (avx512er, AVX512ER, ANY_AVX512ER, reset),
+  VECARCH (avx512pf, AVX512PF, ANY_AVX512PF, reset),
+  VECARCH (avx512dq, AVX512DQ, ANY_AVX512DQ, reset),
+  VECARCH (avx512bw, AVX512BW, ANY_AVX512BW, reset),
+  VECARCH (avx512vl, AVX512VL, ANY_AVX512VL, reset),
    SUBARCH (monitor, MONITOR, MONITOR, false),
    SUBARCH (vmx, VMX, ANY_VMX, false),
    SUBARCH (vmfunc, VMFUNC, ANY_VMFUNC, false),
@@ -1053,8 +1053,8 @@ static const arch_entry cpu_arch[] =
    SUBARCH (xsavec, XSAVEC, ANY_XSAVEC, false),
    SUBARCH (xsaves, XSAVES, ANY_XSAVES, false),
    SUBARCH (aes, AES, ANY_AES, false),
-  SUBARCH (pclmul, PCLMUL, ANY_PCLMUL, false),
-  SUBARCH (clmul, PCLMUL, ANY_PCLMUL, true),
+  SUBARCH (pclmul, PCLMULQDQ, ANY_PCLMULQDQ, false),
+  SUBARCH (clmul, PCLMULQDQ, ANY_PCLMULQDQ, true),
    SUBARCH (fsgsbase, FSGSBASE, FSGSBASE, false),
    SUBARCH (rdrnd, RDRND, RDRND, false),
    SUBARCH (f16c, F16C, ANY_F16C, false),
@@ -1095,15 +1095,15 @@ static const arch_entry cpu_arch[] =
    SUBARCH (prefetchwt1, PREFETCHWT1, PREFETCHWT1, false),
    SUBARCH (se1, SE1, SE1, false),
    SUBARCH (clwb, CLWB, CLWB, false),
-  SUBARCH (avx512ifma, AVX512IFMA, ANY_AVX512IFMA, false),
-  SUBARCH (avx512vbmi, AVX512VBMI, ANY_AVX512VBMI, false),
-  SUBARCH (avx512_4fmaps, AVX512_4FMAPS, ANY_AVX512_4FMAPS, false),
-  SUBARCH (avx512_4vnniw, AVX512_4VNNIW, ANY_AVX512_4VNNIW, false),
-  SUBARCH (avx512_vpopcntdq, AVX512_VPOPCNTDQ, ANY_AVX512_VPOPCNTDQ, false),
-  SUBARCH (avx512_vbmi2, AVX512_VBMI2, ANY_AVX512_VBMI2, false),
-  SUBARCH (avx512_vnni, AVX512_VNNI, ANY_AVX512_VNNI, false),
-  SUBARCH (avx512_bitalg, AVX512_BITALG, ANY_AVX512_BITALG, false),
-  SUBARCH (avx_vnni, AVX_VNNI, ANY_AVX_VNNI, false),
+  VECARCH (avx512ifma, AVX512IFMA, ANY_AVX512IFMA, reset),
+  VECARCH (avx512vbmi, AVX512VBMI, ANY_AVX512VBMI, reset),
+  VECARCH (avx512_4fmaps, AVX512_4FMAPS, ANY_AVX512_4FMAPS, reset),
+  VECARCH (avx512_4vnniw, AVX512_4VNNIW, ANY_AVX512_4VNNIW, reset),
+  VECARCH (avx512_vpopcntdq, AVX512_VPOPCNTDQ, ANY_AVX512_VPOPCNTDQ, reset),
+  VECARCH (avx512_vbmi2, AVX512_VBMI2, ANY_AVX512_VBMI2, reset),
+  VECARCH (avx512_vnni, AVX512_VNNI, ANY_AVX512_VNNI, reset),
+  VECARCH (avx512_bitalg, AVX512_BITALG, ANY_AVX512_BITALG, reset),
+  VECARCH (avx_vnni, AVX_VNNI, ANY_AVX_VNNI, reset),
    SUBARCH (clzero, CLZERO, CLZERO, false),
    SUBARCH (mwaitx, MWAITX, MWAITX, false),
    SUBARCH (ospke, OSPKE, ANY_OSPKE, false),
@@ -1112,8 +1112,8 @@ static const arch_entry cpu_arch[] =
    SUBARCH (ibt, IBT, IBT, false),
    SUBARCH (shstk, SHSTK, SHSTK, false),
    SUBARCH (gfni, GFNI, ANY_GFNI, false),
-  SUBARCH (vaes, VAES, ANY_VAES, false),
-  SUBARCH (vpclmulqdq, VPCLMULQDQ, ANY_VPCLMULQDQ, false),
+  VECARCH (vaes, VAES, ANY_VAES, reset),
+  VECARCH (vpclmulqdq, VPCLMULQDQ, ANY_VPCLMULQDQ, reset),
    SUBARCH (wbnoinvd, WBNOINVD, WBNOINVD, false),
    SUBARCH (pconfig, PCONFIG, PCONFIG, false),
    SUBARCH (waitpkg, WAITPKG, WAITPKG, false),
@@ -1125,9 +1125,9 @@ static const arch_entry cpu_arch[] =
    SUBARCH (amx_tile, AMX_TILE, ANY_AMX_TILE, false),
    SUBARCH (movdiri, MOVDIRI, MOVDIRI, false),
    SUBARCH (movdir64b, MOVDIR64B, MOVDIR64B, false),
-  SUBARCH (avx512_bf16, AVX512_BF16, ANY_AVX512_BF16, false),
-  SUBARCH (avx512_vp2intersect, AVX512_VP2INTERSECT,
-          ANY_AVX512_VP2INTERSECT, false),
+  VECARCH (avx512_bf16, AVX512_BF16, ANY_AVX512_BF16, reset),
+  VECARCH (avx512_vp2intersect, AVX512_VP2INTERSECT,
+          ANY_AVX512_VP2INTERSECT, reset),
    SUBARCH (tdx, TDX, TDX, false),
    SUBARCH (enqcmd, ENQCMD, ENQCMD, false),
    SUBARCH (serialize, SERIALIZE, SERIALIZE, false),
@@ -1139,16 +1139,24 @@ static const arch_entry cpu_arch[] =
    SUBARCH (widekl, WIDEKL, ANY_WIDEKL, false),
    SUBARCH (uintr, UINTR, UINTR, false),
    SUBARCH (hreset, HRESET, HRESET, false),
-  SUBARCH (avx512_fp16, AVX512_FP16, ANY_AVX512_FP16, false),
+  VECARCH (avx512_fp16, AVX512_FP16, ANY_AVX512_FP16, reset),
    SUBARCH (prefetchi, PREFETCHI, PREFETCHI, false),
-  SUBARCH (avx_ifma, AVX_IFMA, ANY_AVX_IFMA, false),
-  SUBARCH (avx_vnni_int8, AVX_VNNI_INT8, ANY_AVX_VNNI_INT8, false),
+  VECARCH (avx_ifma, AVX_IFMA, ANY_AVX_IFMA, reset),
+  VECARCH (avx_vnni_int8, AVX_VNNI_INT8, ANY_AVX_VNNI_INT8, reset),
    SUBARCH (cmpccxadd, CMPCCXADD, CMPCCXADD, false),
    SUBARCH (wrmsrns, WRMSRNS, WRMSRNS, false),
    SUBARCH (msrlist, MSRLIST, MSRLIST, false),
-  SUBARCH (avx_ne_convert, AVX_NE_CONVERT, ANY_AVX_NE_CONVERT, false),
+  VECARCH (avx_ne_convert, AVX_NE_CONVERT, ANY_AVX_NE_CONVERT, reset),
    SUBARCH (rao_int, RAO_INT, RAO_INT, false),
    SUBARCH (rmpquery, RMPQUERY, ANY_RMPQUERY, false),
+  SUBARCH (fred, FRED, ANY_FRED, false),
+  SUBARCH (lkgs, LKGS, ANY_LKGS, false),
+  VECARCH (avx_vnni_int16, AVX_VNNI_INT16, ANY_AVX_VNNI_INT16, reset),
+  VECARCH (sha512, SHA512, ANY_SHA512, reset),
+  VECARCH (sm3, SM3, ANY_SM3, reset),
+  VECARCH (sm4, SM4, ANY_SM4, reset),
+  SUBARCH (pbndkb, PBNDKB, PBNDKB, false),
+  VECARCH (avx10.1, AVX10_1, ANY_AVX512F, set),
  };
  
  #undef SUBARCH
@@ -1265,16 +1273,32 @@ static const unsigned char f32_2[] =
    {0x66,0x90};                         /* xchg %ax,%ax         */
  static const unsigned char f32_3[] =
    {0x8d,0x76,0x00};                    /* leal 0(%esi),%esi    */
-static const unsigned char f32_4[] =
-  {0x8d,0x74,0x26,0x00};               /* leal 0(%esi,1),%esi  */
+#define f32_4 (f32_5 + 1)      /* leal 0(%esi,%eiz),%esi */
+static const unsigned char f32_5[] =
+  {0x2e,0x8d,0x74,0x26,0x00};          /* leal %cs:0(%esi,%eiz),%esi   */
  static const unsigned char f32_6[] =
    {0x8d,0xb6,0x00,0x00,0x00,0x00};     /* leal 0L(%esi),%esi   */
-static const unsigned char f32_7[] =
-  {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};        /* leal 0L(%esi,1),%esi */
+#define f32_7 (f32_8 + 1)      /* leal 0L(%esi,%eiz),%esi */
+static const unsigned char f32_8[] =
+  {0x2e,0x8d,0xb4,0x26,0x00,0x00,0x00,0x00}; /* leal %cs:0L(%esi,%eiz),%esi */
+static const unsigned char f64_3[] =
+  {0x48,0x89,0xf6};                    /* mov %rsi,%rsi        */
+static const unsigned char f64_4[] =
+  {0x48,0x8d,0x76,0x00};               /* lea 0(%rsi),%rsi     */
+#define f64_5 (f64_6 + 1)              /* lea 0(%rsi,%riz),%rsi        */
+static const unsigned char f64_6[] =
+  {0x2e,0x48,0x8d,0x74,0x26,0x00};     /* lea %cs:0(%rsi,%riz),%rsi    */
+static const unsigned char f64_7[] =
+  {0x48,0x8d,0xb6,0x00,0x00,0x00,0x00};        /* lea 0L(%rsi),%rsi    */
+#define f64_8 (f64_9 + 1)              /* lea 0L(%rsi,%riz),%rsi */
+static const unsigned char f64_9[] =
+  {0x2e,0x48,0x8d,0xb4,0x26,0x00,0x00,0x00,0x00}; /* lea %cs:0L(%rsi,%riz),%rsi */
+#define f16_2 (f64_3 + 1)              /* mov %si,%si  */
  static const unsigned char f16_3[] =
    {0x8d,0x74,0x00};                    /* lea 0(%si),%si       */
-static const unsigned char f16_4[] =
-  {0x8d,0xb4,0x00,0x00};               /* lea 0W(%si),%si      */
+#define f16_4 (f16_5 + 1)              /* lea 0W(%si),%si */
+static const unsigned char f16_5[] =
+  {0x2e,0x8d,0xb4,0x00,0x00};          /* lea %cs:0W(%si),%si  */
  static const unsigned char jump_disp8[] =
    {0xeb};                              /* jmp disp8           */
  static const unsigned char jump32_disp32[] =
@@ -1283,11 +1307,15 @@ static const unsigned char jump16_disp32[] =
    {0x66,0xe9};                         /* jmp disp32          */
  /* 32-bit NOPs patterns.  */
  static const unsigned char *const f32_patt[] = {
-  f32_1, f32_2, f32_3, f32_4, NULL, f32_6, f32_7
+  f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8
+};
+/* 64-bit NOPs patterns.  */
+static const unsigned char *const f64_patt[] = {
+  f32_1, f32_2, f64_3, f64_4, f64_5, f64_6, f64_7, f64_8, f64_9
  };
  /* 16-bit NOPs patterns.  */
  static const unsigned char *const f16_patt[] = {
-  f32_1, f32_2, f16_3, f16_4
+  f32_1, f16_2, f16_3, f16_4, f16_5
  };
  /* nopl (%[re]ax) */
  static const unsigned char alt_3[] =
@@ -1296,8 +1324,7 @@ static const unsigned char alt_3[] =
  static const unsigned char alt_4[] =
    {0x0f,0x1f,0x40,0x00};
  /* nopl 0(%[re]ax,%[re]ax,1) */
-static const unsigned char alt_5[] =
-  {0x0f,0x1f,0x44,0x00,0x00};
+#define alt_5 (alt_6 + 1)
  /* nopw 0(%[re]ax,%[re]ax,1) */
  static const unsigned char alt_6[] =
    {0x66,0x0f,0x1f,0x44,0x00,0x00};
@@ -1305,14 +1332,12 @@ static const unsigned char alt_6[] =
  static const unsigned char alt_7[] =
    {0x0f,0x1f,0x80,0x00,0x00,0x00,0x00};
  /* nopl 0L(%[re]ax,%[re]ax,1) */
-static const unsigned char alt_8[] =
-  {0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
+#define alt_8 (alt_9 + 1)
  /* nopw 0L(%[re]ax,%[re]ax,1) */
  static const unsigned char alt_9[] =
    {0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
  /* nopw %cs:0L(%[re]ax,%[re]ax,1) */
-static const unsigned char alt_10[] =
-  {0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
+#define alt_10 (alt_11 + 1)
  /* data16 nopw %cs:0L(%eax,%eax,1) */
  static const unsigned char alt_11[] =
    {0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00};
@@ -1343,14 +1368,6 @@ i386_output_nops (char *where, const unsigned char *const *patt,
      }
  
    nops = patt[max_single_nop_size - 1];
-
-  /* Use the smaller one if the requsted one isn't available.  */
-  if (nops == NULL)
-    {
-      max_single_nop_size--;
-      nops = patt[max_single_nop_size - 1];
-    }
-
    last = count % max_single_nop_size;
  
    count -= last;
@@ -1360,17 +1377,7 @@ i386_output_nops (char *where, const unsigned char *const *patt,
    if (last)
      {
        nops = patt[last - 1];
-      if (nops == NULL)
-       {
-         /* Use the smaller one plus one-byte NOP if the needed one
-            isn't available.  */
-         last--;
-         nops = patt[last - 1];
-         memcpy (where + offset, nops, last);
-         where[offset + last] = *patt[0];
-       }
-      else
-       memcpy (where + offset, nops, last);
+      memcpy (where + offset, nops, last);
      }
  }
  
@@ -1415,18 +1422,18 @@ i386_generate_nops (fragS *fragP, char *where, offsetT count, int limit)
    /* We need to decide which NOP sequence to use for 32bit and
       64bit. When -mtune= is used:
  
-     1. For PROCESSOR_I386, PROCESSOR_I486, PROCESSOR_PENTIUM and
+     1. For PROCESSOR_I?86, PROCESSOR_PENTIUM, PROCESSOR_IAMCU, and
       PROCESSOR_GENERIC32, f32_patt will be used.
       2. For the rest, alt_patt will be used.
  
       When -mtune= isn't used, alt_patt will be used if
-     cpu_arch_isa_flags has CpuNop.  Otherwise, f32_patt will
+     cpu_arch_isa_flags has CpuNop.  Otherwise, f32_patt/f64_patt will
       be used.
  
       When -march= or .arch is used, we can't use anything beyond
       cpu_arch_isa_flags.   */
  
-  if (flag_code == CODE_16BIT)
+  if (fragP->tc_frag_data.code == CODE_16BIT)
      {
        patt = f16_patt;
        max_single_nop_size = sizeof (f16_patt) / sizeof (f16_patt[0]);
@@ -1435,19 +1442,21 @@ i386_generate_nops (fragS *fragP, char *where, offsetT count, int limit)
      }
    else
      {
+      patt = fragP->tc_frag_data.code == CODE_64BIT ? f64_patt : f32_patt;
        if (fragP->tc_frag_data.isa == PROCESSOR_UNKNOWN)
         {
-         /* PROCESSOR_UNKNOWN means that all ISAs may be used.  */
-         switch (cpu_arch_tune)
+         /* PROCESSOR_UNKNOWN means that all ISAs may be used, unless
+            explicitly disabled.  */
+         switch (fragP->tc_frag_data.tune)
             {
             case PROCESSOR_UNKNOWN:
               /* We use cpu_arch_isa_flags to check if we SHOULD
                  optimize with nops.  */
-             if (fragP->tc_frag_data.isa_flags.bitfield.cpunop)
+             if (fragP->tc_frag_data.isanop)
                 patt = alt_patt;
-             else
-               patt = f32_patt;
               break;
+
+           case PROCESSOR_PENTIUMPRO:
             case PROCESSOR_PENTIUM4:
             case PROCESSOR_NOCONA:
             case PROCESSOR_CORE:
@@ -1461,15 +1470,16 @@ i386_generate_nops (fragS *fragP, char *where, offsetT count, int limit)
             case PROCESSOR_BD:
             case PROCESSOR_ZNVER:
             case PROCESSOR_BT:
-             patt = alt_patt;
+             if (fragP->tc_frag_data.cpunop)
+               patt = alt_patt;
               break;
+
             case PROCESSOR_I386:
             case PROCESSOR_I486:
             case PROCESSOR_PENTIUM:
-           case PROCESSOR_PENTIUMPRO:
+           case PROCESSOR_I686:
             case PROCESSOR_IAMCU:
             case PROCESSOR_GENERIC32:
-             patt = f32_patt;
               break;
             case PROCESSOR_NONE:
               abort ();
@@ -1485,47 +1495,22 @@ i386_generate_nops (fragS *fragP, char *where, offsetT count, int limit)
               abort ();
               break;
  
-           case PROCESSOR_I386:
-           case PROCESSOR_I486:
-           case PROCESSOR_PENTIUM:
-           case PROCESSOR_IAMCU:
-           case PROCESSOR_K6:
-           case PROCESSOR_ATHLON:
-           case PROCESSOR_K8:
-           case PROCESSOR_AMDFAM10:
-           case PROCESSOR_BD:
-           case PROCESSOR_ZNVER:
-           case PROCESSOR_BT:
-           case PROCESSOR_GENERIC32:
+           default:
               /* We use cpu_arch_isa_flags to check if we CAN optimize
                  with nops.  */
-             if (fragP->tc_frag_data.isa_flags.bitfield.cpunop)
+             if (fragP->tc_frag_data.isanop)
                 patt = alt_patt;
-             else
-               patt = f32_patt;
-             break;
-           case PROCESSOR_PENTIUMPRO:
-           case PROCESSOR_PENTIUM4:
-           case PROCESSOR_NOCONA:
-           case PROCESSOR_CORE:
-           case PROCESSOR_CORE2:
-           case PROCESSOR_COREI7:
-             if (fragP->tc_frag_data.isa_flags.bitfield.cpunop)
-               patt = alt_patt;
-             else
-               patt = f32_patt;
-             break;
-           case PROCESSOR_GENERIC64:
-             patt = alt_patt;
               break;
+
             case PROCESSOR_NONE:
               abort ();
             }
         }
  
-      if (patt == f32_patt)
+      if (patt != alt_patt)
         {
-         max_single_nop_size = sizeof (f32_patt) / sizeof (f32_patt[0]);
+         max_single_nop_size = patt == f32_patt ? ARRAY_SIZE (f32_patt)
+                                                : ARRAY_SIZE (f64_patt);
           /* Limit number of NOPs to 2 for older processors.  */
           max_number_of_nops = 2;
         }
@@ -1665,6 +1650,51 @@ operand_type_equal (const union i386_operand_type *x,
      }
  }
  
+static INLINE bool
+is_cpu (const insn_template *t, enum i386_cpu cpu) /* Does template T carry CPU feature CPU?  */
+{
+  switch (cpu) /* Features listed here are read from their own bit in T->cpu.  */
+    {
+    case Cpu287:      return t->cpu.bitfield.cpu287;
+    case Cpu387:      return t->cpu.bitfield.cpu387;
+    case Cpu3dnow:    return t->cpu.bitfield.cpu3dnow;
+    case Cpu3dnowA:   return t->cpu.bitfield.cpu3dnowa;
+    case CpuAVX:      return t->cpu.bitfield.cpuavx;
+    case CpuHLE:      return t->cpu.bitfield.cpuhle;
+    case CpuAVX512F:  return t->cpu.bitfield.cpuavx512f;
+    case CpuAVX512VL: return t->cpu.bitfield.cpuavx512vl;
+    case Cpu64:       return t->cpu.bitfield.cpu64;
+    case CpuNo64:     return t->cpu.bitfield.cpuno64;
+    default: /* Any other feature must be an enumerator below CpuAttrEnums.  */
+      gas_assert (cpu < CpuAttrEnums);
+    }
+  return t->cpu.bitfield.isa == cpu + 1u; /* The isa field is compared against the enumerator plus one.  */
+}
+
+static i386_cpu_flags cpu_flags_from_attr (i386_cpu_attr a) /* Build an i386_cpu_flags value from attribute A.  */
+{
+  const unsigned int bps = sizeof (a.array[0]) * CHAR_BIT; /* Bits per array element.  */
+  i386_cpu_flags f = { .array[0] = 0 };
+
+  switch (ARRAY_SIZE(a.array))
+    {
+    case 1: /* Only a single-element attribute array is handled.  */
+      f.array[CpuAttrEnums / bps]
+        |= (a.array[0] >> CpuIsaBits) << (CpuAttrEnums % bps); /* Bits above the low CpuIsaBits land at flag bit CpuAttrEnums.  */
+      if (CpuAttrEnums % bps > CpuIsaBits) /* The left shift exceeded the right shift, so high bits were lost.  */
+       f.array[CpuAttrEnums / bps + 1]
+         = (a.array[0] >> CpuIsaBits) >> (bps - CpuAttrEnums % bps); /* Put the lost high bits into the next element.  */
+      break;
+    default:
+      abort ();
+    }
+
+  if (a.bitfield.isa) /* A non-zero isa value N sets flag bit N - 1.  */
+    f.array[(a.bitfield.isa - 1) / bps] |= 1u << ((a.bitfield.isa - 1) % bps);
+
+  return f;
+}
+
  static INLINE int
  cpu_flags_all_zero (const union i386_cpu_flags *x)
  {
@@ -1724,10 +1754,11 @@ cpu_flags_equal (const union i386_cpu_flags *x,
  }
  
  static INLINE int
-cpu_flags_check_cpu64 (i386_cpu_flags f)
+cpu_flags_check_cpu64 (const insn_template *t) /* Non-zero if T's cpu64/cpuno64 bits permit the current flag_code.  */
  {
-  return !((flag_code == CODE_64BIT && f.bitfield.cpuno64)
-          || (flag_code != CODE_64BIT && f.bitfield.cpu64));
+  return flag_code == CODE_64BIT
+        ? !t->cpu.bitfield.cpuno64 /* 64-bit code: reject templates marked cpuno64.  */
+        : !t->cpu.bitfield.cpu64; /* Other code sizes: reject templates marked cpu64.  */
  }
  
  static INLINE i386_cpu_flags
@@ -1810,6 +1841,13 @@ cpu_flags_and_not (i386_cpu_flags x, i386_cpu_flags y)
  
  static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
  
+static INLINE bool need_evex_encoding (void) /* True if i.vec_encoding is evex or evex512, or i.mask.reg is set.  */
+{
+  return i.vec_encoding == vex_encoding_evex
+       || i.vec_encoding == vex_encoding_evex512
+       || i.mask.reg;
+}
+
  #define CPU_FLAGS_ARCH_MATCH           0x1
  #define CPU_FLAGS_64BIT_MATCH          0x2
  
@@ -1821,8 +1859,8 @@ static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
  static int
  cpu_flags_match (const insn_template *t)
  {
-  i386_cpu_flags x = t->cpu_flags;
-  int match = cpu_flags_check_cpu64 (x) ? CPU_FLAGS_64BIT_MATCH : 0;
+  i386_cpu_flags x = cpu_flags_from_attr (t->cpu);
+  int match = cpu_flags_check_cpu64 (t) ? CPU_FLAGS_64BIT_MATCH : 0;
  
    x.bitfield.cpu64 = 0;
    x.bitfield.cpuno64 = 0;
@@ -1837,6 +1875,31 @@ cpu_flags_match (const insn_template *t)
        /* This instruction is available only on some archs.  */
        i386_cpu_flags cpu = cpu_arch_flags;
  
+      /* Dual VEX/EVEX templates may need stripping of one of the flags.  */
+      if (t->opcode_modifier.vex && t->opcode_modifier.evex)
+       {
+         /* Dual AVX/AVX512F templates need to retain AVX512F only if we already
+            know that EVEX encoding will be needed.  */
+         if ((x.bitfield.cpuavx || x.bitfield.cpuavx2)
+             && x.bitfield.cpuavx512f)
+           {
+             if (need_evex_encoding ())
+               {
+                 x.bitfield.cpuavx = 0;
+                 x.bitfield.cpuavx2 = 0;
+               }
+             /* need_evex_encoding() isn't reliable before operands were
+                parsed.  */
+             else if (i.operands)
+               {
+                 x.bitfield.cpuavx512f = 0;
+                 x.bitfield.cpuavx512vl = 0;
+                 if (x.bitfield.cpufma && !cpu.bitfield.cpufma)
+                   x.bitfield.cpuavx = 0;
+               }
+           }
+       }
+
        /* AVX512VL is no standalone feature - match it and then strip it.  */
        if (x.bitfield.cpuavx512vl && !cpu.bitfield.cpuavx512vl)
         return match;
@@ -1851,7 +1914,19 @@ cpu_flags_match (const insn_template *t)
        cpu = cpu_flags_and (x, cpu);
        if (!cpu_flags_all_zero (&cpu))
         {
-         if (x.bitfield.cpuavx)
+         if (t->cpu.bitfield.cpuavx && t->cpu.bitfield.cpuavx512f)
+           {
+             if ((need_evex_encoding ()
+                  ? cpu.bitfield.cpuavx512f
+                  : cpu.bitfield.cpuavx)
+                 && (!x.bitfield.cpufma || cpu.bitfield.cpufma
+                     || cpu_arch_flags.bitfield.cpuavx512f)
+                 && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
+                 && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
+                 && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
+               match |= CPU_FLAGS_ARCH_MATCH;
+           }
+         else if (x.bitfield.cpuavx)
             {
               /* We need to check a few extra flags with AVX.  */
               if (cpu.bitfield.cpuavx
@@ -1859,16 +1934,16 @@ cpu_flags_match (const insn_template *t)
                       || (sse2avx && !i.prefix[DATA_PREFIX]))
                   && (!x.bitfield.cpuaes || cpu.bitfield.cpuaes)
                   && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
-                 && (!x.bitfield.cpupclmul || cpu.bitfield.cpupclmul))
+                 && (!x.bitfield.cpupclmulqdq || cpu.bitfield.cpupclmulqdq))
                 match |= CPU_FLAGS_ARCH_MATCH;
             }
+         else if (x.bitfield.cpuavx2 && cpu.bitfield.cpuavx2)
+           match |= CPU_FLAGS_ARCH_MATCH;
           else if (x.bitfield.cpuavx512f)
             {
               /* We need to check a few extra flags with AVX512F.  */
               if (cpu.bitfield.cpuavx512f
-                 && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
-                 && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
-                 && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
+                 && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni))
                 match |= CPU_FLAGS_ARCH_MATCH;
             }
           else
@@ -2164,7 +2239,7 @@ operand_size_match (const insn_template *t)
  
        /* For FMA4 and XOP insns VEX.W controls just the first two
          register operands.  */
-      if (t->cpu_flags.bitfield.cpufma4 || t->cpu_flags.bitfield.cpuxop)
+      if (is_cpu (t, CpuFMA4) || is_cpu (t, CpuXOP))
         given = j < 2 ? 1 - j : j;
  
        if (t->operand_types[j].bitfield.class == Reg
@@ -2548,37 +2623,24 @@ add_prefix (unsigned int prefix)
  static void
  update_code_flag (int value, int check)
  {
-  PRINTF_LIKE ((*as_error));
+  PRINTF_LIKE ((*as_error)) = check ? as_fatal : as_bad; /* Report via as_fatal when CHECK is non-zero, else via as_bad.  */
  
-  flag_code = (enum flag_code) value;
-  if (flag_code == CODE_64BIT)
-    {
-      cpu_arch_flags.bitfield.cpu64 = 1;
-      cpu_arch_flags.bitfield.cpuno64 = 0;
-    }
-  else
-    {
-      cpu_arch_flags.bitfield.cpu64 = 0;
-      cpu_arch_flags.bitfield.cpuno64 = 1;
-    }
-  if (value == CODE_64BIT && !cpu_arch_flags.bitfield.cpulm )
+  if (value == CODE_64BIT && !cpu_arch_flags.bitfield.cpu64 )
      {
-      if (check)
-       as_error = as_fatal;
-      else
-       as_error = as_bad;
-      (*as_error) (_("64bit mode not supported on `%s'."),
-                  cpu_arch_name ? cpu_arch_name : default_arch);
+      as_error (_("64bit mode not supported on `%s'."),
+               cpu_arch_name ? cpu_arch_name : default_arch);
+      return; /* Rejected: flag_code and stackop_size stay as they were.  */
      }
+
    if (value == CODE_32BIT && !cpu_arch_flags.bitfield.cpui386)
      {
-      if (check)
-       as_error = as_fatal;
-      else
-       as_error = as_bad;
-      (*as_error) (_("32bit mode not supported on `%s'."),
-                  cpu_arch_name ? cpu_arch_name : default_arch);
+      as_error (_("32bit mode not supported on `%s'."),
+               cpu_arch_name ? cpu_arch_name : default_arch);
+      return; /* Rejected: flag_code and stackop_size stay as they were.  */
      }
+
+  flag_code = (enum flag_code) value; /* Switch code size only after both checks passed.  */
+
    stackop_size = '\0';
  }
  
@@ -2594,8 +2656,6 @@ set_16bit_gcc_code_flag (int new_code_flag)
    flag_code = (enum flag_code) new_code_flag;
    if (flag_code != CODE_16BIT)
      abort ();
-  cpu_arch_flags.bitfield.cpu64 = 0;
-  cpu_arch_flags.bitfield.cpuno64 = 1;
    stackop_size = LONG_MNEM_SUFFIX;
  }
  
@@ -2716,13 +2776,41 @@ check_cpu_arch_compatible (const char *name ATTRIBUTE_UNUSED,
  }
  
  static void
-extend_cpu_sub_arch_name (const char *name)
+extend_cpu_sub_arch_name (const char *pfx, const char *name) /* Add PFX followed by NAME to cpu_sub_arch_name.  */
  {
    if (cpu_sub_arch_name)
      cpu_sub_arch_name = reconcat (cpu_sub_arch_name, cpu_sub_arch_name,
-                                 ".", name, (const char *) NULL);
+                                 pfx, name, (const char *) NULL); /* Extend the existing string.  */
    else
-    cpu_sub_arch_name = concat (".", name, (const char *) NULL);
+    cpu_sub_arch_name = concat (pfx, name, (const char *) NULL); /* No string yet: start one.  */
+}
+
+static void isa_enable (unsigned int idx) /* Turn on the enable flags of cpu_arch[IDX].  */
+{
+  i386_cpu_flags flags = cpu_flags_or (cpu_arch_flags, cpu_arch[idx].enable);
+
+  if (!cpu_flags_equal (&flags, &cpu_arch_flags)) /* Record the name only when cpu_arch_flags actually changes.  */
+    {
+      extend_cpu_sub_arch_name (".", cpu_arch[idx].name);
+      cpu_arch_flags = flags;
+    }
+
+  cpu_arch_isa_flags = cpu_flags_or (cpu_arch_isa_flags, cpu_arch[idx].enable); /* Updated regardless of the check above.  */
+}
+
+static void isa_disable (unsigned int idx) /* Turn off the disable flags of cpu_arch[IDX].  */
+{
+  i386_cpu_flags flags
+    = cpu_flags_and_not (cpu_arch_flags, cpu_arch[idx].disable);
+
+  if (!cpu_flags_equal (&flags, &cpu_arch_flags)) /* Record the name only when cpu_arch_flags actually changes.  */
+    {
+      extend_cpu_sub_arch_name (".no", cpu_arch[idx].name);
+      cpu_arch_flags = flags;
+    }
+
+  cpu_arch_isa_flags
+    = cpu_flags_and_not (cpu_arch_isa_flags, cpu_arch[idx].disable); /* Updated regardless of the check above.  */
  }
  
  static void
@@ -2737,210 +2825,231 @@ set_cpu_arch (int dummy ATTRIBUTE_UNUSED)
      i386_cpu_flags isa_flags;
      enum processor_type isa;
      enum flag_code flag_code;
+    unsigned int vector_size;
      char stackop_size;
      bool no_cond_jump_promotion;
    } arch_stack_entry;
    static const arch_stack_entry *arch_stack_top;
+  char *s;
+  int e;
+  const char *string;
+  unsigned int j = 0;
  
    SKIP_WHITESPACE ();
  
-  if (!is_end_of_line[(unsigned char) *input_line_pointer])
+  if (is_end_of_line[(unsigned char) *input_line_pointer])
+    {
+      as_bad (_("missing cpu architecture"));
+      input_line_pointer++;
+      return;
+    }
+
+  e = get_symbol_name (&s);
+  string = s;
+
+  if (strcmp (string, "push") == 0)
+    {
+      arch_stack_entry *top = XNEW (arch_stack_entry);
+
+      top->name = cpu_arch_name;
+      if (cpu_sub_arch_name)
+       top->sub_name = xstrdup (cpu_sub_arch_name);
+      else
+       top->sub_name = NULL;
+      top->flags = cpu_arch_flags;
+      top->isa = cpu_arch_isa;
+      top->isa_flags = cpu_arch_isa_flags;
+      top->flag_code = flag_code;
+      top->vector_size = vector_size;
+      top->stackop_size = stackop_size;
+      top->no_cond_jump_promotion = no_cond_jump_promotion;
+
+      top->prev = arch_stack_top;
+      arch_stack_top = top;
+
+      (void) restore_line_pointer (e);
+      demand_empty_rest_of_line ();
+      return;
+    }
+
+  if (strcmp (string, "pop") == 0)
      {
-      char *s;
-      int e = get_symbol_name (&s);
-      const char *string = s;
-      unsigned int j = 0;
-      i386_cpu_flags flags;
+      const arch_stack_entry *top = arch_stack_top;
  
-      if (strcmp (string, "default") == 0)
+      if (!top)
+       as_bad (_(".arch stack is empty"));
+      else if (top->flag_code != flag_code
+              || top->stackop_size != stackop_size)
         {
-         if (strcmp (default_arch, "iamcu") == 0)
-           string = default_arch;
-         else
-           {
-             static const i386_cpu_flags cpu_unknown_flags = CPU_UNKNOWN_FLAGS;
+         static const unsigned int bits[] = {
+           [CODE_16BIT] = 16,
+           [CODE_32BIT] = 32,
+           [CODE_64BIT] = 64,
+         };
  
-             cpu_arch_name = NULL;
-             free (cpu_sub_arch_name);
-             cpu_sub_arch_name = NULL;
-             cpu_arch_flags = cpu_unknown_flags;
-             if (flag_code == CODE_64BIT)
-               {
-                 cpu_arch_flags.bitfield.cpu64 = 1;
-                 cpu_arch_flags.bitfield.cpuno64 = 0;
-               }
-             else
-               {
-                 cpu_arch_flags.bitfield.cpu64 = 0;
-                 cpu_arch_flags.bitfield.cpuno64 = 1;
-               }
-             cpu_arch_isa = PROCESSOR_UNKNOWN;
-             cpu_arch_isa_flags = cpu_arch[flag_code == CODE_64BIT].enable;
-             if (!cpu_arch_tune_set)
-               {
-                 cpu_arch_tune = cpu_arch_isa;
-                 cpu_arch_tune_flags = cpu_arch_isa_flags;
-               }
+         as_bad (_("this `.arch pop' requires `.code%u%s' to be in effect"),
+                 bits[top->flag_code],
+                 top->stackop_size == LONG_MNEM_SUFFIX ? "gcc" : "");
+       }
+      else
+       {
+         arch_stack_top = top->prev;
  
-             j = ARRAY_SIZE (cpu_arch) + 1;
-           }
+         cpu_arch_name = top->name;
+         free (cpu_sub_arch_name);
+         cpu_sub_arch_name = top->sub_name;
+         cpu_arch_flags = top->flags;
+         cpu_arch_isa = top->isa;
+         cpu_arch_isa_flags = top->isa_flags;
+         vector_size = top->vector_size;
+         no_cond_jump_promotion = top->no_cond_jump_promotion;
+
+         XDELETE (top);
         }
-      else if (strcmp (string, "push") == 0)
+
+      (void) restore_line_pointer (e);
+      demand_empty_rest_of_line ();
+      return;
+    }
+
+  if (strcmp (string, "default") == 0)
+    {
+      if (strcmp (default_arch, "iamcu") == 0)
+       string = default_arch;
+      else
         {
-         arch_stack_entry *top = XNEW (arch_stack_entry);
+         static const i386_cpu_flags cpu_unknown_flags = CPU_UNKNOWN_FLAGS;
  
-         top->name = cpu_arch_name;
-         if (cpu_sub_arch_name)
-           top->sub_name = xstrdup (cpu_sub_arch_name);
-         else
-           top->sub_name = NULL;
-         top->flags = cpu_arch_flags;
-         top->isa = cpu_arch_isa;
-         top->isa_flags = cpu_arch_isa_flags;
-         top->flag_code = flag_code;
-         top->stackop_size = stackop_size;
-         top->no_cond_jump_promotion = no_cond_jump_promotion;
+         cpu_arch_name = NULL;
+         free (cpu_sub_arch_name);
+         cpu_sub_arch_name = NULL;
+         cpu_arch_flags = cpu_unknown_flags;
+         cpu_arch_isa = PROCESSOR_UNKNOWN;
+         cpu_arch_isa_flags = cpu_arch[flag_code == CODE_64BIT].enable;
+         if (!cpu_arch_tune_set)
+           cpu_arch_tune = PROCESSOR_UNKNOWN;
  
-         top->prev = arch_stack_top;
-         arch_stack_top = top;
+         vector_size = VSZ_DEFAULT;
  
-         (void) restore_line_pointer (e);
-         demand_empty_rest_of_line ();
-         return;
+         j = ARRAY_SIZE (cpu_arch) + 1;
         }
-      else if (strcmp (string, "pop") == 0)
-       {
-         const arch_stack_entry *top = arch_stack_top;
+    }
  
-         if (!top)
-           as_bad (_(".arch stack is empty"));
-         else if (top->flag_code != flag_code
-                  || top->stackop_size != stackop_size)
+  for (; j < ARRAY_SIZE (cpu_arch); j++)
+    {
+      if (strcmp (string + (*string == '.'), cpu_arch[j].name) == 0
+         && (*string == '.') == (cpu_arch[j].type == PROCESSOR_NONE))
+       {
+         if (*string != '.')
             {
-             static const unsigned int bits[] = {
-               [CODE_16BIT] = 16,
-               [CODE_32BIT] = 32,
-               [CODE_64BIT] = 64,
-             };
+             check_cpu_arch_compatible (string, cpu_arch[j].enable);
  
-             as_bad (_("this `.arch pop' requires `.code%u%s' to be in effect"),
-                     bits[top->flag_code],
-                     top->stackop_size == LONG_MNEM_SUFFIX ? "gcc" : "");
-           }
-         else
-           {
-             arch_stack_top = top->prev;
+             if (flag_code == CODE_64BIT && !cpu_arch[j].enable.bitfield.cpu64 )
+               {
+                 as_bad (_("64bit mode not supported on `%s'."),
+                         cpu_arch[j].name);
+                 (void) restore_line_pointer (e);
+                 ignore_rest_of_line ();
+                 return;
+               }
  
-             cpu_arch_name = top->name;
+             if (flag_code == CODE_32BIT && !cpu_arch[j].enable.bitfield.cpui386)
+               {
+                 as_bad (_("32bit mode not supported on `%s'."),
+                         cpu_arch[j].name);
+                 (void) restore_line_pointer (e);
+                 ignore_rest_of_line ();
+                 return;
+               }
+
+             cpu_arch_name = cpu_arch[j].name;
               free (cpu_sub_arch_name);
-             cpu_sub_arch_name = top->sub_name;
-             cpu_arch_flags = top->flags;
-             cpu_arch_isa = top->isa;
-             cpu_arch_isa_flags = top->isa_flags;
-             no_cond_jump_promotion = top->no_cond_jump_promotion;
+             cpu_sub_arch_name = NULL;
+             cpu_arch_flags = cpu_arch[j].enable;
+             cpu_arch_isa = cpu_arch[j].type;
+             cpu_arch_isa_flags = cpu_arch[j].enable;
+             if (!cpu_arch_tune_set)
+               cpu_arch_tune = cpu_arch_isa;
+
+             vector_size = VSZ_DEFAULT;
  
-             XDELETE (top);
+             pre_386_16bit_warned = false;
+             break;
             }
  
+         if (cpu_flags_all_zero (&cpu_arch[j].enable))
+           continue;
+
+         isa_enable (j);
+
           (void) restore_line_pointer (e);
-         demand_empty_rest_of_line ();
-         return;
-       }
  
-      for (; j < ARRAY_SIZE (cpu_arch); j++)
-       {
-         if (strcmp (string + (*string == '.'), cpu_arch[j].name) == 0
-            && (*string == '.') == (cpu_arch[j].type == PROCESSOR_NONE))
+         switch (cpu_arch[j].vsz)
             {
-             if (*string != '.')
-               {
-                 check_cpu_arch_compatible (string, cpu_arch[j].enable);
+           default:
+             break;
  
-                 cpu_arch_name = cpu_arch[j].name;
-                 free (cpu_sub_arch_name);
-                 cpu_sub_arch_name = NULL;
-                 cpu_arch_flags = cpu_arch[j].enable;
-                 if (flag_code == CODE_64BIT)
-                   {
-                     cpu_arch_flags.bitfield.cpu64 = 1;
-                     cpu_arch_flags.bitfield.cpuno64 = 0;
-                   }
-                 else
-                   {
-                     cpu_arch_flags.bitfield.cpu64 = 0;
-                     cpu_arch_flags.bitfield.cpuno64 = 1;
-                   }
-                 cpu_arch_isa = cpu_arch[j].type;
-                 cpu_arch_isa_flags = cpu_arch[j].enable;
-                 if (!cpu_arch_tune_set)
+           case vsz_set:
+#ifdef SVR4_COMMENT_CHARS
+             if (*input_line_pointer == ':' || *input_line_pointer == '/')
+#else
+             if (*input_line_pointer == '/')
+#endif
+               {
+                 ++input_line_pointer;
+                 switch (get_absolute_expression ())
                     {
-                     cpu_arch_tune = cpu_arch_isa;
-                     cpu_arch_tune_flags = cpu_arch_isa_flags;
+                   case 512: vector_size = VSZ512; break;
+                   case 256: vector_size = VSZ256; break;
+                   case 128: vector_size = VSZ128; break;
+                   default:
+                     as_bad (_("Unrecognized vector size specifier"));
+                     ignore_rest_of_line ();
+                     return;
                     }
-                 pre_386_16bit_warned = false;
                   break;
                 }
-
-             if (cpu_flags_all_zero (&cpu_arch[j].enable))
-               continue;
-
-             flags = cpu_flags_or (cpu_arch_flags,
-                                   cpu_arch[j].enable);
-
-             if (!cpu_flags_equal (&flags, &cpu_arch_flags))
-               {
-                 extend_cpu_sub_arch_name (string + 1);
-                 cpu_arch_flags = flags;
-                 cpu_arch_isa_flags = flags;
-               }
-             else
-               cpu_arch_isa_flags
-                 = cpu_flags_or (cpu_arch_isa_flags,
-                                 cpu_arch[j].enable);
-             (void) restore_line_pointer (e);
-             demand_empty_rest_of_line ();
-             return;
+               /* Fall through.  */
+           case vsz_reset:
+             vector_size = VSZ_DEFAULT;
+             break;
             }
-       }
  
-      if (startswith (string, ".no") && j >= ARRAY_SIZE (cpu_arch))
-       {
-         /* Disable an ISA extension.  */
-         for (j = 0; j < ARRAY_SIZE (cpu_arch); j++)
-           if (cpu_arch[j].type == PROCESSOR_NONE
-               && strcmp (string + 3, cpu_arch[j].name) == 0)
-             {
-               flags = cpu_flags_and_not (cpu_arch_flags,
-                                          cpu_arch[j].disable);
-               if (!cpu_flags_equal (&flags, &cpu_arch_flags))
-                 {
-                   extend_cpu_sub_arch_name (string + 1);
-                   cpu_arch_flags = flags;
-                   cpu_arch_isa_flags = flags;
-                 }
-               (void) restore_line_pointer (e);
-               demand_empty_rest_of_line ();
-               return;
-             }
+         demand_empty_rest_of_line ();
+         return;
         }
+    }
  
-      if (j == ARRAY_SIZE (cpu_arch))
-       as_bad (_("no such architecture: `%s'"), string);
+  if (startswith (string, ".no") && j >= ARRAY_SIZE (cpu_arch))
+    {
+      /* Disable an ISA extension.  */
+      for (j = 0; j < ARRAY_SIZE (cpu_arch); j++)
+       if (cpu_arch[j].type == PROCESSOR_NONE
+           && strcmp (string + 3, cpu_arch[j].name) == 0)
+         {
+           isa_disable (j);
+
+           if (cpu_arch[j].vsz == vsz_set)
+             vector_size = VSZ_DEFAULT;
  
-      *input_line_pointer = e;
+           (void) restore_line_pointer (e);
+           demand_empty_rest_of_line ();
+           return;
+         }
      }
-  else
-    as_bad (_("missing cpu architecture"));
+
+  if (j == ARRAY_SIZE (cpu_arch))
+    as_bad (_("no such architecture: `%s'"), string);
+
+  *input_line_pointer = e;
  
    no_cond_jump_promotion = 0;
    if (*input_line_pointer == ','
        && !is_end_of_line[(unsigned char) input_line_pointer[1]])
      {
-      char *string;
-      char e;
-
        ++input_line_pointer;
-      e = get_symbol_name (&string);
+      e = get_symbol_name (&s);
+      string = s;
  
        if (strcmp (string, "nojumps") == 0)
         no_cond_jump_promotion = 1;
@@ -3068,7 +3177,7 @@ md_begin (void)
    /* Fill in lexical tables:  mnemonic_chars, operand_chars.  */
    {
      int c;
-    char *p;
+    const char *p;
  
      for (c = 0; c < 256; c++)
        {
@@ -3084,11 +3193,6 @@ md_begin (void)
             register_chars[c] = mnemonic_chars[c];
             operand_chars[c] = c;
           }
-       else if (c == '{' || c == '}')
-         {
-           mnemonic_chars[c] = c;
-           operand_chars[c] = c;
-         }
  #ifdef SVR4_COMMENT_CHARS
         else if (c == '\\' && strchr (i386_comment_chars, '/'))
           operand_chars[c] = c;
@@ -3098,13 +3202,12 @@ md_begin (void)
           operand_chars[c] = c;
        }
  
-#ifdef LEX_QM
-    operand_chars['?'] = '?';
-#endif
      mnemonic_chars['_'] = '_';
      mnemonic_chars['-'] = '-';
      mnemonic_chars['.'] = '.';
  
+    for (p = extra_symbol_chars; *p != '\0'; p++)
+      operand_chars[(unsigned char) *p] = *p;
      for (p = operand_special_chars; *p != '\0'; p++)
        operand_chars[(unsigned char) *p] = *p;
    }
@@ -3561,6 +3664,27 @@ install_template (const insn_template *t)
  
    i.tm = *t;
  
+  /* Dual VEX/EVEX templates need stripping one of the possible variants.  */
+  if (t->opcode_modifier.vex && t->opcode_modifier.evex)
+  {
+      if ((is_cpu (t, CpuAVX) || is_cpu (t, CpuAVX2))
+         && is_cpu (t, CpuAVX512F))
+       {
+         if (need_evex_encoding ())
+           {
+             i.tm.opcode_modifier.vex = 0;
+             i.tm.cpu.bitfield.cpuavx = 0;
+             if (is_cpu (&i.tm, CpuAVX2))
+               i.tm.cpu.bitfield.isa = 0;
+           }
+         else
+           {
+             i.tm.opcode_modifier.evex = 0;
+             i.tm.cpu.bitfield.cpuavx512f = 0;
+           }
+       }
+  }
+
    /* Note that for pseudo prefixes this produces a length of 1. But for them
       the length isn't interesting at all.  */
    for (l = 1; l < 4; ++l)
@@ -4415,7 +4539,7 @@ optimize_encoding (void)
                    && is_evex_encoding (&i.tm)
                    && (i.vec_encoding != vex_encoding_evex
                        || cpu_arch_isa_flags.bitfield.cpuavx512vl
-                      || i.tm.cpu_flags.bitfield.cpuavx512vl
+                      || is_cpu (&i.tm, CpuAVX512VL)
                        || (i.tm.operand_types[2].bitfield.zmmword
                            && i.types[2].bitfield.ymmword))))
            && i.tm.opcode_space == SPACE_0F
@@ -4468,6 +4592,8 @@ optimize_encoding (void)
               i.tm.opcode_modifier.vex = VEX128;
               i.tm.opcode_modifier.vexw = VEXW0;
               i.tm.opcode_modifier.evex = 0;
+             i.vec_encoding = vex_encoding_vex;
+             i.mask.reg = NULL;
             }
           else if (optimize > 1)
             i.tm.opcode_modifier.evex = EVEX128;
@@ -4567,6 +4693,90 @@ optimize_encoding (void)
         i.types[j].bitfield.disp8
           = fits_in_disp8 (i.op[j].disps->X_add_number);
      }
+  else if (optimize_for_space
+          && i.tm.base_opcode == 0x29
+          && i.tm.opcode_space == SPACE_0F38
+          && i.operands == i.reg_operands
+          && i.op[0].regs == i.op[1].regs
+          && (!i.tm.opcode_modifier.vex
+              || !(i.op[0].regs->reg_flags & RegRex))
+          && !is_evex_encoding (&i.tm))
+    {
+      /* Optimize: -Os:
+         pcmpeqq %xmmN, %xmmN          -> pcmpeqd %xmmN, %xmmN
+         vpcmpeqq %xmmN, %xmmN, %xmmM  -> vpcmpeqd %xmmN, %xmmN, %xmmM (N < 8)
+         vpcmpeqq %ymmN, %ymmN, %ymmM  -> vpcmpeqd %ymmN, %ymmN, %ymmM (N < 8)
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.base_opcode = 0x76;
+    }
+  else if (((i.tm.base_opcode >= 0x64
+            && i.tm.base_opcode <= 0x66
+            && i.tm.opcode_space == SPACE_0F)
+           || (i.tm.base_opcode == 0x37
+               && i.tm.opcode_space == SPACE_0F38))
+          && i.operands == i.reg_operands
+          && i.op[0].regs == i.op[1].regs
+          && !is_evex_encoding (&i.tm))
+    {
+      /* Optimize: -O:
+         pcmpgt[bwd] %mmN, %mmN             -> pxor %mmN, %mmN
+         pcmpgt[bwdq] %xmmN, %xmmN          -> pxor %xmmN, %xmmN
+         vpcmpgt[bwdq] %xmmN, %xmmN, %xmmM  -> vpxor %xmmN, %xmmN, %xmmM (N < 8)
+         vpcmpgt[bwdq] %xmmN, %xmmN, %xmmM  -> vpxor %xmm0, %xmm0, %xmmM (N > 7)
+         vpcmpgt[bwdq] %ymmN, %ymmN, %ymmM  -> vpxor %ymmN, %ymmN, %ymmM (N < 8)
+         vpcmpgt[bwdq] %ymmN, %ymmN, %ymmM  -> vpxor %ymm0, %ymm0, %ymmM (N > 7)
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.base_opcode = 0xef;
+      if (i.tm.opcode_modifier.vex && (i.op[0].regs->reg_flags & RegRex))
+       {
+         if (i.operands == 2)
+           {
+             gas_assert (i.tm.opcode_modifier.sse2avx);
+
+             i.operands = 3;
+             i.reg_operands = 3;
+             i.tm.operands = 3;
+
+             i.op[2].regs = i.op[0].regs;
+             i.types[2] = i.types[0];
+             i.flags[2] = i.flags[0];
+             i.tm.operand_types[2] = i.tm.operand_types[0];
+
+             i.tm.opcode_modifier.sse2avx = 0;
+           }
+         i.op[0].regs -= i.op[0].regs->reg_num + 8;
+         i.op[1].regs = i.op[0].regs;
+       }
+    }
+  else if (optimize_for_space
+          && i.tm.base_opcode == 0x59
+          && i.tm.opcode_space == SPACE_0F38
+          && i.operands == i.reg_operands
+          && i.tm.opcode_modifier.vex
+          && !(i.op[0].regs->reg_flags & RegRex)
+          && i.op[0].regs->reg_type.bitfield.xmmword
+          && i.vec_encoding != vex_encoding_vex3)
+    {
+      /* Optimize: -Os:
+         vpbroadcastq %xmmN, %xmmM  -> vpunpcklqdq %xmmN, %xmmN, %xmmM (N < 8)
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.base_opcode = 0x6c;
+      i.tm.opcode_modifier.vexvvvv = 1;
+
+      ++i.operands;
+      ++i.reg_operands;
+      ++i.tm.operands;
+
+      i.op[2].regs = i.op[0].regs;
+      i.types[2] = i.types[0];
+      i.flags[2] = i.flags[0];
+      i.tm.operand_types[2] = i.tm.operand_types[0];
+
+      swap_2_operands (1, 2);
+    }
  }
  
  /* Return non-zero for load instruction.  */
@@ -5124,6 +5334,9 @@ md_assemble (char *line)
         case invalid_register_operand:
           err_msg = _("invalid register operand");
           break;
+       case internal_error:
+         err_msg = _("internal error");
+         break;
         }
        as_bad (_("%s for `%s'"), err_msg,
               pass1_mnem ? pass1_mnem : insn_name (current_templates->start));
@@ -5137,7 +5350,7 @@ md_assemble (char *line)
          bypass the logic below when easily possible.  */
        && t->opcode_space >= SPACE_0F
        && t->opcode_space <= SPACE_0F3A
-      && !i.tm.cpu_flags.bitfield.cpusse4a
+      && !is_cpu (&i.tm, CpuSSE4a)
        && !is_any_vex_encoding (t))
      {
        bool simd = false;
@@ -5224,7 +5437,7 @@ md_assemble (char *line)
    if (i.notrack_prefix && i.tm.opcode_modifier.prefixok != PrefixNoTrack)
      as_bad (_("expecting indirect branch instruction after `notrack'"));
  
-  if (i.tm.cpu_flags.bitfield.cpumpx)
+  if (is_cpu (&i.tm, CpuMPX))
      {
        if (flag_code == CODE_64BIT && i.prefix[ADDR_PREFIX])
         as_bad (_("32-bit address isn't allowed in 64-bit MPX instructions."));
@@ -5269,6 +5482,11 @@ md_assemble (char *line)
    if (optimize && !i.no_optimize && i.tm.opcode_modifier.optimize)
      optimize_encoding ();
  
+  /* Past optimization there's no need to distinguish vex_encoding_evex and
+     vex_encoding_evex512 anymore.  */
+  if (i.vec_encoding == vex_encoding_evex512)
+    i.vec_encoding = vex_encoding_evex;
+
    if (use_unaligned_vector_move)
      encode_with_unaligned_vector_move ();
  
@@ -5276,7 +5494,7 @@ md_assemble (char *line)
      return;
  
    /* Check if IP-relative addressing requirements can be satisfied.  */
-  if (i.tm.cpu_flags.bitfield.cpuprefetchi
+  if (is_cpu (&i.tm, CpuPREFETCHI)
        && !(i.base_reg && i.base_reg->reg_num == RegIP))
      as_warn (_("'%s' only supports RIP-relative address"), insn_name (&i.tm));
  
@@ -5297,9 +5515,12 @@ md_assemble (char *line)
         case RegSIMD:
           if (i.tm.operand_types[j].bitfield.tmmword)
             i.xstate |= xstate_tmm;
-         else if (i.tm.operand_types[j].bitfield.zmmword)
+         else if (i.tm.operand_types[j].bitfield.zmmword
+                  && !i.tm.opcode_modifier.vex
+                  && vector_size >= VSZ512)
             i.xstate |= xstate_zmm;
-         else if (i.tm.operand_types[j].bitfield.ymmword)
+         else if (i.tm.operand_types[j].bitfield.ymmword
+                  && vector_size >= VSZ256)
             i.xstate |= xstate_ymm;
           else if (i.tm.operand_types[j].bitfield.xmmword)
             i.xstate |= xstate_xmm;
@@ -5479,6 +5700,12 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
    while (1)
      {
        mnem_p = mnemonic;
+      /* Pseudo-prefixes start with an opening figure brace.  */
+      if ((*mnem_p = *l) == '{')
+       {
+         ++mnem_p;
+         ++l;
+       }
        while ((*mnem_p = mnemonic_chars[(unsigned char) *l]) != 0)
         {
           if (*mnem_p == '.')
@@ -5486,16 +5713,29 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
           mnem_p++;
           if (mnem_p >= mnemonic + MAX_MNEM_SIZE)
             {
+           too_long:
               as_bad (_("no such instruction: `%s'"), token_start);
               return NULL;
             }
           l++;
         }
-      if (!is_space_char (*l)
-         && *l != END_OF_INSN
-         && (intel_syntax
-             || (*l != PREFIX_SEPARATOR
-                 && *l != ',')))
+      /* Pseudo-prefixes end with a closing figure brace.  */
+      if (*mnemonic == '{' && *l == '}')
+       {
+         *mnem_p++ = *l++;
+         if (mnem_p >= mnemonic + MAX_MNEM_SIZE)
+           goto too_long;
+         *mnem_p = '\0';
+
+         /* Point l at the closing brace if there's no other separator.  */
+         if (*l != END_OF_INSN && !is_space_char (*l)
+             && *l != PREFIX_SEPARATOR)
+           --l;
+       }
+      else if (!is_space_char (*l)
+              && *l != END_OF_INSN
+              && (intel_syntax
+                  || (*l != PREFIX_SEPARATOR && *l != ',')))
         {
           if (prefix_only)
             break;
@@ -5520,7 +5760,7 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
           && current_templates
           && current_templates->start->opcode_modifier.isprefix)
         {
-         if (!cpu_flags_check_cpu64 (current_templates->start->cpu_flags))
+         if (!cpu_flags_check_cpu64 (current_templates->start))
             {
               as_bad ((flag_code != CODE_64BIT
                        ? _("`%s' is only supported in 64-bit mode")
@@ -5598,13 +5838,13 @@ parse_insn (const char *line, char *mnemonic, bool prefix_only)
                 case PREFIX_EXIST:
                   return NULL;
                 case PREFIX_DS:
-                 if (current_templates->start->cpu_flags.bitfield.cpuibt)
+                 if (is_cpu (current_templates->start, CpuIBT))
                     i.notrack_prefix = insn_name (current_templates->start);
                   break;
                 case PREFIX_REP:
-                 if (current_templates->start->cpu_flags.bitfield.cpuhle)
+                 if (is_cpu (current_templates->start, CpuHLE))
                     i.hle_prefix = insn_name (current_templates->start);
-                 else if (current_templates->start->cpu_flags.bitfield.cpumpx)
+                 else if (is_cpu (current_templates->start, CpuMPX))
                     i.bnd_prefix = insn_name (current_templates->start);
                   else
                     i.rep_prefix = insn_name (current_templates->start);
@@ -6022,9 +6262,15 @@ optimize_imm (void)
             break;
           }
      }
-  else if ((flag_code == CODE_16BIT) ^ (i.prefix[DATA_PREFIX] != 0))
+  else if ((flag_code == CODE_16BIT)
+           ^ (i.prefix[DATA_PREFIX] != 0 && !(i.prefix[REX_PREFIX] & REX_W)))
      guess_suffix = WORD_MNEM_SUFFIX;
-  else if (flag_code != CODE_64BIT || !(i.prefix[REX_PREFIX] & REX_W))
+  else if (flag_code != CODE_64BIT
+          || (!(i.prefix[REX_PREFIX] & REX_W)
+              /* A more generic (but also more involved) way of dealing
+                 with the special case(s) would be to go look for
+                 DefaultSize attributes on any of the templates.  */
+              && current_templates->start->mnem_off != MN_push))
      guess_suffix = LONG_MNEM_SUFFIX;
  
    for (op = i.operands; --op >= 0;)
@@ -6275,10 +6521,11 @@ check_VecOperands (const insn_template *t)
       operand size is YMMword or XMMword.  Since this function runs after
       template matching, there's no need to check for YMMword/XMMword in
       the template.  */
-  cpu = cpu_flags_and (t->cpu_flags, avx512);
+  cpu = cpu_flags_and (cpu_flags_from_attr (t->cpu), avx512);
    if (!cpu_flags_all_zero (&cpu)
-      && !t->cpu_flags.bitfield.cpuavx512vl
-      && !cpu_arch_flags.bitfield.cpuavx512vl)
+      && !is_cpu (t, CpuAVX512VL)
+      && !cpu_arch_flags.bitfield.cpuavx512vl
+      && (!t->opcode_modifier.vex || need_evex_encoding ()))
      {
        for (op = 0; op < t->operands; ++op)
         {
@@ -6294,8 +6541,7 @@ check_VecOperands (const insn_template *t)
  
    /* Somewhat similarly, templates specifying both AVX and AVX2 are
       requiring AVX2 support if the actual operand size is YMMword.  */
-  if (t->cpu_flags.bitfield.cpuavx
-      && t->cpu_flags.bitfield.cpuavx2
+  if (is_cpu (t, CpuAVX) && is_cpu (t, CpuAVX2)
        && !cpu_arch_flags.bitfield.cpuavx2)
      {
        for (op = 0; op < t->operands; ++op)
@@ -6454,9 +6700,13 @@ check_VecOperands (const insn_template *t)
           type.bitfield.xmmword = 1;
           break;
         case 32:
+         if (vector_size < VSZ256)
+           goto bad_broadcast;
           type.bitfield.ymmword = 1;
           break;
         case 64:
+         if (vector_size < VSZ512)
+           goto bad_broadcast;
           type.bitfield.zmmword = 1;
           break;
         default:
@@ -6515,36 +6765,25 @@ check_VecOperands (const insn_template *t)
    /* Check if requested masking is supported.  */
    if (i.mask.reg)
      {
-      switch (t->opcode_modifier.masking)
+      if (!t->opcode_modifier.masking)
         {
-       case BOTH_MASKING:
-         break;
-       case MERGING_MASKING:
-         if (i.mask.zeroing)
-           {
-       case 0:
-             i.error = unsupported_masking;
-             return 1;
-           }
-         break;
-       case DYNAMIC_MASKING:
-         /* Memory destinations allow only merging masking.  */
-         if (i.mask.zeroing && i.mem_operands)
-           {
-             /* Find memory operand.  */
-             for (op = 0; op < i.operands; op++)
-               if (i.flags[op] & Operand_Mem)
-                 break;
-             gas_assert (op < i.operands);
-             if (op == i.operands - 1)
-               {
-                 i.error = unsupported_masking;
-                 return 1;
-               }
-           }
-         break;
-       default:
-         abort ();
+         i.error = unsupported_masking;
+         return 1;
+       }
+
+      /* Common rules for masking:
+        - mask register destinations permit only zeroing-masking, without
+          that actually being expressed by a {z} operand suffix or EVEX.z,
+        - memory destinations allow only merging-masking,
+        - scatter/gather insns (i.e. ones using vSIB) only allow merging-
+          masking.  */
+      if (i.mask.zeroing
+         && (t->operand_types[t->operands - 1].bitfield.class == RegMask
+             || (i.flags[t->operands - 1] & Operand_Mem)
+             || t->opcode_modifier.sib))
+       {
+         i.error = unsupported_masking;
+         return 1;
         }
      }
  
@@ -6582,7 +6821,7 @@ check_VecOperands (const insn_template *t)
      }
  
    /* Check the special Imm4 cases; must be the first operand.  */
-  if (t->cpu_flags.bitfield.cpuxop && t->operands == 5)
+  if (is_cpu (t, CpuXOP) && t->operands == 5)
      {
        if (i.op[0].imms->X_op != O_constant
           || !fits_in_imm4 (i.op[0].imms->X_add_number))
@@ -6597,6 +6836,8 @@ check_VecOperands (const insn_template *t)
  
    /* Check vector Disp8 operand.  */
    if (t->opcode_modifier.disp8memshift
+      && (!t->opcode_modifier.vex
+          || need_evex_encoding ())
        && i.disp_encoding <= disp_encoding_8bit)
      {
        if (i.broadcast.type || i.broadcast.bytes)
@@ -6679,7 +6920,21 @@ VEX_check_encoding (const insn_template *t)
        return 1;
      }
  
-  if (i.vec_encoding == vex_encoding_evex)
+  /* Vector size restrictions.  */
+  if ((vector_size < VSZ512
+       && (t->opcode_modifier.evex == EVEX512
+          || t->opcode_modifier.vsz >= VSZ512))
+      || (vector_size < VSZ256
+         && (t->opcode_modifier.evex == EVEX256
+             || t->opcode_modifier.vex == VEX256
+             || t->opcode_modifier.vsz >= VSZ256)))
+    {
+      i.error = unsupported;
+      return 1;
+    }
+
+  if (i.vec_encoding == vex_encoding_evex
+      || i.vec_encoding == vex_encoding_evex512)
      {
        /* This instruction must be encoded with EVEX prefix.  */
        if (!is_evex_encoding (t))
@@ -7050,8 +7305,8 @@ match_template (char mnem_suffix)
               if (!(size_match & MATCH_REVERSE))
                 continue;
               /* Try reversing direction of operands.  */
-             j = t->cpu_flags.bitfield.cpufma4
-                 || t->cpu_flags.bitfield.cpuxop ? 1 : i.operands - 1;
+             j = is_cpu (t, CpuFMA4)
+                 || is_cpu (t, CpuXOP) ? 1 : i.operands - 1;
               overlap0 = operand_type_and (i.types[0], operand_types[j]);
               overlap1 = operand_type_and (i.types[j], operand_types[0]);
               overlap2 = operand_type_and (i.types[1], operand_types[1]);
@@ -7085,8 +7340,7 @@ match_template (char mnem_suffix)
                       && (intel_syntax || intel_mnemonic))
                     found_reverse_match |= Opcode_FloatR;
                 }
-             else if (t->cpu_flags.bitfield.cpufma4
-                      || t->cpu_flags.bitfield.cpuxop)
+             else if (is_cpu (t, CpuFMA4) || is_cpu (t, CpuXOP))
                 {
                   found_reverse_match = Opcode_VexW;
                   goto check_operands_345;
@@ -7177,6 +7431,33 @@ match_template (char mnem_suffix)
           continue;
         }
  
+      /* Check whether to use the shorter VEX encoding for certain insns where
+        the EVEX encoding comes first in the table.  This requires the respective
+        AVX-* feature to be explicitly enabled.  */
+      if (t == current_templates->start
+         && t->opcode_modifier.disp8memshift
+         && !t->opcode_modifier.vex
+         && !need_evex_encoding ()
+         && t + 1 < current_templates->end
+         && t[1].opcode_modifier.vex)
+       {
+         i386_cpu_flags cpu;
+         unsigned int memshift = i.memshift;
+
+         i.memshift = 0;
+         cpu = cpu_flags_and (cpu_flags_from_attr (t[1].cpu), cpu_arch_isa_flags);
+         if (!cpu_flags_all_zero (&cpu)
+             && (!i.types[0].bitfield.disp8
+                 || !operand_type_check (i.types[0], disp)
+                 || i.op[0].disps->X_op != O_constant
+                 || fits_in_disp8 (i.op[0].disps->X_add_number)))
+           {
+             specific_error = progress (internal_error);
+             continue;
+           }
+         i.memshift = memshift;
+       }
+
        /* We've found a match; break out of loop.  */
        break;
      }
@@ -7306,7 +7587,7 @@ process_suffix (void)
                  && (i.tm.base_opcode | 8) == 0xbe)
                 || (i.tm.opcode_space == SPACE_BASE
                     && i.tm.base_opcode == 0x63
-                   && i.tm.cpu_flags.bitfield.cpu64);
+                   && is_cpu (&i.tm, Cpu64));
  
        /* movsx/movzx want only their source operand considered here, for the
          ambiguity checking below.  The suffix will be replaced afterwards
@@ -7468,8 +7749,27 @@ process_suffix (void)
  
           for (op = 0; op < i.tm.operands; ++op)
             {
-             if (is_evex_encoding (&i.tm)
-                 && !cpu_arch_flags.bitfield.cpuavx512vl)
+             if (vector_size < VSZ512)
+               {
+                 i.tm.operand_types[op].bitfield.zmmword = 0;
+                 if (vector_size < VSZ256)
+                   {
+                     i.tm.operand_types[op].bitfield.ymmword = 0;
+                     if (i.tm.operand_types[op].bitfield.xmmword
+                         && (i.tm.opcode_modifier.evex == EVEXDYN
+                             || (!i.tm.opcode_modifier.evex
+                                 && is_evex_encoding (&i.tm))))
+                       i.tm.opcode_modifier.evex = EVEX128;
+                   }
+                 else if (i.tm.operand_types[op].bitfield.ymmword
+                          && !i.tm.operand_types[op].bitfield.xmmword
+                          && (i.tm.opcode_modifier.evex == EVEXDYN
+                              || (!i.tm.opcode_modifier.evex
+                                  && is_evex_encoding (&i.tm))))
+                   i.tm.opcode_modifier.evex = EVEX256;
+               }
+             else if (is_evex_encoding (&i.tm)
+                      && !cpu_arch_flags.bitfield.cpuavx512vl)
                 {
                   if (i.tm.operand_types[op].bitfield.ymmword)
                     i.tm.operand_types[op].bitfield.xmmword = 0;
@@ -7968,7 +8268,8 @@ update_imm (unsigned int j)
                || operand_type_equal (&overlap, &imm16_32)
                || operand_type_equal (&overlap, &imm16_32s))
         {
-         if ((flag_code == CODE_16BIT) ^ (i.prefix[DATA_PREFIX] != 0))
+         if ((flag_code == CODE_16BIT)
+             ^ (i.prefix[DATA_PREFIX] != 0 && !(i.prefix[REX_PREFIX] & REX_W)))
             overlap = imm16;
           else
             overlap = imm32s;
@@ -8831,8 +9132,6 @@ output_branch (void)
        off = 0;
      }
  
-  frag_now->tc_frag_data.code64 = flag_code == CODE_64BIT;
-
    /* 1 possible extra opcode + 4 byte displacement go in var part.
       Pass reloc in fr_var.  */
    frag_var (rs_machine_dependent, 5, i.reloc[0], subtype, sym, off, p);
@@ -9308,7 +9607,7 @@ maybe_fused_with_jcc_p (enum mf_cmp_kind* mf_cmp_p)
      }
  
    /* inc, dec without inc/dec m.   */
-  if ((i.tm.cpu_flags.bitfield.cpuno64
+  if ((is_cpu (&i.tm, CpuNo64)
         && (i.tm.base_opcode | 0xf) == 0x4f)
        || ((i.tm.base_opcode | 1) == 0xff
           && i.tm.extension_opcode <= 0x1))
@@ -9356,7 +9655,7 @@ add_branch_prefix_frag_p (void)
    if (!align_branch_power
        || !align_branch_prefix_size
        || now_seg == absolute_section
-      || i.tm.cpu_flags.bitfield.cpupadlock
+      || is_cpu (&i.tm, CpuPadLock)
        || !cpu_arch_flags.bitfield.cpui386)
      return 0;
  
@@ -9484,14 +9783,14 @@ output_insn (void)
    if (IS_ELF && x86_used_note && now_seg != absolute_section)
      {
        if ((i.xstate & xstate_tmm) == xstate_tmm
-         || i.tm.cpu_flags.bitfield.cpuamx_tile)
+         || is_cpu (&i.tm, CpuAMX_TILE))
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_TMM;
  
-      if (i.tm.cpu_flags.bitfield.cpu8087
-         || i.tm.cpu_flags.bitfield.cpu287
-         || i.tm.cpu_flags.bitfield.cpu387
-         || i.tm.cpu_flags.bitfield.cpu687
-         || i.tm.cpu_flags.bitfield.cpufisttp)
+      if (is_cpu (&i.tm, Cpu8087)
+         || is_cpu (&i.tm, Cpu287)
+         || is_cpu (&i.tm, Cpu387)
+         || is_cpu (&i.tm, Cpu687)
+         || is_cpu (&i.tm, CpuFISTTP))
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_X87;
  
        if ((i.xstate & xstate_mmx)
@@ -9510,16 +9809,16 @@ output_insn (void)
         }
  
        /* vzeroall / vzeroupper */
-      if (i.tm.base_opcode == 0x77 && i.tm.cpu_flags.bitfield.cpuavx)
+      if (i.tm.base_opcode == 0x77 && is_cpu (&i.tm, CpuAVX))
         i.xstate |= xstate_ymm;
  
        if ((i.xstate & xstate_xmm)
           /* ldmxcsr / stmxcsr / vldmxcsr / vstmxcsr */
           || (i.tm.base_opcode == 0xae
-             && (i.tm.cpu_flags.bitfield.cpusse
-                 || i.tm.cpu_flags.bitfield.cpuavx))
-         || i.tm.cpu_flags.bitfield.cpuwidekl
-         || i.tm.cpu_flags.bitfield.cpukl)
+             && (is_cpu (&i.tm, CpuSSE)
+                 || is_cpu (&i.tm, CpuAVX)))
+         || is_cpu (&i.tm, CpuWideKL)
+         || is_cpu (&i.tm, CpuKL))
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XMM;
  
        if ((i.xstate & xstate_ymm) == xstate_ymm)
@@ -9528,65 +9827,65 @@ output_insn (void)
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_ZMM;
        if (i.mask.reg || (i.xstate & xstate_mask) == xstate_mask)
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_MASK;
-      if (i.tm.cpu_flags.bitfield.cpufxsr)
+      if (is_cpu (&i.tm, CpuFXSR))
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_FXSR;
-      if (i.tm.cpu_flags.bitfield.cpuxsave)
+      if (is_cpu (&i.tm, CpuXsave))
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVE;
-      if (i.tm.cpu_flags.bitfield.cpuxsaveopt)
+      if (is_cpu (&i.tm, CpuXsaveopt))
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVEOPT;
-      if (i.tm.cpu_flags.bitfield.cpuxsavec)
+      if (is_cpu (&i.tm, CpuXSAVEC))
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XSAVEC;
  
        if (x86_feature_2_used
-         || i.tm.cpu_flags.bitfield.cpucmov
-         || i.tm.cpu_flags.bitfield.cpusyscall
+         || is_cpu (&i.tm, CpuCMOV)
+         || is_cpu (&i.tm, CpuSYSCALL)
           || i.tm.mnem_off == MN_cmpxchg8b)
         x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_BASELINE;
-      if (i.tm.cpu_flags.bitfield.cpusse3
-         || i.tm.cpu_flags.bitfield.cpussse3
-         || i.tm.cpu_flags.bitfield.cpusse4_1
-         || i.tm.cpu_flags.bitfield.cpusse4_2
-         || i.tm.cpu_flags.bitfield.cpucx16
-         || i.tm.cpu_flags.bitfield.cpupopcnt
+      if (is_cpu (&i.tm, CpuSSE3)
+         || is_cpu (&i.tm, CpuSSSE3)
+         || is_cpu (&i.tm, CpuSSE4_1)
+         || is_cpu (&i.tm, CpuSSE4_2)
+         || is_cpu (&i.tm, CpuCX16)
+         || is_cpu (&i.tm, CpuPOPCNT)
           /* LAHF-SAHF insns in 64-bit mode.  */
           || (flag_code == CODE_64BIT
               && (i.tm.base_opcode | 1) == 0x9f
               && i.tm.opcode_space == SPACE_BASE))
         x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_V2;
-      if (i.tm.cpu_flags.bitfield.cpuavx
-         || i.tm.cpu_flags.bitfield.cpuavx2
+      if (is_cpu (&i.tm, CpuAVX)
+         || is_cpu (&i.tm, CpuAVX2)
           /* Any VEX encoded insns execpt for AVX512F, AVX512BW, AVX512DQ,
              XOP, FMA4, LPW, TBM, and AMX.  */
           || (i.tm.opcode_modifier.vex
-             && !i.tm.cpu_flags.bitfield.cpuavx512f
-             && !i.tm.cpu_flags.bitfield.cpuavx512bw
-             && !i.tm.cpu_flags.bitfield.cpuavx512dq
-             && !i.tm.cpu_flags.bitfield.cpuxop
-             && !i.tm.cpu_flags.bitfield.cpufma4
-             && !i.tm.cpu_flags.bitfield.cpulwp
-             && !i.tm.cpu_flags.bitfield.cputbm
+             && !is_cpu (&i.tm, CpuAVX512F)
+             && !is_cpu (&i.tm, CpuAVX512BW)
+             && !is_cpu (&i.tm, CpuAVX512DQ)
+             && !is_cpu (&i.tm, CpuXOP)
+             && !is_cpu (&i.tm, CpuFMA4)
+             && !is_cpu (&i.tm, CpuLWP)
+             && !is_cpu (&i.tm, CpuTBM)
               && !(x86_feature_2_used & GNU_PROPERTY_X86_FEATURE_2_TMM))
-         || i.tm.cpu_flags.bitfield.cpuf16c
-         || i.tm.cpu_flags.bitfield.cpufma
-         || i.tm.cpu_flags.bitfield.cpulzcnt
-         || i.tm.cpu_flags.bitfield.cpumovbe
-         || i.tm.cpu_flags.bitfield.cpuxsaves
+         || is_cpu (&i.tm, CpuF16C)
+         || is_cpu (&i.tm, CpuFMA)
+         || is_cpu (&i.tm, CpuLZCNT)
+         || is_cpu (&i.tm, CpuMovbe)
+         || is_cpu (&i.tm, CpuXSAVES)
           || (x86_feature_2_used
               & (GNU_PROPERTY_X86_FEATURE_2_XSAVE
                  | GNU_PROPERTY_X86_FEATURE_2_XSAVEOPT
                  | GNU_PROPERTY_X86_FEATURE_2_XSAVEC)) != 0)
         x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_V3;
-      if (i.tm.cpu_flags.bitfield.cpuavx512f
-         || i.tm.cpu_flags.bitfield.cpuavx512bw
-         || i.tm.cpu_flags.bitfield.cpuavx512dq
-         || i.tm.cpu_flags.bitfield.cpuavx512vl
+      if (is_cpu (&i.tm, CpuAVX512F)
+         || is_cpu (&i.tm, CpuAVX512BW)
+         || is_cpu (&i.tm, CpuAVX512DQ)
+         || is_cpu (&i.tm, CpuAVX512VL)
           /* Any EVEX encoded insns except for AVX512ER, AVX512PF,
              AVX512-4FMAPS, and AVX512-4VNNIW.  */
           || (i.tm.opcode_modifier.evex
-             && !i.tm.cpu_flags.bitfield.cpuavx512er
-             && !i.tm.cpu_flags.bitfield.cpuavx512pf
-             && !i.tm.cpu_flags.bitfield.cpuavx512_4fmaps
-             && !i.tm.cpu_flags.bitfield.cpuavx512_4vnniw))
+             && !is_cpu (&i.tm, CpuAVX512ER)
+             && !is_cpu (&i.tm, CpuAVX512PF)
+             && !is_cpu (&i.tm, CpuAVX512_4FMAPS)
+             && !is_cpu (&i.tm, CpuAVX512_4VNNIW)))
         x86_isa_1_used |= GNU_PROPERTY_X86_ISA_1_V4;
      }
  #endif
@@ -9731,7 +10030,7 @@ output_insn (void)
               add_prefix (0xf2);
               break;
             case PREFIX_0XF3:
-             if (!i.tm.cpu_flags.bitfield.cpupadlock
+             if (!is_cpu (&i.tm, CpuPadLock)
                   || (i.prefix[REP_PREFIX] != 0xf3))
                 add_prefix (0xf3);
               break;
@@ -10208,6 +10507,7 @@ output_imm (fragS *insn_start_frag, offsetT insn_start_off)
               if (i.types[n].bitfield.imm32s
                   && (i.suffix == QWORD_MNEM_SUFFIX
                       || (!i.suffix && i.tm.opcode_modifier.no_lsuf)
+                     || (i.prefix[REX_PREFIX] & REX_W)
                       || dot_insn ()))
                 sign = 1;
               else
@@ -10667,6 +10967,29 @@ s_insn (int dummy ATTRIBUTE_UNUSED)
    if (line > end && i.vec_encoding == vex_encoding_default)
      i.vec_encoding = evex ? vex_encoding_evex : vex_encoding_vex;
  
+  if (i.vec_encoding != vex_encoding_default)
+    {
+      /* Only address size and segment override prefixes are permitted with
+         VEX/XOP/EVEX encodings.  */
+      const unsigned char *p = i.prefix;
+
+      for (j = 0; j < ARRAY_SIZE (i.prefix); ++j, ++p)
+       {
+         if (!*p)
+           continue;
+
+         switch (j)
+           {
+           case SEG_PREFIX:
+           case ADDR_PREFIX:
+             break;
+           default:
+                 as_bad (_("illegal prefix used with VEX/XOP/EVEX"));
+                 goto bad;
+           }
+       }
+    }
+
    if (line > end && *line == '.')
      {
        /* Length specifier (VEX.L, XOP.L, EVEX.L'L).  */
@@ -10975,6 +11298,10 @@ s_insn (int dummy ATTRIBUTE_UNUSED)
           goto done;
         }
  
+      /* No need to distinguish vex_encoding_evex and vex_encoding_evex512.  */
+      if (i.vec_encoding == vex_encoding_evex512)
+       i.vec_encoding = vex_encoding_evex;
+
        /* Are we to emit ModR/M encoding?  */
        if (!i.short_form
           && (i.mem_operands
@@ -11302,8 +11629,9 @@ s_insn (int dummy ATTRIBUTE_UNUSED)
           ? i.broadcast.type || i.broadcast.bytes
             || i.rounding.type != rc_none
             || i.mask.reg
-         : (i.broadcast.type || i.broadcast.bytes)
-           && i.rounding.type != rc_none))
+         : (i.mem_operands && i.rounding.type != rc_none)
+           || ((i.broadcast.type || i.broadcast.bytes)
+               && !(i.flags[i.broadcast.operand] & Operand_Mem))))
      {
        as_bad (_("conflicting .insn operands"));
        goto done;
@@ -11397,6 +11725,12 @@ RC_SAE_specifier (const char *pstr)
               return NULL;
             }
  
+         if (i.vec_encoding == vex_encoding_default)
+           i.vec_encoding = vex_encoding_evex512;
+         else if (i.vec_encoding != vex_encoding_evex
+                  && i.vec_encoding != vex_encoding_evex512)
+           return NULL;
+
           i.rounding.type = RC_NamesTable[j].type;
  
           return (char *)(pstr + RC_NamesTable[j].len);
@@ -11456,6 +11790,12 @@ check_VecOperations (char *op_string)
                 }
               op_string++;
  
+             if (i.vec_encoding == vex_encoding_default)
+               i.vec_encoding = vex_encoding_evex;
+             else if (i.vec_encoding != vex_encoding_evex
+                      && i.vec_encoding != vex_encoding_evex512)
+               goto unknown_vec_op;
+
               i.broadcast.type = bcst_type;
               i.broadcast.operand = this_operand;
  
@@ -12034,7 +12374,7 @@ i386_addressing_mode (void)
    if (i.prefix[ADDR_PREFIX])
      addr_mode = flag_code == CODE_32BIT ? CODE_16BIT : CODE_32BIT;
    else if (flag_code == CODE_16BIT
-          && current_templates->start->cpu_flags.bitfield.cpumpx
+          && is_cpu (current_templates->start, CpuMPX)
            /* Avoid replacing the "16-bit addressing not allowed" diagnostic
               from md_assemble() by "is not a valid base/index expression"
               when there is a base and/or index.  */
@@ -12114,13 +12454,13 @@ i386_index_check (const char *operand_string)
        /* Memory operands of string insns are special in that they only allow
          a single register (rDI, rSI, or rBX) as their memory address.  */
        const reg_entry *expected_reg;
-      static const char *di_si[][2] =
+      static const char di_si[][2][4] =
         {
           { "esi", "edi" },
           { "si", "di" },
           { "rsi", "rdi" }
         };
-      static const char *bx[] = { "ebx", "bx", "rbx" };
+      static const char bx[][4] = { "ebx", "bx", "rbx" };
  
        kind = "string address";
  
@@ -13152,7 +13492,8 @@ md_estimate_size_before_relax (fragS *fragP, segT segment)
        else if (size == 2)
         reloc_type = BFD_RELOC_16_PCREL;
  #if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
-      else if (fragP->tc_frag_data.code64 && fragP->fr_offset == 0
+      else if (fragP->tc_frag_data.code == CODE_64BIT
+              && fragP->fr_offset == 0
                && need_plt32_p (fragP->fr_symbol))
         reloc_type = BFD_RELOC_X86_64_PLT32;
  #endif
@@ -13717,6 +14058,21 @@ static bool check_register (const reg_entry *r)
         }
      }
  
+  if (r->reg_type.bitfield.zmmword)
+    {
+      if (vector_size < VSZ512)
+       return false;
+
+      if (i.vec_encoding == vex_encoding_default)
+       i.vec_encoding = vex_encoding_evex512;
+      else if (i.vec_encoding != vex_encoding_evex
+              && i.vec_encoding != vex_encoding_evex512)
+       i.vec_encoding = vex_encoding_error;
+    }
+
+  if (vector_size < VSZ256 && r->reg_type.bitfield.ymmword)
+    return false;
+
    if (r->reg_type.bitfield.tmmword
        && (!cpu_arch_flags.bitfield.cpuamx_tile
            || flag_code != CODE_64BIT))
@@ -13737,14 +14093,15 @@ static bool check_register (const reg_entry *r)
           || flag_code != CODE_64BIT)
         return false;
  
-      if (i.vec_encoding == vex_encoding_default)
+      if (i.vec_encoding == vex_encoding_default
+         || i.vec_encoding == vex_encoding_evex512)
         i.vec_encoding = vex_encoding_evex;
        else if (i.vec_encoding != vex_encoding_evex)
         i.vec_encoding = vex_encoding_error;
      }
  
    if (((r->reg_flags & (RegRex64 | RegRex)) || r->reg_type.bitfield.qword)
-      && (!cpu_arch_flags.bitfield.cpulm
+      && (!cpu_arch_flags.bitfield.cpu64
           || r->reg_type.bitfield.class != RegCR
           || dot_insn ())
        && flag_code != CODE_64BIT)
@@ -13884,6 +14241,13 @@ i386_parse_name (char *name, expressionS *e, char *nextcharP)
    const reg_entry *r = NULL;
    char *end = input_line_pointer;
  
+  /* We only know the terminating character here.  It being double quote could
+     be the closing one of a quoted symbol name, or an opening one from a
+     following string (or another quoted symbol name).  Since the latter can't
+     be valid syntax for anything, bailing in either case is good enough.  */
+  if (*nextcharP == '"')
+    return 0;
+
    *end = *nextcharP;
    if (*name == REGISTER_PREFIX || allow_naked_reg)
      r = parse_real_register (name, &input_line_pointer);
@@ -14166,7 +14530,21 @@ md_parse_option (int c, const char *arg)
  #endif
  
      case OPTION_32:
-      default_arch = "i386";
+      {
+       const char **list, **l;
+
+       list = bfd_target_list ();
+       for (l = list; *l != NULL; l++)
+         if (strstr (*l, "-i386")
+             || strstr (*l, "-go32"))
+           {
+             default_arch = "i386";
+             break;
+           }
+       if (*l == NULL)
+         as_fatal (_("no compiled in support for ix86"));
+       free (list);
+      }
        break;
  
      case OPTION_DIVIDE:
@@ -14194,13 +14572,21 @@ md_parse_option (int c, const char *arg)
         arch++;
        do
         {
+         char *vsz;
+
           if (*arch == '.')
             as_fatal (_("invalid -march= option: `%s'"), arg);
           next = strchr (arch, '+');
           if (next)
             *next++ = '\0';
+         vsz = strchr (arch, '/');
+         if (vsz)
+           *vsz++ = '\0';
           for (j = 0; j < ARRAY_SIZE (cpu_arch); j++)
             {
+             if (vsz && cpu_arch[j].vsz != vsz_set)
+               continue;
+
               if (arch == saved && cpu_arch[j].type != PROCESSOR_NONE
                   && strcmp (arch, cpu_arch[j].name) == 0)
                 {
@@ -14215,10 +14601,8 @@ md_parse_option (int c, const char *arg)
                   cpu_arch_isa = cpu_arch[j].type;
                   cpu_arch_isa_flags = cpu_arch[j].enable;
                   if (!cpu_arch_tune_set)
-                   {
-                     cpu_arch_tune = cpu_arch_isa;
-                     cpu_arch_tune_flags = cpu_arch_isa_flags;
-                   }
+                   cpu_arch_tune = cpu_arch_isa;
+                 vector_size = VSZ_DEFAULT;
                   break;
                 }
               else if (cpu_arch[j].type == PROCESSOR_NONE
@@ -14226,21 +14610,38 @@ md_parse_option (int c, const char *arg)
                        && !cpu_flags_all_zero (&cpu_arch[j].enable))
                 {
                   /* ISA extension.  */
-                 i386_cpu_flags flags;
+                 isa_enable (j);
  
-                 flags = cpu_flags_or (cpu_arch_flags,
-                                       cpu_arch[j].enable);
-
-                 if (!cpu_flags_equal (&flags, &cpu_arch_flags))
+                 switch (cpu_arch[j].vsz)
                     {
-                     extend_cpu_sub_arch_name (arch);
-                     cpu_arch_flags = flags;
-                     cpu_arch_isa_flags = flags;
+                   default:
+                     break;
+
+                   case vsz_set:
+                     if (vsz)
+                       {
+                         char *end;
+                         unsigned long val = strtoul (vsz, &end, 0);
+
+                         if (*end)
+                           val = 0;
+                         switch (val)
+                           {
+                           case 512: vector_size = VSZ512; break;
+                           case 256: vector_size = VSZ256; break;
+                           case 128: vector_size = VSZ128; break;
+                           default:
+                             as_warn (_("Unrecognized vector size specifier ignored"));
+                             break;
+                           }
+                         break;
+                       }
+                       /* Fall through.  */
+                   case vsz_reset:
+                     vector_size = VSZ_DEFAULT;
+                     break;
                     }
-                 else
-                   cpu_arch_isa_flags
-                     = cpu_flags_or (cpu_arch_isa_flags,
-                                     cpu_arch[j].enable);
+
                   break;
                 }
             }
@@ -14252,16 +14653,9 @@ md_parse_option (int c, const char *arg)
                 if (cpu_arch[j].type == PROCESSOR_NONE
                     && strcmp (arch + 2, cpu_arch[j].name) == 0)
                   {
-                   i386_cpu_flags flags;
-
-                   flags = cpu_flags_and_not (cpu_arch_flags,
-                                              cpu_arch[j].disable);
-                   if (!cpu_flags_equal (&flags, &cpu_arch_flags))
-                     {
-                       extend_cpu_sub_arch_name (arch);
-                       cpu_arch_flags = flags;
-                       cpu_arch_isa_flags = flags;
-                     }
+                   isa_disable (j);
+                   if (cpu_arch[j].vsz == vsz_set)
+                     vector_size = VSZ_DEFAULT;
                     break;
                   }
             }
@@ -14285,7 +14679,6 @@ md_parse_option (int c, const char *arg)
             {
               cpu_arch_tune_set = 1;
               cpu_arch_tune = cpu_arch [j].type;
-             cpu_arch_tune_flags = cpu_arch[j].enable;
               break;
             }
         }
@@ -14891,10 +15284,7 @@ i386_target_format (void)
           cpu_arch_isa = PROCESSOR_IAMCU;
           cpu_arch_isa_flags = iamcu_flags;
           if (!cpu_arch_tune_set)
-           {
-             cpu_arch_tune = cpu_arch_isa;
-             cpu_arch_tune_flags = cpu_arch_isa_flags;
-           }
+           cpu_arch_tune = PROCESSOR_IAMCU;
         }
        else if (cpu_arch_isa != PROCESSOR_IAMCU)
         as_fatal (_("Intel MCU doesn't support `%s' architecture"),
@@ -14905,8 +15295,6 @@ i386_target_format (void)
  
    if (cpu_flags_all_zero (&cpu_arch_isa_flags))
      cpu_arch_isa_flags = cpu_arch[flag_code == CODE_64BIT].enable;
-  if (cpu_flags_all_zero (&cpu_arch_tune_flags))
-    cpu_arch_tune_flags = cpu_arch[flag_code == CODE_64BIT].enable;
  
    switch (OUTPUT_FLAVOR)
      {
@@ -15527,15 +15915,6 @@ x86_64_section_letter (int letter, const char **ptr_msg)
    return -1;
  }
  
-bfd_vma
-x86_64_section_word (char *str, size_t len)
-{
-  if (len == 5 && flag_code == CODE_64BIT && startswith (str, "large"))
-    return SHF_X86_64_LARGE;
-
-  return -1;
-}
-
  static void
  handle_large_common (int small ATTRIBUTE_UNUSED)
  {