x86: Display default x86-specific options for "as --help"

[binutils-gdb.git] / gas / config / tc-i386.c
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c

index 1c64d08ab26be3ec34e7c057df2d8a75fb6e228f..ddbffb95b113b705d410f5859ba745711cb506eb 100644 (file)
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -225,11 +225,14 @@ static struct Mask_Operation mask_op;
     broadcast factor.  */
  struct Broadcast_Operation
  {
-  /* Type of broadcast: no broadcast, {1to8}, or {1to16}.  */
+  /* Type of broadcast: {1to2}, {1to4}, {1to8}, or {1to16}.  */
    int type;
  
    /* Index of broadcasted operand.  */
    int operand;
+
+  /* Number of bytes to broadcast.  */
+  int bytes;
  };
  
  static struct Broadcast_Operation broadcast_op;
@@ -269,7 +272,6 @@ enum i386_error
      invalid_vector_register_set,
      unsupported_vector_index_register,
      unsupported_broadcast,
-    broadcast_not_on_src_operand,
      broadcast_needed,
      unsupported_masking,
      mask_not_on_destination,
@@ -307,6 +309,7 @@ struct _i386_insn
      /* Flags for operands.  */
      unsigned int flags[MAX_OPERANDS];
  #define Operand_PCrel 1
+#define Operand_Mem   2
  
      /* Relocation type for operand */
      enum bfd_reloc_code_real reloc[MAX_OPERANDS];
@@ -432,7 +435,6 @@ const char extra_symbol_chars[] = "*%-([{}"
          && !defined (TE_GNU)                           \
          && !defined (TE_LINUX)                         \
          && !defined (TE_NACL)                          \
-        && !defined (TE_NETWARE)                       \
          && !defined (TE_FreeBSD)                       \
          && !defined (TE_DragonFly)                     \
          && !defined (TE_NetBSD)))
@@ -844,6 +846,8 @@ static const arch_entry cpu_arch[] =
      CPU_BDVER4_FLAGS, 0 },
    { STRING_COMMA_LEN ("znver1"), PROCESSOR_ZNVER,
      CPU_ZNVER1_FLAGS, 0 },
+  { STRING_COMMA_LEN ("znver2"), PROCESSOR_ZNVER,
+    CPU_ZNVER2_FLAGS, 0 },
    { STRING_COMMA_LEN ("btver1"), PROCESSOR_BT,
      CPU_BTVER1_FLAGS, 0 },
    { STRING_COMMA_LEN ("btver2"), PROCESSOR_BT,
@@ -1026,6 +1030,14 @@ static const arch_entry cpu_arch[] =
      CPU_WBNOINVD_FLAGS, 0 },
    { STRING_COMMA_LEN (".pconfig"), PROCESSOR_UNKNOWN,
      CPU_PCONFIG_FLAGS, 0 },
+  { STRING_COMMA_LEN (".waitpkg"), PROCESSOR_UNKNOWN,
+    CPU_WAITPKG_FLAGS, 0 },
+  { STRING_COMMA_LEN (".cldemote"), PROCESSOR_UNKNOWN,
+    CPU_CLDEMOTE_FLAGS, 0 },
+  { STRING_COMMA_LEN (".movdiri"), PROCESSOR_UNKNOWN,
+    CPU_MOVDIRI_FLAGS, 0 },
+  { STRING_COMMA_LEN (".movdir64b"), PROCESSOR_UNKNOWN,
+    CPU_MOVDIR64B_FLAGS, 0 },
  };
  
  static const noarch_entry cpu_noarch[] =
@@ -1061,6 +1073,8 @@ static const noarch_entry cpu_noarch[] =
    { STRING_COMMA_LEN ("noavx512_bitalg"), CPU_ANY_AVX512_BITALG_FLAGS },
    { STRING_COMMA_LEN ("noibt"), CPU_ANY_IBT_FLAGS },
    { STRING_COMMA_LEN ("noshstk"), CPU_ANY_SHSTK_FLAGS },
+  { STRING_COMMA_LEN ("nomovdiri"), CPU_ANY_MOVDIRI_FLAGS },
+  { STRING_COMMA_LEN ("nomovdir64b"), CPU_ANY_MOVDIR64B_FLAGS },
  };
  
  #ifdef I386COFF
@@ -1821,11 +1835,6 @@ operand_type_xor (i386_operand_type x, i386_operand_type y)
  
  static const i386_operand_type acc32 = OPERAND_TYPE_ACC32;
  static const i386_operand_type acc64 = OPERAND_TYPE_ACC64;
-static const i386_operand_type control = OPERAND_TYPE_CONTROL;
-static const i386_operand_type inoutportreg
-  = OPERAND_TYPE_INOUTPORTREG;
-static const i386_operand_type reg16_inoutportreg
-  = OPERAND_TYPE_REG16_INOUTPORTREG;
  static const i386_operand_type disp16 = OPERAND_TYPE_DISP16;
  static const i386_operand_type disp32 = OPERAND_TYPE_DISP32;
  static const i386_operand_type disp32s = OPERAND_TYPE_DISP32S;
@@ -1891,71 +1900,81 @@ operand_type_check (i386_operand_type t, enum operand_type c)
    return 0;
  }
  
-/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in 8bit/16bit/32bit/64bit/80bit size
+   between operand GIVEN and operand WANTED for instruction template T.  */
  
  static INLINE int
-match_reg_size (const insn_template *t, unsigned int j)
+match_operand_size (const insn_template *t, unsigned int wanted,
+                   unsigned int given)
  {
-  return !((i.types[j].bitfield.byte
-           && !t->operand_types[j].bitfield.byte)
-          || (i.types[j].bitfield.word
-              && !t->operand_types[j].bitfield.word)
-          || (i.types[j].bitfield.dword
-              && !t->operand_types[j].bitfield.dword)
-          || (i.types[j].bitfield.qword
-              && !t->operand_types[j].bitfield.qword)
-          || (i.types[j].bitfield.tbyte
-              && !t->operand_types[j].bitfield.tbyte));
+  return !((i.types[given].bitfield.byte
+           && !t->operand_types[wanted].bitfield.byte)
+          || (i.types[given].bitfield.word
+              && !t->operand_types[wanted].bitfield.word)
+          || (i.types[given].bitfield.dword
+              && !t->operand_types[wanted].bitfield.dword)
+          || (i.types[given].bitfield.qword
+              && !t->operand_types[wanted].bitfield.qword)
+          || (i.types[given].bitfield.tbyte
+              && !t->operand_types[wanted].bitfield.tbyte));
  }
  
-/* Return 1 if there is no conflict in SIMD register on
-   operand J for instruction template T.  */
+/* Return 1 if there is no conflict in SIMD register between operand
+   GIVEN and operand WANTED for instruction template T.  */
  
  static INLINE int
-match_simd_size (const insn_template *t, unsigned int j)
+match_simd_size (const insn_template *t, unsigned int wanted,
+                unsigned int given)
  {
-  return !((i.types[j].bitfield.xmmword
-           && !t->operand_types[j].bitfield.xmmword)
-          || (i.types[j].bitfield.ymmword
-              && !t->operand_types[j].bitfield.ymmword)
-          || (i.types[j].bitfield.zmmword
-              && !t->operand_types[j].bitfield.zmmword));
+  return !((i.types[given].bitfield.xmmword
+           && !t->operand_types[wanted].bitfield.xmmword)
+          || (i.types[given].bitfield.ymmword
+              && !t->operand_types[wanted].bitfield.ymmword)
+          || (i.types[given].bitfield.zmmword
+              && !t->operand_types[wanted].bitfield.zmmword));
  }
  
-/* Return 1 if there is no conflict in any size on operand J for
-   instruction template T.  */
+/* Return 1 if there is no conflict in any size between operand GIVEN
+   and operand WANTED for instruction template T.  */
  
  static INLINE int
-match_mem_size (const insn_template *t, unsigned int j)
+match_mem_size (const insn_template *t, unsigned int wanted,
+               unsigned int given)
  {
-  return (match_reg_size (t, j)
-         && !((i.types[j].bitfield.unspecified
+  return (match_operand_size (t, wanted, given)
+         && !((i.types[given].bitfield.unspecified
                 && !i.broadcast
-               && !t->operand_types[j].bitfield.unspecified)
-              || (i.types[j].bitfield.fword
-                  && !t->operand_types[j].bitfield.fword)
+               && !t->operand_types[wanted].bitfield.unspecified)
+              || (i.types[given].bitfield.fword
+                  && !t->operand_types[wanted].bitfield.fword)
                /* For scalar opcode templates to allow register and memory
                   operands at the same time, some special casing is needed
-                 here.  */
-              || ((t->operand_types[j].bitfield.regsimd
+                 here.  Also for v{,p}broadcast*, {,v}pmov{s,z}*, and
+                 down-conversion vpmov*.  */
+              || ((t->operand_types[wanted].bitfield.regsimd
                     && !t->opcode_modifier.broadcast
-                   && (t->operand_types[j].bitfield.dword
-                       || t->operand_types[j].bitfield.qword))
-                  ? (i.types[j].bitfield.xmmword
-                     || i.types[j].bitfield.ymmword
-                     || i.types[j].bitfield.zmmword)
-                  : !match_simd_size(t, j))));
+                   && (t->operand_types[wanted].bitfield.byte
+                       || t->operand_types[wanted].bitfield.word
+                       || t->operand_types[wanted].bitfield.dword
+                       || t->operand_types[wanted].bitfield.qword))
+                  ? (i.types[given].bitfield.xmmword
+                     || i.types[given].bitfield.ymmword
+                     || i.types[given].bitfield.zmmword)
+                  : !match_simd_size(t, wanted, given))));
  }
  
-/* Return 1 if there is no size conflict on any operands for
-   instruction template T.  */
+/* Return value has MATCH_STRAIGHT set if there is no size conflict on any
+   operands for instruction template T, and it has MATCH_REVERSE set if there
+   is no size conflict on any operands for the template with operands reversed
+   (and the template allows for reversing in the first place).  */
  
-static INLINE int
+#define MATCH_STRAIGHT 1
+#define MATCH_REVERSE  2
+
+static INLINE unsigned int
  operand_size_match (const insn_template *t)
  {
-  unsigned int j;
-  int match = 1;
+  unsigned int j, match = MATCH_STRAIGHT;
  
    /* Don't check jump instructions.  */
    if (t->opcode_modifier.jump
@@ -1972,59 +1991,56 @@ operand_size_match (const insn_template *t)
         continue;
  
        if (t->operand_types[j].bitfield.reg
-         && !match_reg_size (t, j))
+         && !match_operand_size (t, j, j))
         {
           match = 0;
           break;
         }
  
        if (t->operand_types[j].bitfield.regsimd
-         && !match_simd_size (t, j))
+         && !match_simd_size (t, j, j))
         {
           match = 0;
           break;
         }
  
        if (t->operand_types[j].bitfield.acc
-         && (!match_reg_size (t, j) || !match_simd_size (t, j)))
+         && (!match_operand_size (t, j, j) || !match_simd_size (t, j, j)))
         {
           match = 0;
           break;
         }
  
-      if (i.types[j].bitfield.mem && !match_mem_size (t, j))
+      if ((i.flags[j] & Operand_Mem) && !match_mem_size (t, j, j))
         {
           match = 0;
           break;
         }
      }
  
-  if (match)
-    return match;
-  else if (!t->opcode_modifier.d)
+  if (!t->opcode_modifier.d)
      {
  mismatch:
-      i.error = operand_size_mismatch;
-      return 0;
+      if (!match)
+       i.error = operand_size_mismatch;
+      return match;
      }
  
    /* Check reverse.  */
    gas_assert (i.operands == 2);
  
-  match = 1;
    for (j = 0; j < 2; j++)
      {
        if ((t->operand_types[j].bitfield.reg
            || t->operand_types[j].bitfield.acc)
-         && !match_reg_size (t, j ? 0 : 1))
+         && !match_operand_size (t, j, !j))
         goto mismatch;
  
-      if (i.types[j].bitfield.mem
-         && !match_mem_size (t, j ? 0 : 1))
+      if ((i.flags[!j] & Operand_Mem) && !match_mem_size (t, j, !j))
         goto mismatch;
      }
  
-  return match;
+  return match | MATCH_REVERSE;
  }
  
  static INLINE int
@@ -2317,8 +2333,9 @@ add_prefix (unsigned int prefix)
        && flag_code == CODE_64BIT)
      {
        if ((i.prefix[REX_PREFIX] & prefix & REX_W)
-         || ((i.prefix[REX_PREFIX] & (REX_R | REX_X | REX_B))
-             && (prefix & (REX_R | REX_X | REX_B))))
+         || (i.prefix[REX_PREFIX] & prefix & REX_R)
+         || (i.prefix[REX_PREFIX] & prefix & REX_X)
+         || (i.prefix[REX_PREFIX] & prefix & REX_B))
         ret = PREFIX_EXIST;
        q = REX_PREFIX;
      }
@@ -2622,6 +2639,10 @@ set_cpu_arch (int dummy ATTRIBUTE_UNUSED)
                   cpu_arch_flags = flags;
                   cpu_arch_isa_flags = flags;
                 }
+             else
+               cpu_arch_isa_flags
+                 = cpu_flags_or (cpu_arch_isa_flags,
+                                 cpu_arch[j].flags);
               (void) restore_line_pointer (e);
               demand_empty_rest_of_line ();
               return;
@@ -3346,8 +3367,10 @@ build_vex_prefix (const insn_template *t)
      {
        unsigned int op;
  
+      /* Determine vector length from the last multi-length vector
+        operand.  */
        vector_length = 0;
-      for (op = 0; op < t->operands; ++op)
+      for (op = t->operands; op--;)
         if (t->operand_types[op].bitfield.xmmword
             && t->operand_types[op].bitfield.ymmword
             && i.types[op].bitfield.ymmword)
@@ -3450,11 +3473,18 @@ build_vex_prefix (const insn_template *t)
  static INLINE bfd_boolean
  is_evex_encoding (const insn_template *t)
  {
-  return t->opcode_modifier.evex
+  return t->opcode_modifier.evex || t->opcode_modifier.disp8memshift
          || t->opcode_modifier.broadcast || t->opcode_modifier.masking
          || t->opcode_modifier.staticrounding || t->opcode_modifier.sae;
  }
  
+static INLINE bfd_boolean
+is_any_vex_encoding (const insn_template *t)
+{
+  return t->opcode_modifier.vex || t->opcode_modifier.vexopcode
+        || is_evex_encoding (t);
+}
+
  /* Build the EVEX prefix.  */
  
  static void
@@ -3594,22 +3624,51 @@ build_evex_prefix (void)
         {
           unsigned int op;
  
+         /* Determine vector length from the last multi-length vector
+            operand.  */
           vec_length = 0;
-         for (op = 0; op < i.tm.operands; ++op)
+         for (op = i.operands; op--;)
             if (i.tm.operand_types[op].bitfield.xmmword
                 + i.tm.operand_types[op].bitfield.ymmword
                 + i.tm.operand_types[op].bitfield.zmmword > 1)
               {
                 if (i.types[op].bitfield.zmmword)
-                 i.tm.opcode_modifier.evex = EVEX512;
+                 {
+                   i.tm.opcode_modifier.evex = EVEX512;
+                   break;
+                 }
                 else if (i.types[op].bitfield.ymmword)
-                 i.tm.opcode_modifier.evex = EVEX256;
+                 {
+                   i.tm.opcode_modifier.evex = EVEX256;
+                   break;
+                 }
                 else if (i.types[op].bitfield.xmmword)
-                 i.tm.opcode_modifier.evex = EVEX128;
-               else
-                 continue;
-               break;
+                 {
+                   i.tm.opcode_modifier.evex = EVEX128;
+                   break;
+                 }
+               else if (i.broadcast && (int) op == i.broadcast->operand)
+                 {
+                   switch (i.broadcast->bytes)
+                     {
+                       case 64:
+                         i.tm.opcode_modifier.evex = EVEX512;
+                         break;
+                       case 32:
+                         i.tm.opcode_modifier.evex = EVEX256;
+                         break;
+                       case 16:
+                         i.tm.opcode_modifier.evex = EVEX128;
+                         break;
+                       default:
+                         abort ();
+                     }
+                   break;
+                 }
               }
+
+         if (op >= MAX_OPERANDS)
+           abort ();
         }
  
        switch (i.tm.opcode_modifier.evex)
@@ -3708,9 +3767,7 @@ bad_register_operand:
  
    gas_assert (i.imm_operands <= 1
               && (i.operands <= 2
-                 || ((i.tm.opcode_modifier.vex
-                      || i.tm.opcode_modifier.vexopcode
-                      || is_evex_encoding (&i.tm))
+                 || (is_any_vex_encoding (&i.tm)
                       && i.operands <= 4)));
  
    exp = &im_expressions[i.imm_operands++];
@@ -3868,10 +3925,14 @@ optimize_encoding (void)
            && i.op[0].regs == i.op[1].regs
            && !i.types[2].bitfield.xmmword
            && (i.tm.opcode_modifier.vex
-              || (!i.mask
+              || ((!i.mask || i.mask->zeroing)
                    && !i.rounding
                    && is_evex_encoding (&i.tm)
-                  && cpu_arch_flags.bitfield.cpuavx512vl))
+                  && (i.vec_encoding != vex_encoding_evex
+                      || i.tm.cpu_flags.bitfield.cpuavx512vl
+                      || (i.tm.operand_types[2].bitfield.zmmword
+                          && i.types[2].bitfield.ymmword)
+                      || cpu_arch_isa_flags.bitfield.cpuavx512vl)))
            && ((i.tm.base_opcode == 0x55
                 || i.tm.base_opcode == 0x6655
                 || i.tm.base_opcode == 0x66df
@@ -3881,7 +3942,11 @@ optimize_encoding (void)
                 || i.tm.base_opcode == 0x66f8
                 || i.tm.base_opcode == 0x66f9
                 || i.tm.base_opcode == 0x66fa
-               || i.tm.base_opcode == 0x66fb)
+               || i.tm.base_opcode == 0x66fb
+               || i.tm.base_opcode == 0x42
+               || i.tm.base_opcode == 0x6642
+               || i.tm.base_opcode == 0x47
+               || i.tm.base_opcode == 0x6647)
                && i.tm.extension_opcode == None))
      {
        /* Optimize: -O2:
@@ -3912,16 +3977,16 @@ optimize_encoding (void)
              EVEX VOP %ymmM, %ymmM, %ymmN
                -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
                -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+          VOP, one of kxord and kxorq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kxorw %kM, %kM, %kN
+          VOP, one of kandnd and kandnq:
+            VEX VOP %kM, %kM, %kN
+              -> VEX kandnw %kM, %kM, %kN
         */
        if (is_evex_encoding (&i.tm))
         {
-         /* If only lower 16 vector registers are used, we can use
-            VEX encoding.  */
-         for (j = 0; j < 3; j++)
-           if (register_number (i.op[j].regs) > 15)
-             break;
-
-         if (j < 3)
+         if (i.vec_encoding == vex_encoding_evex)
             i.tm.opcode_modifier.evex = EVEX128;
           else
             {
@@ -3930,6 +3995,11 @@ optimize_encoding (void)
               i.tm.opcode_modifier.evex = 0;
             }
         }
+      else if (i.tm.operand_types[0].bitfield.regmask)
+       {
+         i.tm.base_opcode &= 0xff;
+         i.tm.opcode_modifier.vexw = VEXW0;
+       }
        else
         i.tm.opcode_modifier.vex = VEX128;
  
@@ -4075,6 +4145,13 @@ md_assemble (char *line)
        return;
      }
  
+  /* Check for data size prefix on VEX/XOP/EVEX encoded insns.  */
+  if (i.prefix[DATA_PREFIX] && is_any_vex_encoding (&i.tm))
+    {
+      as_bad (_("data size prefix invalid with `%s'"), i.tm.name);
+      return;
+    }
+
    /* Check if HLE prefix is OK.  */
    if (i.hle_prefix && !check_hle ())
      return;
@@ -4098,10 +4175,16 @@ md_assemble (char *line)
      }
  
    /* Insert BND prefix.  */
-  if (add_bnd_prefix
-      && i.tm.opcode_modifier.bndprefixok
-      && !i.prefix[BND_PREFIX])
-    add_prefix (BND_PREFIX_OPCODE);
+  if (add_bnd_prefix && i.tm.opcode_modifier.bndprefixok)
+    {
+      if (!i.prefix[BND_PREFIX])
+       add_prefix (BND_PREFIX_OPCODE);
+      else if (i.prefix[BND_PREFIX] != BND_PREFIX_OPCODE)
+       {
+         as_warn (_("replacing `rep'/`repe' prefix by `bnd'"));
+         i.prefix[BND_PREFIX] = BND_PREFIX_OPCODE;
+       }
+    }
  
    /* Check string instruction segment overrides.  */
    if (i.tm.opcode_modifier.isstring && i.mem_operands != 0)
@@ -4155,8 +4238,7 @@ md_assemble (char *line)
        as_warn (_("translating to `%sp'"), i.tm.name);
      }
  
-  if (i.tm.opcode_modifier.vex || i.tm.opcode_modifier.vexopcode
-      || is_evex_encoding (&i.tm))
+  if (is_any_vex_encoding (&i.tm))
      {
        if (flag_code == CODE_16BIT)
         {
@@ -4619,6 +4701,13 @@ parse_operands (char *l, const char *mnemonic)
           /* Now parse operand adding info to 'i' as we go along.  */
           END_STRING_AND_SAVE (l);
  
+         if (i.mem_operands > 1)
+           {
+             as_bad (_("too many memory references for `%s'"),
+                     mnemonic);
+             return 0;
+           }
+
           if (intel_syntax)
             operand_ok =
               i386_intel_operand (token_start,
@@ -4664,14 +4753,21 @@ swap_2_operands (int xchg1, int xchg2)
  {
    union i386_op temp_op;
    i386_operand_type temp_type;
+  unsigned int temp_flags;
    enum bfd_reloc_code_real temp_reloc;
  
    temp_type = i.types[xchg2];
    i.types[xchg2] = i.types[xchg1];
    i.types[xchg1] = temp_type;
+
+  temp_flags = i.flags[xchg2];
+  i.flags[xchg2] = i.flags[xchg1];
+  i.flags[xchg1] = temp_flags;
+
    temp_op = i.op[xchg2];
    i.op[xchg2] = i.op[xchg1];
    i.op[xchg1] = temp_op;
+
    temp_reloc = i.reloc[xchg2];
    i.reloc[xchg2] = i.reloc[xchg1];
    i.reloc[xchg1] = temp_reloc;
@@ -4954,12 +5050,52 @@ optimize_disp (void)
        }
  }
  
+/* Return 1 if there is a match in broadcast bytes between operand
+   GIVEN and instruction template T.   */
+
+static INLINE int
+match_broadcast_size (const insn_template *t, unsigned int given)
+{
+  return ((t->opcode_modifier.broadcast == BYTE_BROADCAST
+          && i.types[given].bitfield.byte)
+         || (t->opcode_modifier.broadcast == WORD_BROADCAST
+             && i.types[given].bitfield.word)
+         || (t->opcode_modifier.broadcast == DWORD_BROADCAST
+             && i.types[given].bitfield.dword)
+         || (t->opcode_modifier.broadcast == QWORD_BROADCAST
+             && i.types[given].bitfield.qword));
+}
+
  /* Check if operands are valid for the instruction.  */
  
  static int
  check_VecOperands (const insn_template *t)
  {
    unsigned int op;
+  i386_cpu_flags cpu;
+  static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
+
+  /* Templates allowing for ZMMword as well as YMMword and/or XMMword for
+     any one operand are implicitly requiring AVX512VL support if the actual
+     operand size is YMMword or XMMword.  Since this function runs after
+     template matching, there's no need to check for YMMword/XMMword in
+     the template.  */
+  cpu = cpu_flags_and (t->cpu_flags, avx512);
+  if (!cpu_flags_all_zero (&cpu)
+      && !t->cpu_flags.bitfield.cpuavx512vl
+      && !cpu_arch_flags.bitfield.cpuavx512vl)
+    {
+      for (op = 0; op < t->operands; ++op)
+       {
+         if (t->operand_types[op].bitfield.zmmword
+             && (i.types[op].bitfield.ymmword
+                 || i.types[op].bitfield.xmmword))
+           {
+             i.error = unsupported;
+             return 1;
+           }
+       }
+    }
  
    /* Without VSIB byte, we can't have a vector register for index.  */
    if (!t->opcode_modifier.vecsib
@@ -5045,42 +5181,67 @@ check_VecOperands (const insn_template *t)
       to the memory operand.  */
    if (i.broadcast)
      {
-      int broadcasted_opnd_size;
+      i386_operand_type type, overlap;
  
        /* Check if specified broadcast is supported in this instruction,
-        and it's applied to memory operand of DWORD or QWORD type,
-        depending on VecESize.  */
-      if (i.broadcast->type != t->opcode_modifier.broadcast
-         || !i.types[i.broadcast->operand].bitfield.mem
-         || (t->opcode_modifier.vecesize == 0
-             && !i.types[i.broadcast->operand].bitfield.dword
-             && !i.types[i.broadcast->operand].bitfield.unspecified)
-         || (t->opcode_modifier.vecesize == 1
-             && !i.types[i.broadcast->operand].bitfield.qword
-             && !i.types[i.broadcast->operand].bitfield.unspecified))
-       goto bad_broadcast;
-
-      broadcasted_opnd_size = t->opcode_modifier.vecesize ? 64 : 32;
-      if (i.broadcast->type == BROADCAST_1TO16)
-       broadcasted_opnd_size <<= 4; /* Broadcast 1to16.  */
-      else if (i.broadcast->type == BROADCAST_1TO8)
-       broadcasted_opnd_size <<= 3; /* Broadcast 1to8.  */
-      else if (i.broadcast->type == BROADCAST_1TO4)
-       broadcasted_opnd_size <<= 2; /* Broadcast 1to4.  */
-      else if (i.broadcast->type == BROADCAST_1TO2)
-       broadcasted_opnd_size <<= 1; /* Broadcast 1to2.  */
-      else
-       goto bad_broadcast;
-
-      if ((broadcasted_opnd_size == 256
-          && !t->operand_types[i.broadcast->operand].bitfield.ymmword)
-         || (broadcasted_opnd_size == 512
-             && !t->operand_types[i.broadcast->operand].bitfield.zmmword))
+        and its broadcast bytes match the memory operand.  */
+      op = i.broadcast->operand;
+      if (!t->opcode_modifier.broadcast
+         || !(i.flags[op] & Operand_Mem)
+         || (!i.types[op].bitfield.unspecified
+             && !match_broadcast_size (t, op)))
         {
         bad_broadcast:
           i.error = unsupported_broadcast;
           return 1;
         }
+
+      i.broadcast->bytes = ((1 << (t->opcode_modifier.broadcast - 1))
+                           * i.broadcast->type);
+      operand_type_set (&type, 0);
+      switch (i.broadcast->bytes)
+       {
+       case 2:
+         type.bitfield.word = 1;
+         break;
+       case 4:
+         type.bitfield.dword = 1;
+         break;
+       case 8:
+         type.bitfield.qword = 1;
+         break;
+       case 16:
+         type.bitfield.xmmword = 1;
+         break;
+       case 32:
+         type.bitfield.ymmword = 1;
+         break;
+       case 64:
+         type.bitfield.zmmword = 1;
+         break;
+       default:
+         goto bad_broadcast;
+       }
+
+      overlap = operand_type_and (type, t->operand_types[op]);
+      if (operand_type_all_zero (&overlap))
+         goto bad_broadcast;
+
+      if (t->opcode_modifier.checkregsize)
+       {
+         unsigned int j;
+
+         type.bitfield.baseindex = 1;
+         for (j = 0; j < i.operands; ++j)
+           {
+             if (j != op
+                 && !operand_type_register_match(i.types[j],
+                                                 t->operand_types[j],
+                                                 type,
+                                                 t->operand_types[op]))
+               goto bad_broadcast;
+           }
+       }
      }
    /* If broadcast is supported in this instruction, we need to check if
       operand of one-element size isn't specified without broadcast.  */
@@ -5092,24 +5253,49 @@ check_VecOperands (const insn_template *t)
           break;
        gas_assert (op < i.operands);
        /* Check size of the memory operand.  */
-      if ((t->opcode_modifier.vecesize == 0
-          && i.types[op].bitfield.dword)
-         || (t->opcode_modifier.vecesize == 1
-             && i.types[op].bitfield.qword))
+      if (match_broadcast_size (t, op))
         {
           i.error = broadcast_needed;
           return 1;
         }
      }
+  else
+    op = MAX_OPERANDS - 1; /* Avoid uninitialized variable warning.  */
  
    /* Check if requested masking is supported.  */
-  if (i.mask
-      && (!t->opcode_modifier.masking
-         || (i.mask->zeroing
-             && t->opcode_modifier.masking == MERGING_MASKING)))
+  if (i.mask)
      {
-      i.error = unsupported_masking;
-      return 1;
+      switch (t->opcode_modifier.masking)
+       {
+       case BOTH_MASKING:
+         break;
+       case MERGING_MASKING:
+         if (i.mask->zeroing)
+           {
+       case 0:
+             i.error = unsupported_masking;
+             return 1;
+           }
+         break;
+       case DYNAMIC_MASKING:
+         /* Memory destinations allow only merging masking.  */
+         if (i.mask->zeroing && i.mem_operands)
+           {
+             /* Find memory operand.  */
+             for (op = 0; op < i.operands; op++)
+               if (i.flags[op] & Operand_Mem)
+                 break;
+             gas_assert (op < i.operands);
+             if (op == i.operands - 1)
+               {
+                 i.error = unsupported_masking;
+                 return 1;
+               }
+           }
+         break;
+       default:
+         abort ();
+       }
      }
  
    /* Check if masking is applied to dest operand.  */
@@ -5147,9 +5333,51 @@ check_VecOperands (const insn_template *t)
        && i.disp_encoding != disp_encoding_32bit)
      {
        if (i.broadcast)
-       i.memshift = t->opcode_modifier.vecesize ? 3 : 2;
-      else
+       i.memshift = t->opcode_modifier.broadcast - 1;
+      else if (t->opcode_modifier.disp8memshift != DISP8_SHIFT_VL)
         i.memshift = t->opcode_modifier.disp8memshift;
+      else
+       {
+         const i386_operand_type *type = NULL;
+
+         i.memshift = 0;
+         for (op = 0; op < i.operands; op++)
+           if (operand_type_check (i.types[op], anymem))
+             {
+               if (t->opcode_modifier.evex == EVEXLIG)
+                 i.memshift = 2 + (i.suffix == QWORD_MNEM_SUFFIX);
+               else if (t->operand_types[op].bitfield.xmmword
+                        + t->operand_types[op].bitfield.ymmword
+                        + t->operand_types[op].bitfield.zmmword <= 1)
+                 type = &t->operand_types[op];
+               else if (!i.types[op].bitfield.unspecified)
+                 type = &i.types[op];
+             }
+           else if (i.types[op].bitfield.regsimd
+                    && t->opcode_modifier.evex != EVEXLIG)
+             {
+               if (i.types[op].bitfield.zmmword)
+                 i.memshift = 6;
+               else if (i.types[op].bitfield.ymmword && i.memshift < 5)
+                 i.memshift = 5;
+               else if (i.types[op].bitfield.xmmword && i.memshift < 4)
+                 i.memshift = 4;
+             }
+
+         if (type)
+           {
+             if (type->bitfield.zmmword)
+               i.memshift = 6;
+             else if (type->bitfield.ymmword)
+               i.memshift = 5;
+             else if (type->bitfield.xmmword)
+               i.memshift = 4;
+           }
+
+         /* For the check in fits_in_disp8().  */
+         if (i.memshift == 0)
+           i.memshift = -1;
+       }
  
        for (op = 0; op < i.operands; op++)
         if (operand_type_check (i.types[op], disp)
@@ -5226,7 +5454,7 @@ match_template (char mnem_suffix)
    i386_operand_type operand_types [MAX_OPERANDS];
    int addr_prefix_disp;
    unsigned int j;
-  unsigned int found_cpu_match;
+  unsigned int found_cpu_match, size_match;
    unsigned int check_register;
    enum i386_error specific_error = 0;
  
@@ -5238,7 +5466,9 @@ match_template (char mnem_suffix)
    addr_prefix_disp = -1;
  
    memset (&suffix_check, 0, sizeof (suffix_check));
-  if (i.suffix == BYTE_MNEM_SUFFIX)
+  if (intel_syntax && i.broadcast)
+    /* nothing */;
+  else if (i.suffix == BYTE_MNEM_SUFFIX)
      suffix_check.no_bsuf = 1;
    else if (i.suffix == WORD_MNEM_SUFFIX)
      suffix_check.no_wsuf = 1;
@@ -5313,7 +5543,8 @@ match_template (char mnem_suffix)
           || (t->opcode_modifier.no_ldsuf && mnemsuf_check.no_ldsuf))
         continue;
  
-      if (!operand_size_match (t))
+      size_match = operand_size_match (t);
+      if (!size_match)
         continue;
  
        for (j = 0; j < MAX_OPERANDS; j++)
@@ -5324,6 +5555,7 @@ match_template (char mnem_suffix)
           && flag_code != CODE_64BIT
           && (intel_syntax
               ? (!t->opcode_modifier.ignoresize
+                && !t->opcode_modifier.broadcast
                  && !intel_float_operand (t->name))
               : intel_float_operand (t->name) != 2)
           && ((!operand_types[0].bitfield.regmmx
@@ -5406,7 +5638,15 @@ match_template (char mnem_suffix)
         continue;
  
        /* We check register size if needed.  */
-      check_register = t->opcode_modifier.checkregsize;
+      if (t->opcode_modifier.checkregsize)
+       {
+         check_register = (1 << t->operands) - 1;
+         if (i.broadcast)
+           check_register &= ~(1 << i.broadcast->operand);
+       }
+      else
+       check_register = 0;
+
        overlap0 = operand_type_and (i.types[0], operand_types[0]);
        switch (t->operands)
         {
@@ -5424,6 +5664,16 @@ match_template (char mnem_suffix)
               && operand_type_equal (&i.types [0], &acc32)
               && operand_type_equal (&i.types [1], &acc32))
             continue;
+         /* xrelease mov %eax, <disp> is another special case.  It must
+            not match the accumulator-only encoding of mov.  */
+         if (flag_code != CODE_64BIT
+             && i.hle_prefix
+             && t->base_opcode == 0xa0
+             && i.types[0].bitfield.acc
+             && operand_type_check (i.types[1], anymem))
+           continue;
+         if (!(size_match & MATCH_STRAIGHT))
+           goto check_reverse;
           /* If we want store form, we reverse direction of operands.  */
           if (i.dir_encoding == dir_encoding_store
               && t->opcode_modifier.d)
@@ -5442,7 +5692,7 @@ match_template (char mnem_suffix)
           overlap1 = operand_type_and (i.types[1], operand_types[1]);
           if (!operand_type_match (overlap0, i.types[0])
               || !operand_type_match (overlap1, i.types[1])
-             || (check_register
+             || ((check_register & 3) == 3
                   && !operand_type_register_match (i.types[0],
                                                    operand_types[0],
                                                    i.types[1],
@@ -5453,6 +5703,8 @@ match_template (char mnem_suffix)
                 continue;
  
  check_reverse:
+             if (!(size_match & MATCH_REVERSE))
+               continue;
               /* Try reversing direction of operands.  */
               overlap0 = operand_type_and (i.types[0], operand_types[1]);
               overlap1 = operand_type_and (i.types[1], operand_types[0]);
@@ -5509,26 +5761,32 @@ check_reverse:
                   /* Fall through.  */
                 case 4:
                   if (!operand_type_match (overlap3, i.types[3])
-                     || (check_register
+                     || ((check_register & 0xa) == 0xa
+                         && !operand_type_register_match (i.types[1],
+                                                           operand_types[1],
+                                                           i.types[3],
+                                                           operand_types[3]))
+                     || ((check_register & 0xc) == 0xc
                           && !operand_type_register_match (i.types[2],
-                                                          operand_types[2],
-                                                          i.types[3],
-                                                          operand_types[3])))
+                                                           operand_types[2],
+                                                           i.types[3],
+                                                           operand_types[3])))
                     continue;
                   /* Fall through.  */
                 case 3:
                   /* Here we make use of the fact that there are no
                      reverse match 3 operand instructions.  */
                   if (!operand_type_match (overlap2, i.types[2])
-                     || (check_register
-                         && (!operand_type_register_match (i.types[0],
+                     || ((check_register & 5) == 5
+                         && !operand_type_register_match (i.types[0],
                                                             operand_types[0],
                                                             i.types[2],
-                                                           operand_types[2])
-                             || !operand_type_register_match (i.types[1],
-                                                              operand_types[1],
-                                                              i.types[2],
-                                                              operand_types[2]))))
+                                                           operand_types[2]))
+                     || ((check_register & 6) == 6
+                         && !operand_type_register_match (i.types[1],
+                                                           operand_types[1],
+                                                           i.types[2],
+                                                           operand_types[2])))
                     continue;
                   break;
                 }
@@ -5601,9 +5859,6 @@ check_reverse:
         case unsupported_broadcast:
           err_msg = _("unsupported broadcast");
           break;
-       case broadcast_not_on_src_operand:
-         err_msg = _("broadcast not on source memory operand");
-         break;
         case broadcast_needed:
           err_msg = _("broadcast is needed for operand of such type");
           break;
@@ -5790,7 +6045,9 @@ process_suffix (void)
         {
           if (intel_syntax
               && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_lsuf)
+             && i.tm.opcode_modifier.no_lsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
             i.suffix = 0;
           else if (!check_long_reg ())
             return 0;
@@ -5799,7 +6056,9 @@ process_suffix (void)
         {
           if (intel_syntax
               && i.tm.opcode_modifier.ignoresize
-             && i.tm.opcode_modifier.no_qsuf)
+             && i.tm.opcode_modifier.no_qsuf
+             && !i.tm.opcode_modifier.todword
+             && !i.tm.opcode_modifier.toqword)
             i.suffix = 0;
           else if (!check_qword_reg ())
             return 0;
@@ -5919,20 +6178,27 @@ process_suffix (void)
        /* Now select between word & dword operations via the operand
          size prefix, except for instructions that will ignore this
          prefix anyway.  */
-      if (i.tm.opcode_modifier.addrprefixop0)
+      if (i.reg_operands > 0
+         && i.types[0].bitfield.reg
+         && i.tm.opcode_modifier.addrprefixopreg
+         && (i.tm.opcode_modifier.immext
+             || i.operands == 1))
         {
           /* The address size override prefix changes the size of the
              first operand.  */
           if ((flag_code == CODE_32BIT
-              && i.op->regs[0].reg_type.bitfield.word)
+              && i.op[0].regs->reg_type.bitfield.word)
               || (flag_code != CODE_32BIT
-                 && i.op->regs[0].reg_type.bitfield.dword))
+                 && i.op[0].regs->reg_type.bitfield.dword))
             if (!add_prefix (ADDR_PREFIX_OPCODE))
               return 0;
         }
        else if (i.suffix != QWORD_MNEM_SUFFIX
                && !i.tm.opcode_modifier.ignoresize
                && !i.tm.opcode_modifier.floatmf
+              && !i.tm.opcode_modifier.vex
+              && !i.tm.opcode_modifier.vexopcode
+              && !is_evex_encoding (&i.tm)
                && ((i.suffix == LONG_MNEM_SUFFIX) == (flag_code == CODE_16BIT)
                    || (flag_code == CODE_64BIT
                        && i.tm.opcode_modifier.jumpbyte)))
@@ -5962,6 +6228,41 @@ process_suffix (void)
        break;
      }
  
+  if (i.reg_operands != 0
+      && i.operands > 1
+      && i.tm.opcode_modifier.addrprefixopreg
+      && !i.tm.opcode_modifier.immext)
+    {
+      /* Reject register operands of the wrong size when the address
+        size override prefix changes the size of register operands.  */
+      unsigned int op;
+      enum { need_word, need_dword, need_qword } need;
+
+      if (flag_code == CODE_32BIT)
+       need = i.prefix[ADDR_PREFIX] ? need_word : need_dword;
+      else
+       {
+         if (i.prefix[ADDR_PREFIX])
+           need = need_dword;
+         else
+           need = flag_code == CODE_64BIT ? need_qword : need_word;
+       }
+
+      for (op = 0; op < i.operands; op++)
+       if (i.types[op].bitfield.reg
+           && ((need == need_word
+                && !i.op[op].regs->reg_type.bitfield.word)
+               || (need == need_dword
+                   && !i.op[op].regs->reg_type.bitfield.dword)
+               || (need == need_qword
+                   && !i.op[op].regs->reg_type.bitfield.qword)))
+         {
+           as_bad (_("invalid register operand size for `%s'"),
+                   i.tm.name);
+           return 0;
+         }
+    }
+
    return 1;
  }
  
@@ -6526,118 +6827,82 @@ build_modrm_byte (void)
    unsigned int source, dest;
    int vex_3_sources;
  
-  /* The first operand of instructions with VEX prefix and 3 sources
-     must be VEX_Imm4.  */
    vex_3_sources = i.tm.opcode_modifier.vexsources == VEX3SOURCES;
    if (vex_3_sources)
      {
        unsigned int nds, reg_slot;
        expressionS *exp;
  
-      if (i.tm.opcode_modifier.veximmext
-          && i.tm.opcode_modifier.immext)
-        {
-          dest = i.operands - 2;
-          gas_assert (dest == 3);
-        }
-      else
-        dest = i.operands - 1;
+      dest = i.operands - 1;
        nds = dest - 1;
  
        /* There are 2 kinds of instructions:
-         1. 5 operands: 4 register operands or 3 register operands
-         plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
-         VexW0 or VexW1.  The destination must be either XMM, YMM or
+        1. 5 operands: 4 register operands or 3 register operands
+        plus 1 memory operand plus one Vec_Imm4 operand, VexXDS, and
+        VexW0 or VexW1.  The destination must be either XMM, YMM or
          ZMM register.
-         2. 4 operands: 4 register operands or 3 register operands
-         plus 1 memory operand, VexXDS, and VexImmExt  */
+        2. 4 operands: 4 register operands or 3 register operands
+        plus 1 memory operand, with VexXDS.  */
        gas_assert ((i.reg_operands == 4
-                   || (i.reg_operands == 3 && i.mem_operands == 1))
-                  && i.tm.opcode_modifier.vexvvvv == VEXXDS
-                  && (i.tm.opcode_modifier.veximmext
-                      || (i.imm_operands == 1
-                          && i.types[0].bitfield.vec_imm4
-                          && (i.tm.opcode_modifier.vexw == VEXW0
-                              || i.tm.opcode_modifier.vexw == VEXW1)
-                          && i.tm.operand_types[dest].bitfield.regsimd)));
+                  || (i.reg_operands == 3 && i.mem_operands == 1))
+                 && i.tm.opcode_modifier.vexvvvv == VEXXDS
+                 && i.tm.opcode_modifier.vexw
+                 && i.tm.operand_types[dest].bitfield.regsimd);
+
+      /* If VexW1 is set, the first non-immediate operand is the source and
+        the second non-immediate one is encoded in the immediate operand.  */
+      if (i.tm.opcode_modifier.vexw == VEXW1)
+       {
+         source = i.imm_operands;
+         reg_slot = i.imm_operands + 1;
+       }
+      else
+       {
+         source = i.imm_operands + 1;
+         reg_slot = i.imm_operands;
+       }
  
        if (i.imm_operands == 0)
-        {
-          /* When there is no immediate operand, generate an 8bit
-             immediate operand to encode the first operand.  */
-          exp = &im_expressions[i.imm_operands++];
-          i.op[i.operands].imms = exp;
-          i.types[i.operands] = imm8;
-          i.operands++;
-          /* If VexW1 is set, the first operand is the source and
-             the second operand is encoded in the immediate operand.  */
-          if (i.tm.opcode_modifier.vexw == VEXW1)
-            {
-              source = 0;
-              reg_slot = 1;
-            }
-          else
-            {
-              source = 1;
-              reg_slot = 0;
-            }
-
-          /* FMA swaps REG and NDS.  */
-          if (i.tm.cpu_flags.bitfield.cpufma)
-            {
-              unsigned int tmp;
-              tmp = reg_slot;
-              reg_slot = nds;
-              nds = tmp;
-            }
-
-          gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
-          exp->X_op = O_constant;
-          exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
+       {
+         /* When there is no immediate operand, generate an 8-bit
+            immediate operand to encode the register in reg_slot.  */
+         exp = &im_expressions[i.imm_operands++];
+         i.op[i.operands].imms = exp;
+         i.types[i.operands] = imm8;
+         i.operands++;
+
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
+         exp->X_op = O_constant;
+         exp->X_add_number = register_number (i.op[reg_slot].regs) << 4;
           gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
         }
        else
-        {
-          unsigned int imm_slot;
-
-          if (i.tm.opcode_modifier.vexw == VEXW0)
-            {
-              /* If VexW0 is set, the third operand is the source and
-                 the second operand is encoded in the immediate
-                 operand.  */
-              source = 2;
-              reg_slot = 1;
-            }
-          else
-            {
-              /* VexW1 is set, the second operand is the source and
-                 the third operand is encoded in the immediate
-                 operand.  */
-              source = 1;
-              reg_slot = 2;
-            }
-
-          if (i.tm.opcode_modifier.immext)
-            {
-              /* When ImmExt is set, the immediate byte is the last
-                 operand.  */
-              imm_slot = i.operands - 1;
-              source--;
-              reg_slot--;
-            }
-          else
-            {
-              imm_slot = 0;
-
-              /* Turn on Imm8 so that output_imm will generate it.  */
-              i.types[imm_slot].bitfield.imm8 = 1;
-            }
-
-          gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
-          i.op[imm_slot].imms->X_add_number
-              |= register_number (i.op[reg_slot].regs) << 4;
+       {
+         unsigned int imm_slot;
+
+         gas_assert (i.imm_operands == 1 && i.types[0].bitfield.vec_imm4);
+
+         if (i.tm.opcode_modifier.immext)
+           {
+             /* When ImmExt is set, the immediate byte is the last
+                operand.  */
+             imm_slot = i.operands - 1;
+             source--;
+             reg_slot--;
+           }
+         else
+           {
+             imm_slot = 0;
+
+             /* Turn on Imm8 so that output_imm will generate it.  */
+             i.types[imm_slot].bitfield.imm8 = 1;
+           }
+
+         gas_assert (i.tm.operand_types[reg_slot].bitfield.regsimd);
+         i.op[imm_slot].imms->X_add_number
+             |= register_number (i.op[reg_slot].regs) << 4;
           gas_assert ((i.op[reg_slot].regs->reg_flags & RegVRex) == 0);
-        }
+       }
  
        gas_assert (i.tm.operand_types[nds].bitfield.regsimd);
        i.vex.register_specifier = i.op[nds].regs;
@@ -6805,12 +7070,11 @@ build_modrm_byte (void)
           if ((i.op[source].regs->reg_flags & RegVRex) != 0)
             i.vrex |= REX_R;
         }
-      if (flag_code != CODE_64BIT && (i.rex & (REX_R | REX_B)))
+      if (flag_code != CODE_64BIT && (i.rex & REX_R))
         {
-         if (!i.types[0].bitfield.control
-             && !i.types[1].bitfield.control)
+         if (!i.types[i.tm.operand_types[0].bitfield.regmem].bitfield.control)
             abort ();
-         i.rex &= ~(REX_R | REX_B);
+         i.rex &= ~REX_R;
           add_prefix (LOCK_PREFIX_OPCODE);
         }
      }
@@ -6830,8 +7094,7 @@ build_modrm_byte (void)
  
           if (i.tm.opcode_modifier.vecsib)
             {
-             if (i.index_reg->reg_num == RegEiz
-                 || i.index_reg->reg_num == RegRiz)
+             if (i.index_reg->reg_num == RegIZ)
                 abort ();
  
               i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
@@ -6902,8 +7165,7 @@ build_modrm_byte (void)
               else if (!i.tm.opcode_modifier.vecsib)
                 {
                   /* !i.base_reg && i.index_reg  */
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                     i.sib.index = NO_INDEX_REGISTER;
                   else
                     i.sib.index = i.index_reg->reg_num;
@@ -6929,8 +7191,7 @@ build_modrm_byte (void)
                 }
             }
           /* RIP addressing for 64bit mode.  */
-         else if (i.base_reg->reg_num == RegRip ||
-                  i.base_reg->reg_num == RegEip)
+         else if (i.base_reg->reg_num == RegIP)
             {
               gas_assert (!i.tm.opcode_modifier.vecsib);
               i.rm.regmem = NO_BASE_REGISTER;
@@ -7022,8 +7283,7 @@ build_modrm_byte (void)
                 }
               else if (!i.tm.opcode_modifier.vecsib)
                 {
-                 if (i.index_reg->reg_num == RegEiz
-                     || i.index_reg->reg_num == RegRiz)
+                 if (i.index_reg->reg_num == RegIZ)
                     i.sib.index = NO_INDEX_REGISTER;
                   else
                     i.sib.index = i.index_reg->reg_num;
@@ -7582,22 +7842,16 @@ output_insn (void)
               if (i.tm.base_opcode & 0xff000000)
                 {
                   prefix = (i.tm.base_opcode >> 24) & 0xff;
-                 goto check_prefix;
+                 add_prefix (prefix);
                 }
               break;
             case 2:
               if ((i.tm.base_opcode & 0xff0000) != 0)
                 {
                   prefix = (i.tm.base_opcode >> 16) & 0xff;
-                 if (i.tm.cpu_flags.bitfield.cpupadlock)
-                   {
-check_prefix:
-                     if (prefix != REPE_PREFIX_OPCODE
-                         || (i.prefix[REP_PREFIX]
-                             != REPE_PREFIX_OPCODE))
-                       add_prefix (prefix);
-                   }
-                 else
+                 if (!i.tm.cpu_flags.bitfield.cpupadlock
+                     || prefix != REPE_PREFIX_OPCODE
+                     || (i.prefix[REP_PREFIX] != REPE_PREFIX_OPCODE))
                     add_prefix (prefix);
                 }
               break;
@@ -7769,7 +8023,8 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
               int size = disp_size (n);
               offsetT val = i.op[n].disps->X_add_number;
  
-             val = offset_in_range (val >> i.memshift, size);
+             val = offset_in_range (val >> (size == 1 ? i.memshift : 0),
+                                    size);
               p = frag_more (size);
               md_number_to_chars (p, val, size);
             }
@@ -7874,8 +8129,7 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
                     {
                       fixP->fx_tcbit = i.rex != 0;
                       if (i.base_reg
-                         && (i.base_reg->reg_num == RegRip
-                             || i.base_reg->reg_num == RegEip))
+                         && (i.base_reg->reg_num == RegIP))
                       fixP->fx_tcbit2 = 1;
                     }
                   else
@@ -8425,15 +8679,15 @@ check_VecOperations (char *op_string, char *op_end)
  
               op_string += 3;
               if (*op_string == '8')
-               bcst_type = BROADCAST_1TO8;
+               bcst_type = 8;
               else if (*op_string == '4')
-               bcst_type = BROADCAST_1TO4;
+               bcst_type = 4;
               else if (*op_string == '2')
-               bcst_type = BROADCAST_1TO2;
+               bcst_type = 2;
               else if (*op_string == '1'
                        && *(op_string+1) == '6')
                 {
-                 bcst_type = BROADCAST_1TO16;
+                 bcst_type = 16;
                   op_string++;
                 }
               else
@@ -8445,6 +8699,7 @@ check_VecOperations (char *op_string, char *op_end)
  
               broadcast_op.type = bcst_type;
               broadcast_op.operand = this_operand;
+             broadcast_op.bytes = 0;
               i.broadcast = &broadcast_op;
             }
           /* Check masking operation.  */
@@ -8525,6 +8780,12 @@ check_VecOperations (char *op_string, char *op_end)
               return NULL;
             }
           op_string++;
+
+         /* Strip whitespace since the addition of pseudo prefixes
+            changed how the scrubber treats '{'.  */
+         if (is_space_char (*op_string))
+           ++op_string;
+
           continue;
         }
      unknown_vec_op:
@@ -8969,9 +9230,7 @@ i386_addressing_mode (void)
  
           if (addr_reg)
             {
-             if (addr_reg->reg_num == RegEip
-                 || addr_reg->reg_num == RegEiz
-                 || addr_reg->reg_type.bitfield.dword)
+             if (addr_reg->reg_type.bitfield.dword)
                 addr_mode = CODE_32BIT;
               else if (flag_code != CODE_64BIT
                        && addr_reg->reg_type.bitfield.word)
@@ -9081,21 +9340,18 @@ bad_address:
         {
           /* 32-bit/64-bit checks.  */
           if ((i.base_reg
-              && (addr_mode == CODE_64BIT
-                  ? !i.base_reg->reg_type.bitfield.qword
-                  : !i.base_reg->reg_type.bitfield.dword)
-              && (i.index_reg
-                  || (i.base_reg->reg_num
-                      != (addr_mode == CODE_64BIT ? RegRip : RegEip))))
+              && ((addr_mode == CODE_64BIT
+                   ? !i.base_reg->reg_type.bitfield.qword
+                   : !i.base_reg->reg_type.bitfield.dword)
+                  || (i.index_reg && i.base_reg->reg_num == RegIP)
+                  || i.base_reg->reg_num == RegIZ))
               || (i.index_reg
                   && !i.index_reg->reg_type.bitfield.xmmword
                   && !i.index_reg->reg_type.bitfield.ymmword
                   && !i.index_reg->reg_type.bitfield.zmmword
                   && ((addr_mode == CODE_64BIT
-                      ? !(i.index_reg->reg_type.bitfield.qword
-                          || i.index_reg->reg_num == RegRiz)
-                      : !(i.index_reg->reg_type.bitfield.dword
-                          || i.index_reg->reg_num == RegEiz))
+                      ? !i.index_reg->reg_type.bitfield.qword
+                      : !i.index_reg->reg_type.bitfield.dword)
                       || !i.index_reg->reg_type.bitfield.baseindex)))
             goto bad_address;
  
@@ -9104,7 +9360,7 @@ bad_address:
               || (current_templates->start->base_opcode & ~1) == 0x0f1a)
             {
               /* They cannot use RIP-relative addressing. */
-             if (i.base_reg && i.base_reg->reg_num == RegRip)
+             if (i.base_reg && i.base_reg->reg_num == RegIP)
                 {
                   as_bad (_("`%s' cannot be used here"), operand_string);
                   return 0;
@@ -9543,20 +9799,19 @@ i386_att_operand (char *operand_string)
  
        /* Special case for (%dx) while doing input/output op.  */
        if (i.base_reg
-         && operand_type_equal (&i.base_reg->reg_type,
-                                &reg16_inoutportreg)
+         && i.base_reg->reg_type.bitfield.inoutportreg
           && i.index_reg == 0
           && i.log2_scale_factor == 0
           && i.seg[i.mem_operands] == 0
           && !operand_type_check (i.types[this_operand], disp))
         {
-         i.types[this_operand] = inoutportreg;
+         i.types[this_operand] = i.base_reg->reg_type;
           return 1;
         }
  
        if (i386_index_check (operand_string) == 0)
         return 0;
-      i.types[this_operand].bitfield.mem = 1;
+      i.flags[this_operand] |= Operand_Mem;
        if (i.mem_operands == 0)
         i.memop1_string = xstrdup (operand_string);
        i.mem_operands++;
@@ -10098,6 +10353,11 @@ parse_real_register (char *reg_string, char **end_op)
    /* Handle floating point regs, allowing spaces in the (i) part.  */
    if (r == i386_regtab /* %st is first entry of table  */)
      {
+      if (!cpu_arch_flags.bitfield.cpu8087
+         && !cpu_arch_flags.bitfield.cpu287
+         && !cpu_arch_flags.bitfield.cpu387)
+       return (const reg_entry *) NULL;
+
        if (is_space_char (*s))
         ++s;
        if (*s == '(')
@@ -10138,50 +10398,44 @@ parse_real_register (char *reg_string, char **end_op)
        && !cpu_arch_flags.bitfield.cpui386)
      return (const reg_entry *) NULL;
  
-  if (r->reg_type.bitfield.tbyte
-      && !cpu_arch_flags.bitfield.cpu8087
-      && !cpu_arch_flags.bitfield.cpu287
-      && !cpu_arch_flags.bitfield.cpu387)
+  if (r->reg_type.bitfield.regmmx && !cpu_arch_flags.bitfield.cpummx)
      return (const reg_entry *) NULL;
  
-  if (r->reg_type.bitfield.regmmx && !cpu_arch_flags.bitfield.cpuregmmx)
-    return (const reg_entry *) NULL;
-
-  if (r->reg_type.bitfield.xmmword && !cpu_arch_flags.bitfield.cpuregxmm)
-    return (const reg_entry *) NULL;
+  if (!cpu_arch_flags.bitfield.cpuavx512f)
+    {
+      if (r->reg_type.bitfield.zmmword || r->reg_type.bitfield.regmask)
+       return (const reg_entry *) NULL;
  
-  if (r->reg_type.bitfield.ymmword && !cpu_arch_flags.bitfield.cpuregymm)
-    return (const reg_entry *) NULL;
+      if (!cpu_arch_flags.bitfield.cpuavx)
+       {
+         if (r->reg_type.bitfield.ymmword)
+           return (const reg_entry *) NULL;
  
-  if (r->reg_type.bitfield.zmmword && !cpu_arch_flags.bitfield.cpuregzmm)
-    return (const reg_entry *) NULL;
+         if (!cpu_arch_flags.bitfield.cpusse && r->reg_type.bitfield.xmmword)
+           return (const reg_entry *) NULL;
+       }
+    }
  
-  if (r->reg_type.bitfield.regmask
-      && !cpu_arch_flags.bitfield.cpuregmask)
+  if (r->reg_type.bitfield.regbnd && !cpu_arch_flags.bitfield.cpumpx)
      return (const reg_entry *) NULL;
  
    /* Don't allow fake index register unless allow_index_reg isn't 0. */
-  if (!allow_index_reg
-      && (r->reg_num == RegEiz || r->reg_num == RegRiz))
+  if (!allow_index_reg && r->reg_num == RegIZ)
      return (const reg_entry *) NULL;
  
-  /* Upper 16 vector register is only available with VREX in 64bit
-     mode.  */
-  if ((r->reg_flags & RegVRex))
+  /* Upper 16 vector registers are only available with VREX in 64bit
+     mode, and require EVEX encoding.  */
+  if (r->reg_flags & RegVRex)
      {
-      if (i.vec_encoding == vex_encoding_default)
-       i.vec_encoding = vex_encoding_evex;
-
-      if (!cpu_arch_flags.bitfield.cpuvrex
-         || i.vec_encoding != vex_encoding_evex
+      if (!cpu_arch_flags.bitfield.cpuavx512f
           || flag_code != CODE_64BIT)
         return (const reg_entry *) NULL;
+
+      i.vec_encoding = vex_encoding_evex;
      }
  
-  if (((r->reg_flags & (RegRex64 | RegRex))
-       || r->reg_type.bitfield.qword)
-      && (!cpu_arch_flags.bitfield.cpulm
-         || !operand_type_equal (&r->reg_type, &control))
+  if (((r->reg_flags & (RegRex64 | RegRex)) || r->reg_type.bitfield.qword)
+      && (!cpu_arch_flags.bitfield.cpulm || !r->reg_type.bitfield.control)
        && flag_code != CODE_64BIT)
      return (const reg_entry *) NULL;
  
@@ -10524,6 +10778,10 @@ md_parse_option (int c, const char *arg)
                       cpu_arch_flags = flags;
                       cpu_arch_isa_flags = flags;
                     }
+                 else
+                   cpu_arch_isa_flags
+                     = cpu_flags_or (cpu_arch_isa_flags,
+                                     cpu_arch[j].flags);
                   break;
                 }
             }
@@ -10894,28 +11152,38 @@ md_show_usage (FILE *stream)
    fprintf (stream, _("\
    -msse2avx               encode SSE instructions with VEX prefix\n"));
    fprintf (stream, _("\
-  -msse-check=[none|error|warning]\n\
+  -msse-check=[none|error|warning] (default: warning)\n\
                            check SSE instructions\n"));
    fprintf (stream, _("\
-  -moperand-check=[none|error|warning]\n\
+  -moperand-check=[none|error|warning] (default: warning)\n\
                            check operand combinations for validity\n"));
    fprintf (stream, _("\
-  -mavxscalar=[128|256]   encode scalar AVX instructions with specific vector\n\
+  -mavxscalar=[128|256] (default: 128)\n\
+                          encode scalar AVX instructions with specific vector\n\
                             length\n"));
    fprintf (stream, _("\
-  -mevexlig=[128|256|512] encode scalar EVEX instructions with specific vector\n\
+  -mevexlig=[128|256|512] (default: 128)\n\
+                          encode scalar EVEX instructions with specific vector\n\
                             length\n"));
    fprintf (stream, _("\
-  -mevexwig=[0|1]         encode EVEX instructions with specific EVEX.W value\n\
+  -mevexwig=[0|1] (default: 0)\n\
+                          encode EVEX instructions with specific EVEX.W value\n\
                             for EVEX.W bit ignored instructions\n"));
    fprintf (stream, _("\
-  -mevexrcig=[rne|rd|ru|rz]\n\
+  -mevexrcig=[rne|rd|ru|rz] (default: rne)\n\
                            encode EVEX instructions with specific EVEX.RC value\n\
                             for SAE-only ignored instructions\n"));
    fprintf (stream, _("\
-  -mmnemonic=[att|intel]  use AT&T/Intel mnemonic\n"));
+  -mmnemonic=[att|intel] "));
+  if (SYSV386_COMPAT)
+    fprintf (stream, _("(default: att)\n"));
+  else
+    fprintf (stream, _("(default: intel)\n"));
+  fprintf (stream, _("\
+                          use AT&T/Intel mnemonic\n"));
    fprintf (stream, _("\
-  -msyntax=[att|intel]    use AT&T/Intel syntax\n"));
+  -msyntax=[att|intel] (default: att)\n\
+                          use AT&T/Intel syntax\n"));
    fprintf (stream, _("\
    -mindex-reg             support pseudo index registers\n"));
    fprintf (stream, _("\
@@ -10929,17 +11197,22 @@ md_show_usage (FILE *stream)
    -mbig-obj               generate big object files\n"));
  #endif
    fprintf (stream, _("\
-  -momit-lock-prefix=[no|yes]\n\
+  -momit-lock-prefix=[no|yes] (default: no)\n\
                            strip all lock prefixes\n"));
    fprintf (stream, _("\
-  -mfence-as-lock-add=[no|yes]\n\
+  -mfence-as-lock-add=[no|yes] (default: no)\n\
                            encode lfence, mfence and sfence as\n\
                             lock addl $0x0, (%%{re}sp)\n"));
    fprintf (stream, _("\
-  -mrelax-relocations=[no|yes]\n\
+  -mrelax-relocations=[no|yes] "));
+  if (DEFAULT_GENERATE_X86_RELAX_RELOCATIONS)
+    fprintf (stream, _("(default: yes)\n"));
+  else
+    fprintf (stream, _("(default: no)\n"));
+  fprintf (stream, _("\
                            generate relax relocations\n"));
    fprintf (stream, _("\
-  -mamd64                 accept only AMD64 ISA\n"));
+  -mamd64                 accept only AMD64 ISA [default]\n"));
    fprintf (stream, _("\
    -mintel64               accept only Intel64 ISA\n"));
  }