| align_branch_fused_bit
| align_branch_jmp_bit);
+/* Types of conditional jump used by macro-fusion.  Each enumerator
+   covers a J<cc>/JN<cc> pair: its value is (base opcode & 0x0e) >> 1,
+   which is how add_branch_padding_frag_p computes it, so the order of
+   the enumerators must not be changed. */
+enum mf_jcc_kind
+ {
+ mf_jcc_jo = 0, /* base opcode 0x70 */
+ mf_jcc_jc, /* base opcode 0x72 */
+ mf_jcc_je, /* base opcode 0x74 */
+ mf_jcc_jna, /* base opcode 0x76 */
+ mf_jcc_js, /* base opcode 0x78 */
+ mf_jcc_jp, /* base opcode 0x7a */
+ mf_jcc_jl, /* base opcode 0x7c */
+ mf_jcc_jle, /* base opcode 0x7e */
+ };
+
+/* Types of flag-modifying instructions used by macro-fusion.  The
+   kind is chosen by maybe_fused_with_jcc_p. */
+enum mf_cmp_kind
+ {
+ mf_cmp_test_and, /* test/and */
+ mf_cmp_alu_cmp, /* add/sub/cmp */
+ mf_cmp_incdec /* inc/dec */
+ };
+
/* The maximum padding size for fused jcc. CMP like instruction can
be 9 bytes and jcc can be 6 bytes. Leave room just in case for
prefixes. */
}
/* Return 1 for test, and, cmp, add, sub, inc and dec which may
- be macro-fused with conditional jumps. */
+ be macro-fused with conditional jumps.
+ NB: If TEST/AND/CMP/ADD/SUB/INC/DEC uses a RIP-relative address,
+ or has one of the following formats:
+
+ cmp m, imm
+ add m, imm
+ sub m, imm
+ test m, imm
+ and m, imm
+ inc m
+ dec m
+
+ it is unfusible. */
static int
-maybe_fused_with_jcc_p (void)
+maybe_fused_with_jcc_p (enum mf_cmp_kind* mf_cmp_p)
{
/* No RIP address. */
if (i.base_reg && i.base_reg->reg_num == RegIP)
if (is_any_vex_encoding (&i.tm))
return 0;
- /* and, add, sub with destination register. */
- if ((i.tm.base_opcode >= 0x20 && i.tm.base_opcode <= 0x25)
- || i.tm.base_opcode <= 5
+ /* add, sub without add/sub m, imm. */
+ if (i.tm.base_opcode <= 5
|| (i.tm.base_opcode >= 0x28 && i.tm.base_opcode <= 0x2d)
|| ((i.tm.base_opcode | 3) == 0x83
- && ((i.tm.extension_opcode | 1) == 0x5
+ && (i.tm.extension_opcode == 0x5
|| i.tm.extension_opcode == 0x0)))
- return (i.types[1].bitfield.class == Reg
- || i.types[1].bitfield.instance == Accum);
+ {
+ *mf_cmp_p = mf_cmp_alu_cmp;
+ return !(i.mem_operands && i.imm_operands);
+ }
- /* test, cmp with any register. */
+ /* and without and m, imm. */
+ if ((i.tm.base_opcode >= 0x20 && i.tm.base_opcode <= 0x25)
+ || ((i.tm.base_opcode | 3) == 0x83
+ && i.tm.extension_opcode == 0x4))
+ {
+ *mf_cmp_p = mf_cmp_test_and;
+ return !(i.mem_operands && i.imm_operands);
+ }
+
+ /* test without test m, imm. */
if ((i.tm.base_opcode | 1) == 0x85
|| (i.tm.base_opcode | 1) == 0xa9
|| ((i.tm.base_opcode | 1) == 0xf7
- && i.tm.extension_opcode == 0)
- || (i.tm.base_opcode >= 0x38 && i.tm.base_opcode <= 0x3d)
+ && i.tm.extension_opcode == 0))
+ {
+ *mf_cmp_p = mf_cmp_test_and;
+ return !(i.mem_operands && i.imm_operands);
+ }
+
+ /* cmp without cmp m, imm. */
+ if ((i.tm.base_opcode >= 0x38 && i.tm.base_opcode <= 0x3d)
|| ((i.tm.base_opcode | 3) == 0x83
&& (i.tm.extension_opcode == 0x7)))
- return (i.types[0].bitfield.class == Reg
- || i.types[0].bitfield.instance == Accum
- || i.types[1].bitfield.class == Reg
- || i.types[1].bitfield.instance == Accum);
+ {
+ *mf_cmp_p = mf_cmp_alu_cmp;
+ return !(i.mem_operands && i.imm_operands);
+ }
- /* inc, dec with any register. */
+ /* inc, dec without inc/dec m. */
if ((i.tm.cpu_flags.bitfield.cpuno64
&& (i.tm.base_opcode | 0xf) == 0x4f)
|| ((i.tm.base_opcode | 1) == 0xff
&& i.tm.extension_opcode <= 0x1))
- return (i.types[0].bitfield.class == Reg
- || i.types[0].bitfield.instance == Accum);
+ {
+ *mf_cmp_p = mf_cmp_incdec;
+ return !i.mem_operands;
+ }
return 0;
}
/* Return 1 if a FUSED_JCC_PADDING frag should be generated. */
static int
-add_fused_jcc_padding_frag_p (void)
+add_fused_jcc_padding_frag_p (enum mf_cmp_kind* mf_cmp_p)
{
/* NB: Don't work with COND_JUMP86 without i386. */
if (!align_branch_power
|| !(align_branch & align_branch_fused_bit))
return 0;
- if (maybe_fused_with_jcc_p ())
+ if (maybe_fused_with_jcc_p (mf_cmp_p))
{
if (last_insn.kind == last_insn_other
|| last_insn.seg != now_seg)
/* Return 1 if a BRANCH_PADDING frag should be generated. */
static int
-add_branch_padding_frag_p (enum align_branch_kind *branch_p)
+add_branch_padding_frag_p (enum align_branch_kind *branch_p,
+ enum mf_jcc_kind *mf_jcc_p)
{
int add_padding;
}
else
{
+ /* Because J<cc> and JN<cc> share the same group in the macro-fusible
+ table, ignore the lowest bit. */
+ *mf_jcc_p = (i.tm.base_opcode & 0x0e) >> 1;
*branch_p = align_branch_jcc;
if ((align_branch & align_branch_jcc_bit))
add_padding = 1;
offsetT insn_start_off;
fragS *fragP = NULL;
enum align_branch_kind branch = align_branch_none;
+ /* The initializer is arbitrary and only avoids an uninitialized-variable
+ error: the value is either assigned in add_branch_padding_frag_p
+ or never used. */
+ enum mf_jcc_kind mf_jcc = mf_jcc_jo;
#if defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF)
if (IS_ELF && x86_used_note)
insn_start_frag = frag_now;
insn_start_off = frag_now_fix ();
- if (add_branch_padding_frag_p (&branch))
+ if (add_branch_padding_frag_p (&branch, &mf_jcc))
{
char *p;
/* Branch can be 8 bytes. Leave some room for prefixes. */
ENCODE_RELAX_STATE (BRANCH_PADDING, 0),
NULL, 0, p);
+ fragP->tc_frag_data.mf_type = mf_jcc;
fragP->tc_frag_data.branch_type = branch;
fragP->tc_frag_data.max_bytes = max_branch_padding_size;
}
unsigned char *q;
unsigned int j;
unsigned int prefix;
+ enum mf_cmp_kind mf_cmp;
if (avoid_fence
&& (i.tm.base_opcode == 0xfaee8
if (branch)
/* Skip if this is a branch. */
;
- else if (add_fused_jcc_padding_frag_p ())
+ else if (add_fused_jcc_padding_frag_p (&mf_cmp))
{
/* Make room for padding. */
frag_grow (MAX_FUSED_JCC_PADDING_SIZE);
ENCODE_RELAX_STATE (FUSED_JCC_PADDING, 0),
NULL, 0, p);
+ fragP->tc_frag_data.mf_type = mf_cmp;
fragP->tc_frag_data.branch_type = align_branch_fused;
fragP->tc_frag_data.max_bytes = MAX_FUSED_JCC_PADDING_SIZE;
}
}
#endif
+/* Return 1 if a flag-modifying instruction of kind MF_CMP can be
+   macro-fused with a conditional jump of kind MF_JCC, 0 otherwise.
+   MF_JCC identifies a J<cc>/JN<cc> pair, so both rows of a pair in
+   the table below give the same answer.  The decision follows
+   Table 3-2. Macro-Fusible Instructions in Haswell Microarchitecture
+   Note: the table also applies to Skylake and Cascadelake.
+---------------------------------------------------------------------
+| JCC | ADD/SUB/CMP | INC/DEC | TEST/AND |
+| ------ | ----------- | ------- | -------- |
+| Jo | N | N | Y |
+| Jno | N | N | Y |
+| Jc/Jb | Y | N | Y |
+| Jae/Jnb | Y | N | Y |
+| Je/Jz | Y | Y | Y |
+| Jne/Jnz | Y | Y | Y |
+| Jna/Jbe | Y | N | Y |
+| Ja/Jnbe | Y | N | Y |
+| Js | N | N | Y |
+| Jns | N | N | Y |
+| Jp/Jpe | N | N | Y |
+| Jnp/Jpo | N | N | Y |
+| Jl/Jnge | Y | Y | Y |
+| Jge/Jnl | Y | Y | Y |
+| Jle/Jng | Y | Y | Y |
+| Jg/Jnle | Y | Y | Y |
+--------------------------------------------------------------------- */
+static int
+i386_macro_fusible_p (enum mf_cmp_kind mf_cmp, enum mf_jcc_kind mf_jcc)
+{
+ if (mf_cmp == mf_cmp_alu_cmp)
+ return ((mf_jcc >= mf_jcc_jc && mf_jcc <= mf_jcc_jna)
+ || mf_jcc == mf_jcc_jl || mf_jcc == mf_jcc_jle);
+ if (mf_cmp == mf_cmp_incdec)
+ return (mf_jcc == mf_jcc_je || mf_jcc == mf_jcc_jl
+ || mf_jcc == mf_jcc_jle);
+ if (mf_cmp == mf_cmp_test_and)
+ return 1;
+ return 0;
+}
+
/* Return the next non-empty frag. */
static fragS *
/* Return the next jcc frag after BRANCH_PADDING. */
static fragS *
-i386_next_jcc_frag (fragS *fragP)
+i386_next_fusible_jcc_frag (fragS *maybe_cmp_fragP, fragS *pad_fragP)
{
- if (!fragP)
+ fragS *branch_fragP;
+ if (!pad_fragP)
return NULL;
- if (fragP->fr_type == rs_machine_dependent
- && (TYPE_FROM_RELAX_STATE (fragP->fr_subtype)
+ if (pad_fragP->fr_type == rs_machine_dependent
+ && (TYPE_FROM_RELAX_STATE (pad_fragP->fr_subtype)
== BRANCH_PADDING))
{
- fragP = i386_next_non_empty_frag (fragP);
- if (fragP->fr_type != rs_machine_dependent)
+ branch_fragP = i386_next_non_empty_frag (pad_fragP);
+ if (branch_fragP->fr_type != rs_machine_dependent)
return NULL;
- if (TYPE_FROM_RELAX_STATE (fragP->fr_subtype) == COND_JUMP)
- return fragP;
+ if (TYPE_FROM_RELAX_STATE (branch_fragP->fr_subtype) == COND_JUMP
+ && i386_macro_fusible_p (maybe_cmp_fragP->tc_frag_data.mf_type,
+ pad_fragP->tc_frag_data.mf_type))
+ return branch_fragP;
}
return NULL;
*/
cmp_fragP = i386_next_non_empty_frag (next_fragP);
pad_fragP = i386_next_non_empty_frag (cmp_fragP);
- branch_fragP = i386_next_jcc_frag (pad_fragP);
+ branch_fragP = i386_next_fusible_jcc_frag (next_fragP, pad_fragP);
if (branch_fragP)
{
/* The BRANCH_PADDING frag is merged with the
--- /dev/null
+#as: -mbranches-within-32B-boundaries
+#objdump: -dw
+
+.*: +file format .*
+
+Disassembly of section .text:
+
+0+ <foo>:
+ 0: 65 a3 01 00 00 00 mov %eax,%gs:0x1
+ 6: 55 push %ebp
+ 7: 55 push %ebp
+ 8: 55 push %ebp
+ 9: 55 push %ebp
+ a: 89 e5 mov %esp,%ebp
+ c: 89 7d f8 mov %edi,-0x8\(%ebp\)
+ f: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 12: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 15: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 18: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 1b: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 1e: 39 c5 cmp %eax,%ebp
+ 20: 70 62 jo 84 <foo\+0x84>
+ 22: 89 73 f4 mov %esi,-0xc\(%ebx\)
+ 25: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 28: 89 7d f8 mov %edi,-0x8\(%ebp\)
+ 2b: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 2e: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 31: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 34: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 37: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 3a: 5d pop %ebp
+ 3b: 5d pop %ebp
+ 3c: 5d pop %ebp
+ 3d: 74 45 je 84 <foo\+0x84>
+ 3f: 5d pop %ebp
+ 40: 74 42 je 84 <foo\+0x84>
+ 42: 89 44 24 fc mov %eax,-0x4\(%esp\)
+ 46: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 49: 89 7d f8 mov %edi,-0x8\(%ebp\)
+ 4c: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 4f: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 52: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 55: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 58: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 5b: 5d pop %ebp
+ 5c: eb 2c jmp 8a <foo\+0x8a>
+ 5e: 66 90 xchg %ax,%ax
+ 60: eb 28 jmp 8a <foo\+0x8a>
+ 62: eb 26 jmp 8a <foo\+0x8a>
+ 64: 89 45 fc mov %eax,-0x4\(%ebp\)
+ 67: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 6a: 89 7d f8 mov %edi,-0x8\(%ebp\)
+ 6d: 5d pop %ebp
+ 6e: 5d pop %ebp
+ 6f: 40 inc %eax
+ 70: 72 12 jb 84 <foo\+0x84>
+ 72: 36 36 89 45 fc ss mov %eax,%ss:-0x4\(%ebp\)
+ 77: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 7a: 89 7d f8 mov %edi,-0x8\(%ebp\)
+ 7d: 89 75 f4 mov %esi,-0xc\(%ebp\)
+ 80: 21 c3 and %eax,%ebx
+ 82: 7c 06 jl 8a <foo\+0x8a>
+ 84: 8b 45 f4 mov -0xc\(%ebp\),%eax
+ 87: 89 45 fc mov %eax,-0x4\(%ebp\)
+ 8a: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ 90: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ 96: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ 9c: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ a2: 89 75 0c mov %esi,0xc\(%ebp\)
+ a5: e9 fc ff ff ff jmp a6 <foo\+0xa6>
+ aa: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ b0: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ b6: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ bc: 89 b5 50 fb ff ff mov %esi,-0x4b0\(%ebp\)
+ c2: 89 75 00 mov %esi,0x0\(%ebp\)
+ c5: 74 c3 je 8a <foo\+0x8a>
+ c7: 74 c1 je 8a <foo\+0x8a>
+#pass
--- /dev/null
+#as: -mbranches-within-32B-boundaries
+#objdump: -dw
+
+.*: +file format .*
+
+Disassembly of section .text:
+
+0+ <foo>:
+ 0: c1 e9 02 shr \$0x2,%ecx
+ 3: c1 e9 02 shr \$0x2,%ecx
+ 6: c1 e9 02 shr \$0x2,%ecx
+ 9: 89 d1 mov %edx,%ecx
+ b: 31 c0 xor %eax,%eax
+ d: c1 e9 02 shr \$0x2,%ecx
+ 10: c1 e9 02 shr \$0x2,%ecx
+ 13: c1 e9 02 shr \$0x2,%ecx
+ 16: c1 e9 02 shr \$0x2,%ecx
+ 19: c1 e9 02 shr \$0x2,%ecx
+ 1c: c1 e9 02 shr \$0x2,%ecx
+ 1f: 80 fa 02 cmp \$0x2,%dl
+ 22: 70 df jo 3 <foo\+0x3>
+ 24: 2e 2e 2e 2e 31 c0 cs cs cs cs xor %eax,%eax
+ 2a: c1 e9 02 shr \$0x2,%ecx
+ 2d: c1 e9 02 shr \$0x2,%ecx
+ 30: c1 e9 02 shr \$0x2,%ecx
+ 33: 89 d1 mov %edx,%ecx
+ 35: 31 c0 xor %eax,%eax
+ 37: c1 e9 02 shr \$0x2,%ecx
+ 3a: c1 e9 02 shr \$0x2,%ecx
+ 3d: c1 e9 02 shr \$0x2,%ecx
+ 40: f6 c2 02 test \$0x2,%dl
+ 43: 75 e8 jne 2d <foo\+0x2d>
+ 45: 31 c0 xor %eax,%eax
+ 47: c1 e9 02 shr \$0x2,%ecx
+ 4a: c1 e9 02 shr \$0x2,%ecx
+ 4d: 89 d1 mov %edx,%ecx
+ 4f: c1 e9 02 shr \$0x2,%ecx
+ 52: c1 e9 02 shr \$0x2,%ecx
+ 55: 89 d1 mov %edx,%ecx
+ 57: c1 e9 02 shr \$0x2,%ecx
+ 5a: 89 d1 mov %edx,%ecx
+ 5c: 31 c0 xor %eax,%eax
+ 5e: ff c0 inc %eax
+ 60: 76 cb jbe 2d <foo\+0x2d>
+ 62: 31 c0 xor %eax,%eax
+#pass