From: Jan Hubicka
Date: Sun, 25 Mar 2001 12:26:42 +0000 (+0200)
Subject: i386.md (movstrsi): Move offline.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=0945b39d44197d6beffecaec708c89a1695a199b;p=gcc.git

i386.md (movstrsi): Move offline.

	* i386.md (movstrsi): Move offline.
	(movstrdi): New.
	(strmovdi_rex64): New.
	(strmov?i): Accept 64bit.
	(strmov?i_rex64): New.
	(strmov?i_rex_1): New.
	(strmov?i_1): Disable for 64bit.
	(rep_mov?i_rex64): New.
	(rep_mov?i): Disable for 64bit.
	(clrstrsi): Move offline.
	(strset?i_rex64): New.
	(strset?i): Accept 64bit.
	(rep_stos?i): Disable for 64bit.
	(rep_stos?i_rex64): New.
	(strset?i_rex_1): New.
	(strset?i_1): Disable for 64bit.
	(cmpstrsi): Accept 64bit.
	(cmpstrsi_nz_1): Rename to cmpstrqi_nz_1; disable for 64bit.
	(cmpstrqi_nz_rex_1): New.
	(cmpstrsi_1): Rename to cmpstrqi_1; disable for 64bit.
	(strlensi): Move offline.
	(strlendi): New.
	(strlenqi_1): Disable for 64bit; fix constraints.
	(strlenqi_rex_1): New.
	* i386.c (ix86_adjust_counter): New static function.
	(ix86_zero_extend_to_Pmode): Likewise.
	(ix86_expand_aligntest): Likewise.
	(ix86_expand_strlensi_unroll_1): Make static; update for 64bit.
	(ix86_expand_movstr): New global function.
	(ix86_expand_clrstr): New global function.
	(ix86_expand_strlen): New global function.
	* i386-protos.h (ix86_expand_movstr, ix86_expand_clrstr,
	ix86_expand_strlen): Declare.
	(ix86_expand_strlensi_unroll_1): Delete.

From-SVN: r40826
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index db5a254c640..e681730c832 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,40 @@
+Sun Mar 25 14:25:33 CEST 2001  Jan Hubicka
+
+	* i386.md (movstrsi): Move offline.
+	(movstrdi): New.
+	(strmovdi_rex64): New.
+	(strmov?i): Accept 64bit.
+	(strmov?i_rex64): New.
+	(strmov?i_rex_1): New.
+	(strmov?i_1): Disable for 64bit.
+	(rep_mov?i_rex64): New.
+	(rep_mov?i): Disable for 64bit.
+	(clrstrsi): Move offline.
+	(strset?i_rex64): New.
+	(strset?i): Accept 64bit.
+	(rep_stos?i): Disable for 64bit.
+	(rep_stos?i_rex64): New.
+	(strset?i_rex_1): New.
+	(strset?i_1): Disable for 64bit.
+	(cmpstrsi): Accept 64bit.
+	(cmpstrsi_nz_1): Rename to cmpstrqi_nz_1; disable for 64bit.
+	(cmpstrqi_nz_rex_1): New.
+	(cmpstrsi_1): Rename to cmpstrqi_1; disable for 64bit.
+	(strlensi): Move offline.
+	(strlendi): New.
+	(strlenqi_1): Disable for 64bit; fix constraints.
+	(strlenqi_rex_1): New.
+	* i386.c (ix86_adjust_counter): New static function.
+	(ix86_zero_extend_to_Pmode): Likewise.
+	(ix86_expand_aligntest): Likewise.
+	(ix86_expand_strlensi_unroll_1): Make static; update for 64bit.
+	(ix86_expand_movstr): New global function.
+	(ix86_expand_clrstr): New global function.
+	(ix86_expand_strlen): New global function.
+	* i386-protos.h (ix86_expand_movstr, ix86_expand_clrstr,
+	ix86_expand_strlen): Declare.
+	(ix86_expand_strlensi_unroll_1): Delete.
+
 Sat Mar 24 23:15:19 CET 2001  Jan Hubicka

 	* i386.md (cmpdi): Fix operand predicates.
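The heart of the change is that the movstr/clrstr/strlen expanders move "offline" from i386.md into C code in i386.c, where the constant-count memcpy path splits a copy into a rep movs body plus movsw/movsb tails. As a rough, self-contained sketch of that splitting strategy in plain C (memcpy stands in for the emitted string instructions; copy_like_movstr is a hypothetical name for illustration, not part of the patch):

    #include <assert.h>
    #include <string.h>

    /* Toy model of the constant-count path in ix86_expand_movstr:
       copy count/4 words at once (the "rep movsl" part), then mop up
       the 2- and 1-byte tails (the "movsw"/"movsb" parts).  */
    static void
    copy_like_movstr (unsigned char *dst, const unsigned char *src,
                      unsigned long count)
    {
      unsigned long bulk = count & ~3ul;    /* count & ~0x03 in the patch */
      memcpy (dst, src, bulk);              /* rep movsl, countreg = count >> 2 */
      if (count & 0x02)                     /* one movsw */
        memcpy (dst + bulk, src + bulk, 2);
      if (count & 0x01)                     /* one movsb */
        dst[count - 1] = src[count - 1];
    }

    int
    main (void)
    {
      unsigned char src[7] = "abcdef";      /* 7 bytes incl. the NUL */
      unsigned char dst[7] = { 0 };
      copy_like_movstr (dst, src, 7);       /* 1 word + movsw + movsb */
      assert (memcmp (dst, src, 7) == 0);
      return 0;
    }

The new 64-bit path does the same with 8-byte rep movsq chunks, adding one movsl step for a count & 4 remainder, as the diff below shows.
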
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 0cd90cdbf78..28fc3ffebfe 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -80,6 +80,9 @@ extern int long_memory_operand PARAMS ((rtx, enum machine_mode)); extern int aligned_operand PARAMS ((rtx, enum machine_mode)); extern enum machine_mode ix86_cc_mode PARAMS ((enum rtx_code, rtx, rtx)); +extern int ix86_expand_movstr PARAMS ((rtx, rtx, rtx, rtx)); +extern int ix86_expand_clrstr PARAMS ((rtx, rtx, rtx)); +extern int ix86_expand_strlen PARAMS ((rtx, rtx, rtx, rtx)); extern int legitimate_pic_address_disp_p PARAMS ((rtx)); extern int legitimate_address_p PARAMS ((enum machine_mode, rtx, int)); @@ -119,7 +122,6 @@ extern void ix86_split_long_move PARAMS ((rtx[])); extern void ix86_split_ashldi PARAMS ((rtx *, rtx)); extern void ix86_split_ashrdi PARAMS ((rtx *, rtx)); extern void ix86_split_lshrdi PARAMS ((rtx *, rtx)); -extern void ix86_expand_strlensi_unroll_1 PARAMS ((rtx, rtx, rtx)); extern int ix86_address_cost PARAMS ((rtx)); extern rtx ix86_find_base_term PARAMS ((rtx)); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 92f669bccc8..962d0539949 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -566,6 +566,10 @@ static void ix86_set_move_mem_attrs_1 PARAMS ((rtx, rtx, rtx, rtx, rtx)); static void ix86_sched_reorder_pentium PARAMS((rtx *, rtx *)); static void ix86_sched_reorder_ppro PARAMS((rtx *, rtx *)); static HOST_WIDE_INT ix86_GOT_alias_set PARAMS ((void)); +static void ix86_adjust_counter PARAMS ((rtx, HOST_WIDE_INT)); +static rtx ix86_zero_extend_to_Pmode PARAMS ((rtx)); +static rtx ix86_expand_aligntest PARAMS ((rtx, int)); +static void ix86_expand_strlensi_unroll_1 PARAMS ((rtx, rtx)); struct ix86_address { @@ -7068,6 +7072,542 @@ ix86_split_lshrdi (operands, scratch) } } +/* Helper function for the string operations bellow. Dest VARIABLE whether + it is aligned to VALUE bytes. If true, jump to the label. */ +static rtx +ix86_expand_aligntest (variable, value) + rtx variable; + int value; +{ + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); + if (GET_MODE (variable) == DImode) + emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); + else + emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); + emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), + 1, 0, label); + return label; +} + +/* Adjust COUNTER by the VALUE. */ +static void +ix86_adjust_counter (countreg, value) + rtx countreg; + HOST_WIDE_INT value; +{ + if (GET_MODE (countreg) == DImode) + emit_insn (gen_adddi3 (countreg, countreg, GEN_INT (-value))); + else + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-value))); +} + +/* Zero extend possibly SImode EXP to Pmode register. */ +static rtx +ix86_zero_extend_to_Pmode (exp) + rtx exp; +{ + rtx r; + if (GET_MODE (exp) == VOIDmode) + return force_reg (Pmode, exp); + if (GET_MODE (exp) == Pmode) + return copy_to_mode_reg (Pmode, exp); + r = gen_reg_rtx (Pmode); + emit_insn (gen_zero_extendsidi2 (r, exp)); + return r; +} + +/* Expand string move (memcpy) operation. Use i386 string operations when + profitable. expand_clrstr contains similar code. 
*/ +int +ix86_expand_movstr (dst, src, count_exp, align_exp) + rtx dst, src, count_exp, align_exp; +{ + rtx srcreg, destreg, countreg; + enum machine_mode counter_mode; + HOST_WIDE_INT align = 0; + unsigned HOST_WIDE_INT count = 0; + rtx insns; + + start_sequence (); + + if (GET_CODE (align_exp) == CONST_INT) + align = INTVAL (align_exp); + + /* This simple hack avoids all inlining code and simplifies code bellow. */ + if (!TARGET_ALIGN_STRINGOPS) + align = 64; + + if (GET_CODE (count_exp) == CONST_INT) + count = INTVAL (count_exp); + + /* Figure out proper mode for counter. For 32bits it is always SImode, + for 64bits use SImode when possible, otherwise DImode. + Set count to number of bytes copied when known at compile time. */ + if (!TARGET_64BIT || GET_MODE (count_exp) == SImode + || x86_64_zero_extended_value (count_exp)) + counter_mode = SImode; + else + counter_mode = DImode; + + if (counter_mode != SImode && counter_mode != DImode) + abort (); + + destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); + srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); + + emit_insn (gen_cld ()); + + /* When optimizing for size emit simple rep ; movsb instruction for + counts not divisible by 4. */ + + if ((!optimize || optimize_size) && (count == 0 || (count & 0x03))) + { + countreg = ix86_zero_extend_to_Pmode (count_exp); + if (TARGET_64BIT) + emit_insn (gen_rep_movqi_rex64 (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + else + emit_insn (gen_rep_movqi (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + } + + /* For constant aligned (or small unaligned) copies use rep movsl + followed by code copying the rest. For PentiumPro ensure 8 byte + alignment to allow rep movsl acceleration. */ + + else if (count != 0 + && (align >= 8 + || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4) + || optimize_size || count < (unsigned int)64)) + { + int size = TARGET_64BIT && !optimize_size ? 8 : 4; + if (count & ~(size - 1)) + { + countreg = copy_to_mode_reg (counter_mode, + GEN_INT ((count >> (size == 4 ? 2 : 3)) + & (TARGET_64BIT ? -1 : 0x3fffffff))); + countreg = ix86_zero_extend_to_Pmode (countreg); + if (size == 4) + { + if (TARGET_64BIT) + emit_insn (gen_rep_movsi_rex64 (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + else + emit_insn (gen_rep_movsi (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + } + else + emit_insn (gen_rep_movdi_rex64 (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + } + if (size == 8 && (count & 0x04)) + emit_insn (gen_strmovsi (destreg, srcreg)); + if (count & 0x02) + emit_insn (gen_strmovhi (destreg, srcreg)); + if (count & 0x01) + emit_insn (gen_strmovqi (destreg, srcreg)); + } + /* The generic code based on the glibc implementation: + - align destination to 4 bytes (8 byte alignment is used for PentiumPro + allowing accelerated copying there) + - copy the data using rep movsl + - copy the rest. */ + else + { + rtx countreg2; + rtx label = NULL; + + /* In case we don't know anything about the alignment, default to + library version, since it is usually equally fast and result in + shorter code. 
*/ + if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD) + { + end_sequence (); + return 0; + } + + if (TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld ()); + + countreg2 = gen_reg_rtx (Pmode); + countreg = copy_to_mode_reg (counter_mode, count_exp); + + /* We don't use loops to align destination and to copy parts smaller + than 4 bytes, because gcc is able to optimize such code better (in + the case the destination or the count really is aligned, gcc is often + able to predict the branches) and also it is friendlier to the + hardware branch prediction. + + Using loops is benefical for generic case, because we can + handle small counts using the loops. Many CPUs (such as Athlon) + have large REP prefix setup costs. + + This is quite costy. Maybe we can revisit this decision later or + add some customizability to this code. */ + + if (count == 0 + && align < (TARGET_PENTIUMPRO && (count == 0 + || count >= (unsigned int)260) + ? 8 : UNITS_PER_WORD)) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (countreg, GEN_INT (UNITS_PER_WORD - 1), + LEU, 0, counter_mode, 1, 0, label); + } + if (align <= 1) + { + rtx label = ix86_expand_aligntest (destreg, 1); + emit_insn (gen_strmovqi (destreg, srcreg)); + ix86_adjust_counter (countreg, 1); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2) + { + rtx label = ix86_expand_aligntest (destreg, 2); + emit_insn (gen_strmovhi (destreg, srcreg)); + ix86_adjust_counter (countreg, 2); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 + && ((TARGET_PENTIUMPRO && (count == 0 + || count >= (unsigned int)260)) + || TARGET_64BIT)) + { + rtx label = ix86_expand_aligntest (destreg, 4); + emit_insn (gen_strmovsi (destreg, srcreg)); + ix86_adjust_counter (countreg, 4); + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (!TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld ()); + if (TARGET_64BIT) + { + emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), + GEN_INT (3))); + emit_insn (gen_rep_movdi_rex64 (destreg, srcreg, countreg2, + destreg, srcreg, countreg2)); + } + else + { + emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); + emit_insn (gen_rep_movsi (destreg, srcreg, countreg2, + destreg, srcreg, countreg2)); + } + + if (label) + { + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) + emit_insn (gen_strmovsi (destreg, srcreg)); + if ((align <= 4 || count == 0) && TARGET_64BIT) + { + rtx label = ix86_expand_aligntest (countreg, 4); + emit_insn (gen_strmovsi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 2 && count != 0 && (count & 2)) + emit_insn (gen_strmovhi (destreg, srcreg)); + if (align <= 2 || count == 0) + { + rtx label = ix86_expand_aligntest (countreg, 2); + emit_insn (gen_strmovhi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 1 && count != 0 && (count & 1)) + emit_insn (gen_strmovqi (destreg, srcreg)); + if (align <= 1 || count == 0) + { + rtx label = ix86_expand_aligntest (countreg, 1); + emit_insn (gen_strmovqi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } + + insns = get_insns (); + end_sequence (); + + ix86_set_move_mem_attrs (insns, dst, src, destreg, srcreg); + emit_insns (insns); + return 1; +} + +/* Expand string clear operation (bzero). Use i386 string operations when + profitable. expand_movstr contains similar code. 
*/ +int +ix86_expand_clrstr (src, count_exp, align_exp) + rtx src, count_exp, align_exp; +{ + rtx destreg, zeroreg, countreg; + enum machine_mode counter_mode; + HOST_WIDE_INT align = 0; + unsigned HOST_WIDE_INT count = 0; + + if (GET_CODE (align_exp) == CONST_INT) + align = INTVAL (align_exp); + + /* This simple hack avoids all inlining code and simplifies code bellow. */ + if (!TARGET_ALIGN_STRINGOPS) + align = 32; + + if (GET_CODE (count_exp) == CONST_INT) + count = INTVAL (count_exp); + /* Figure out proper mode for counter. For 32bits it is always SImode, + for 64bits use SImode when possible, otherwise DImode. + Set count to number of bytes copied when known at compile time. */ + if (!TARGET_64BIT || GET_MODE (count_exp) == SImode + || x86_64_zero_extended_value (count_exp)) + counter_mode = SImode; + else + counter_mode = DImode; + + destreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); + + emit_insn (gen_cld ()); + + /* When optimizing for size emit simple rep ; movsb instruction for + counts not divisible by 4. */ + + if ((!optimize || optimize_size) && (count == 0 || (count & 0x03))) + { + countreg = ix86_zero_extend_to_Pmode (count_exp); + zeroreg = copy_to_mode_reg (QImode, const0_rtx); + if (TARGET_64BIT) + emit_insn (gen_rep_stosqi_rex64 (destreg, countreg, zeroreg, + destreg, countreg)); + else + emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg, + destreg, countreg)); + } + else if (count != 0 + && (align >= 8 + || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4) + || optimize_size || count < (unsigned int)64)) + { + int size = TARGET_64BIT && !optimize_size ? 8 : 4; + zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx); + if (count & ~(size - 1)) + { + countreg = copy_to_mode_reg (counter_mode, + GEN_INT ((count >> (size == 4 ? 2 : 3)) + & (TARGET_64BIT ? -1 : 0x3fffffff))); + countreg = ix86_zero_extend_to_Pmode (countreg); + if (size == 4) + { + if (TARGET_64BIT) + emit_insn (gen_rep_stossi_rex64 (destreg, countreg, zeroreg, + destreg, countreg)); + else + emit_insn (gen_rep_stossi (destreg, countreg, zeroreg, + destreg, countreg)); + } + else + emit_insn (gen_rep_stosdi_rex64 (destreg, countreg, zeroreg, + destreg, countreg)); + } + if (size == 8 && (count & 0x04)) + emit_insn (gen_strsetsi (destreg, + gen_rtx_SUBREG (SImode, zeroreg, 0))); + if (count & 0x02) + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + if (count & 0x01) + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + } + else + { + rtx countreg2; + rtx label = NULL; + + /* In case we don't know anything about the alignment, default to + library version, since it is usually equally fast and result in + shorter code. */ + if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD) + return 0; + + if (TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld ()); + + countreg2 = gen_reg_rtx (Pmode); + countreg = copy_to_mode_reg (counter_mode, count_exp); + zeroreg = copy_to_mode_reg (Pmode, const0_rtx); + + if (count == 0 + && align < (TARGET_PENTIUMPRO && (count == 0 + || count >= (unsigned int)260) + ? 
8 : UNITS_PER_WORD)) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (countreg, GEN_INT (UNITS_PER_WORD - 1), + LEU, 0, counter_mode, 1, 0, label); + } + if (align <= 1) + { + rtx label = ix86_expand_aligntest (destreg, 1); + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + ix86_adjust_counter (countreg, 1); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2) + { + rtx label = ix86_expand_aligntest (destreg, 2); + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + ix86_adjust_counter (countreg, 2); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && TARGET_PENTIUMPRO && (count == 0 + || count >= (unsigned int)260)) + { + rtx label = ix86_expand_aligntest (destreg, 4); + emit_insn (gen_strsetsi (destreg, (TARGET_64BIT + ? gen_rtx_SUBREG (SImode, zeroreg, 0) + : zeroreg))); + ix86_adjust_counter (countreg, 4); + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (!TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld ()); + if (TARGET_64BIT) + { + emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), + GEN_INT (3))); + emit_insn (gen_rep_stosdi_rex64 (destreg, countreg2, zeroreg, + destreg, countreg2)); + } + else + { + emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); + emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg, + destreg, countreg2)); + } + + if (label) + { + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) + emit_insn (gen_strsetsi (destreg, + gen_rtx_SUBREG (SImode, zeroreg, 0))); + if (TARGET_64BIT && (align <= 4 || count == 0)) + { + rtx label = ix86_expand_aligntest (destreg, 2); + emit_insn (gen_strsetsi (destreg, + gen_rtx_SUBREG (SImode, zeroreg, 0))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 2 && count != 0 && (count & 2)) + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + if (align <= 2 || count == 0) + { + rtx label = ix86_expand_aligntest (destreg, 2); + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 1 && count != 0 && (count & 1)) + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + if (align <= 1 || count == 0) + { + rtx label = ix86_expand_aligntest (destreg, 1); + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } + return 1; +} +/* Expand strlen. */ +int +ix86_expand_strlen (out, src, eoschar, align) + rtx out, src, eoschar, align; +{ + rtx addr, scratch1, scratch2, scratch3, scratch4; + + /* The generic case of strlen expander is long. Avoid it's + expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + + if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 + && !TARGET_INLINE_ALL_STRINGOPS + && !optimize_size + && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)) + return 0; + + addr = force_reg (Pmode, XEXP (src, 0)); + scratch1 = gen_reg_rtx (Pmode); + + if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 + && !optimize_size) + { + /* Well it seems that some optimizer does not combine a call like + foo(strlen(bar), strlen(bar)); + when the move and the subtraction is done here. It does calculate + the length just once when these instructions are done inside of + output_strlen_unroll(). 
But I think since &bar[strlen(bar)] is + often used and I use one fewer register for the lifetime of + output_strlen_unroll() this is better. */ + + emit_move_insn (out, addr); + + ix86_expand_strlensi_unroll_1 (out, align); + + /* strlensi_unroll_1 returns the address of the zero at the end of + the string, like memchr(), so compute the length by subtracting + the start address. */ + if (TARGET_64BIT) + emit_insn (gen_subdi3 (out, out, addr)); + else + emit_insn (gen_subsi3 (out, out, addr)); + } + else + { + scratch2 = gen_reg_rtx (Pmode); + scratch3 = gen_reg_rtx (Pmode); + scratch4 = force_reg (Pmode, constm1_rtx); + + emit_move_insn (scratch3, addr); + eoschar = force_reg (QImode, eoschar); + + emit_insn (gen_cld ()); + if (TARGET_64BIT) + { + emit_insn (gen_strlenqi_rex_1 (scratch1, scratch3, eoschar, + align, scratch4, scratch3)); + emit_insn (gen_one_cmpldi2 (scratch2, scratch1)); + emit_insn (gen_adddi3 (out, scratch2, constm1_rtx)); + } + else + { + emit_insn (gen_strlenqi_1 (scratch1, scratch3, eoschar, + align, scratch4, scratch3)); + emit_insn (gen_one_cmplsi2 (scratch2, scratch1)); + emit_insn (gen_addsi3 (out, scratch2, constm1_rtx)); + } + } + return 1; +} + /* Expand the appropriate insns for doing strlen if not just doing repnz; scasb @@ -7079,9 +7619,9 @@ ix86_split_lshrdi (operands, scratch) This is just the body. It needs the initialisations mentioned above and some address computing at the end. These things are done in i386.md. */ -void -ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) - rtx out, align_rtx, scratch; +static void +ix86_expand_strlensi_unroll_1 (out, align_rtx) + rtx out, align_rtx; { int align; rtx tmp; @@ -7091,6 +7631,7 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) rtx end_0_label = gen_label_rtx (); rtx mem; rtx tmpreg = gen_reg_rtx (SImode); + rtx scratch = gen_reg_rtx (SImode); align = 0; if (GET_CODE (align_rtx) == CONST_INT) @@ -7101,6 +7642,8 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) /* Is there a known alignment and is it less than 4? */ if (align < 4) { + rtx scratch1 = gen_reg_rtx (Pmode); + emit_move_insn (scratch1, out); /* Is there a known alignment and is it not 2? */ if (align != 2) { @@ -7108,26 +7651,26 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ /* Leave just the 3 lower bits. */ - align_rtx = expand_binop (SImode, and_optab, scratch, GEN_INT (3), + align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), NULL_RTX, 0, OPTAB_WIDEN); emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - SImode, 1, 0, align_4_label); + Pmode, 1, 0, align_4_label); emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), EQ, NULL, - SImode, 1, 0, align_2_label); + Pmode, 1, 0, align_2_label); emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), GTU, NULL, - SImode, 1, 0, align_3_label); + Pmode, 1, 0, align_3_label); } else { /* Since the alignment is 2, we have to check 2 or 0 bytes; check if is aligned to 4 - byte. */ - align_rtx = expand_binop (SImode, and_optab, scratch, GEN_INT (2), + align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (2), NULL_RTX, 0, OPTAB_WIDEN); emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - SImode, 1, 0, align_4_label); + Pmode, 1, 0, align_4_label); } mem = gen_rtx_MEM (QImode, out); @@ -7139,7 +7682,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) QImode, 1, 0, end_0_label); /* Increment the address. 
*/ - emit_insn (gen_addsi3 (out, out, const1_rtx)); + if (TARGET_64BIT) + emit_insn (gen_adddi3 (out, out, const1_rtx)); + else + emit_insn (gen_addsi3 (out, out, const1_rtx)); /* Not needed with an alignment of 2 */ if (align != 2) @@ -7149,7 +7695,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, 0, end_0_label); - emit_insn (gen_addsi3 (out, out, const1_rtx)); + if (TARGET_64BIT) + emit_insn (gen_adddi3 (out, out, const1_rtx)); + else + emit_insn (gen_addsi3 (out, out, const1_rtx)); emit_label (align_3_label); } @@ -7157,7 +7706,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, 0, end_0_label); - emit_insn (gen_addsi3 (out, out, const1_rtx)); + if (TARGET_64BIT) + emit_insn (gen_adddi3 (out, out, const1_rtx)); + else + emit_insn (gen_addsi3 (out, out, const1_rtx)); } /* Generate loop to check 4 bytes at a time. It is not a good idea to @@ -7167,7 +7719,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) mem = gen_rtx_MEM (SImode, out); emit_move_insn (scratch, mem); - emit_insn (gen_addsi3 (out, out, GEN_INT (4))); + if (TARGET_64BIT) + emit_insn (gen_adddi3 (out, out, GEN_INT (4))); + else + emit_insn (gen_addsi3 (out, out, GEN_INT (4))); /* This formula yields a nonzero result iff one of the bytes is zero. This saves three branches inside loop and many cycles. */ @@ -7182,6 +7737,7 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) if (TARGET_CMOVE) { rtx reg = gen_reg_rtx (SImode); + rtx reg2 = gen_reg_rtx (Pmode); emit_move_insn (reg, tmpreg); emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); @@ -7194,15 +7750,15 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) reg, tmpreg))); /* Emit lea manually to avoid clobbering of flags. */ - emit_insn (gen_rtx_SET (SImode, reg, - gen_rtx_PLUS (SImode, out, GEN_INT (2)))); + emit_insn (gen_rtx_SET (SImode, reg2, + gen_rtx_PLUS (Pmode, out, GEN_INT (2)))); tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); emit_insn (gen_rtx_SET (VOIDmode, out, - gen_rtx_IF_THEN_ELSE (SImode, tmp, - reg, - out))); + gen_rtx_IF_THEN_ELSE (Pmode, tmp, + reg2, + out))); } else @@ -7221,7 +7777,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) /* Not in the first two. Move two bytes forward. */ emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); - emit_insn (gen_addsi3 (out, out, GEN_INT (2))); + if (TARGET_64BIT) + emit_insn (gen_adddi3 (out, out, GEN_INT (2))); + else + emit_insn (gen_addsi3 (out, out, GEN_INT (2))); emit_label (end_2_label); @@ -7230,7 +7789,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch) /* Avoid branch in fixing the byte. */ tmpreg = gen_lowpart (QImode, tmpreg); emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg)); - emit_insn (gen_subsi3_carry (out, out, GEN_INT (3))); + if (TARGET_64BIT) + emit_insn (gen_subdi3_carry_rex64 (out, out, GEN_INT (3))); + else + emit_insn (gen_subsi3_carry (out, out, GEN_INT (3))); emit_label (end_0_label); } diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0a490a61c23..7a4e0fc4e54 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -13340,192 +13340,51 @@ "" " { - rtx srcreg, destreg, countreg; - int align = 0; - int count = -1; - rtx insns; - - start_sequence (); - - if (GET_CODE (operands[3]) == CONST_INT) - align = INTVAL (operands[3]); - - /* This simple hack avoids all inlining code and simplifies code bellow. 
*/ - if (!TARGET_ALIGN_STRINGOPS) - align = 32; - - if (GET_CODE (operands[2]) == CONST_INT) - count = INTVAL (operands[2]); - - destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); - srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0)); - - emit_insn (gen_cld ()); - - /* When optimizing for size emit simple rep ; movsb instruction for - counts not divisible by 4. */ + if (ix86_expand_movstr (operands[0], operands[1], operands[2], operands[3])) + DONE; + else + FAIL; +}") - if ((!optimize || optimize_size) - && (count < 0 || (count & 0x03))) - { - countreg = copy_to_mode_reg (SImode, operands[2]); - emit_insn (gen_rep_movqi (destreg, srcreg, countreg, - destreg, srcreg, countreg)); - } +(define_expand "movstrdi" + [(use (match_operand:BLK 0 "memory_operand" "")) + (use (match_operand:BLK 1 "memory_operand" "")) + (use (match_operand:DI 2 "nonmemory_operand" "")) + (use (match_operand:DI 3 "const_int_operand" ""))] + "TARGET_64BIT" + " +{ + if (ix86_expand_movstr (operands[0], operands[1], operands[2], operands[3])) + DONE; + else + FAIL; +}") - /* For constant aligned (or small unaligned) copies use rep movsl - followed by code copying the rest. For PentiumPro ensure 8 byte - alignment to allow rep movsl acceleration. */ +;; Most CPUs don't like single string operations +;; Handle this case here to simplify previous expander. - else if (count >= 0 - && (align >= 8 - || (!TARGET_PENTIUMPRO && align >= 4) - || optimize_size || count < 64)) - { - if (count & ~0x03) - { - countreg = copy_to_mode_reg (SImode, - GEN_INT ((count >> 2) - & 0x3fffffff)); - emit_insn (gen_rep_movsi (destreg, srcreg, countreg, - destreg, srcreg, countreg)); - } - if (count & 0x02) - emit_insn (gen_strmovhi (destreg, srcreg)); - if (count & 0x01) - emit_insn (gen_strmovqi (destreg, srcreg)); - } - /* The generic code based on the glibc implementation: - - align destination to 4 bytes (8 byte alignment is used for PentiumPro - allowing accelerated copying there) - - copy the data using rep movsl - - copy the rest. */ - else +(define_expand "strmovdi_rex64" + [(set (match_dup 2) + (mem:DI (match_operand:DI 1 "register_operand" ""))) + (set (mem:DI (match_operand:DI 0 "register_operand" "")) + (match_dup 2)) + (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 8))) + (clobber (reg:CC 17))]) + (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 8))) + (clobber (reg:CC 17))])] + "TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) { - rtx countreg2; - rtx label = NULL; - - /* In case we don't know anything about the alignment, default to - library version, since it is usually equally fast and result in - shorter code. */ - if (!TARGET_INLINE_ALL_STRINGOPS && align < 4) - { - end_sequence (); - FAIL; - } - - if (TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - - countreg2 = gen_reg_rtx (SImode); - countreg = copy_to_mode_reg (SImode, operands[2]); - - /* We don't use loops to align destination and to copy parts smaller - than 4 bytes, because gcc is able to optimize such code better (in - the case the destination or the count really is aligned, gcc is often - able to predict the branches) and also it is friendlier to the - hardware branch prediction. - - Using loops is benefical for generic case, because we can - handle small counts using the loops. Many CPUs (such as Athlon) - have large REP prefix setup costs. - - This is quite costy. Maybe we can revisit this decision later or - add some customizability to this code. 
*/ - - if (count < 0 - && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4)) - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (countreg, GEN_INT (3), - LEU, 0, SImode, 1, 0, label); - } - if (align <= 1) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strmovqi (destreg, srcreg)); - emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 2) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strmovhi (destreg, srcreg)); - emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260)) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strmovsi (destreg, srcreg)); - emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - - if (!TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld()); - emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); - emit_insn (gen_rep_movsi (destreg, srcreg, countreg2, - destreg, srcreg, countreg2)); - - if (label) - { - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 2 && count > 0 && (count & 2)) - emit_insn (gen_strmovhi (destreg, srcreg)); - if (align <= 2 || count < 0) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strmovhi (destreg, srcreg)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 1 && count > 0 && (count & 1)) - emit_insn (gen_strmovsi (destreg, srcreg)); - if (align <= 1 || count < 0) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strmovqi (destreg, srcreg)); - emit_label (label); - LABEL_NUSES (label) = 1; - } + emit_insn (gen_strmovdi_rex_1 (operands[0], operands[1], operands[0], + operands[1])); + DONE; } - - insns = get_insns (); - end_sequence (); - - ix86_set_move_mem_attrs (insns, operands[0], operands[1], destreg, srcreg); - emit_insns (insns); - DONE; + else + operands[2] = gen_reg_rtx (DImode); }") -;; Most CPUs don't like single string operations -;; Handle this case here to simplify previous expander. 
(define_expand "strmovsi" [(set (match_dup 2) @@ -13539,6 +13398,11 @@ "" " { + if (TARGET_64BIT) + { + emit_insn (gen_strmovsi_rex64 (operands[0], operands[1])); + DONE; + } if (TARGET_SINGLE_STRINGOP || optimize_size) { emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0], @@ -13549,6 +13413,28 @@ operands[2] = gen_reg_rtx (SImode); }") +(define_expand "strmovsi_rex64" + [(set (match_dup 2) + (mem:SI (match_operand:DI 1 "register_operand" ""))) + (set (mem:SI (match_operand:DI 0 "register_operand" "")) + (match_dup 2)) + (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 4))) + (clobber (reg:CC 17))]) + (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 4))) + (clobber (reg:CC 17))])] + "TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strmovsi_rex_1 (operands[0], operands[1], operands[0], + operands[1])); + DONE; + } + else + operands[2] = gen_reg_rtx (SImode); +}") + (define_expand "strmovhi" [(set (match_dup 2) (mem:HI (match_operand:SI 1 "register_operand" ""))) @@ -13561,6 +13447,11 @@ "" " { + if (TARGET_64BIT) + { + emit_insn (gen_strmovhi_rex64 (operands[0], operands[1])); + DONE; + } if (TARGET_SINGLE_STRINGOP || optimize_size) { emit_insn (gen_strmovhi_1 (operands[0], operands[1], operands[0], @@ -13571,6 +13462,28 @@ operands[2] = gen_reg_rtx (HImode); }") +(define_expand "strmovhi_rex64" + [(set (match_dup 2) + (mem:HI (match_operand:DI 1 "register_operand" ""))) + (set (mem:HI (match_operand:DI 0 "register_operand" "")) + (match_dup 2)) + (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 2))) + (clobber (reg:CC 17))]) + (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 2))) + (clobber (reg:CC 17))])] + "TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strmovhi_rex_1 (operands[0], operands[1], operands[0], + operands[1])); + DONE; + } + else + operands[2] = gen_reg_rtx (HImode); +}") + (define_expand "strmovqi" [(set (match_dup 2) (mem:QI (match_operand:SI 1 "register_operand" ""))) @@ -13583,6 +13496,11 @@ "" " { + if (TARGET_64BIT) + { + emit_insn (gen_strmovqi_rex64 (operands[0], operands[1])); + DONE; + } if (TARGET_SINGLE_STRINGOP || optimize_size) { emit_insn (gen_strmovqi_1 (operands[0], operands[1], operands[0], @@ -13593,6 +13511,44 @@ operands[2] = gen_reg_rtx (QImode); }") +(define_expand "strmovqi_rex64" + [(set (match_dup 2) + (mem:QI (match_operand:DI 1 "register_operand" ""))) + (set (mem:QI (match_operand:DI 0 "register_operand" "")) + (match_dup 2)) + (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 1))) + (clobber (reg:CC 17))]) + (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 1))) + (clobber (reg:CC 17))])] + "!TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strmovqi_rex_1 (operands[0], operands[1], operands[0], + operands[1])); + DONE; + } + else + operands[2] = gen_reg_rtx (QImode); +}") + +(define_insn "strmovdi_rex_1" + [(set (mem:DI (match_operand:DI 2 "register_operand" "0")) + (mem:DI (match_operand:DI 3 "register_operand" "1"))) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 2) + (const_int 8))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (match_dup 3) + (const_int 8))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "movsq" + [(set_attr "type" "str") + (set_attr "mode" "DI") + (set_attr "memory" "both")]) + (define_insn "strmovsi_1" [(set (mem:SI 
(match_operand:SI 2 "register_operand" "0")) (mem:SI (match_operand:SI 3 "register_operand" "1"))) @@ -13603,8 +13559,24 @@ (plus:SI (match_dup 3) (const_int 4))) (use (reg:SI 19))] - "TARGET_SINGLE_STRINGOP || optimize_size" - "movsl" + "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "movsl|movsd" + [(set_attr "type" "str") + (set_attr "mode" "SI") + (set_attr "memory" "both")]) + +(define_insn "strmovsi_rex_1" + [(set (mem:SI (match_operand:DI 2 "register_operand" "0")) + (mem:SI (match_operand:DI 3 "register_operand" "1"))) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 2) + (const_int 4))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (match_dup 3) + (const_int 4))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "movsl|movsd" [(set_attr "type" "str") (set_attr "mode" "SI") (set_attr "memory" "both")]) @@ -13619,7 +13591,23 @@ (plus:SI (match_dup 3) (const_int 2))) (use (reg:SI 19))] - "TARGET_SINGLE_STRINGOP || optimize_size" + "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "movsw" + [(set_attr "type" "str") + (set_attr "memory" "both") + (set_attr "mode" "HI")]) + +(define_insn "strmovhi_rex_1" + [(set (mem:HI (match_operand:DI 2 "register_operand" "0")) + (mem:HI (match_operand:DI 3 "register_operand" "1"))) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 2) + (const_int 2))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (match_dup 3) + (const_int 2))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" "movsw" [(set_attr "type" "str") (set_attr "memory" "both") @@ -13635,12 +13623,48 @@ (plus:SI (match_dup 3) (const_int 1))) (use (reg:SI 19))] - "TARGET_SINGLE_STRINGOP || optimize_size" + "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" "movsb" [(set_attr "type" "str") (set_attr "memory" "both") (set_attr "mode" "QI")]) +(define_insn "strmovqi_rex_1" + [(set (mem:QI (match_operand:DI 2 "register_operand" "0")) + (mem:QI (match_operand:DI 3 "register_operand" "1"))) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 2) + (const_int 1))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (match_dup 3) + (const_int 1))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "movsb" + [(set_attr "type" "str") + (set_attr "memory" "both") + (set_attr "mode" "QI")]) + +(define_insn "rep_movdi_rex64" + [(set (match_operand:DI 2 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (ashift:DI (match_operand:DI 5 "register_operand" "2") + (const_int 3)) + (match_operand:DI 3 "register_operand" "0"))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (ashift:DI (match_dup 5) (const_int 3)) + (match_operand:DI 4 "register_operand" "1"))) + (set (mem:BLK (match_dup 3)) + (mem:BLK (match_dup 4))) + (use (match_dup 5)) + (use (reg:SI 19))] + "TARGET_64BIT" + "rep\;movsq|rep movsq" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "both") + (set_attr "mode" "DI")]) + (define_insn "rep_movsi" [(set (match_operand:SI 2 "register_operand" "=c") (const_int 0)) (set (match_operand:SI 0 "register_operand" "=D") @@ -13654,7 +13678,27 @@ (mem:BLK (match_dup 4))) (use (match_dup 5)) (use (reg:SI 19))] - "" + "!TARGET_64BIT" + "rep\;movsl|rep movsd" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "both") + (set_attr "mode" "SI")]) 
+ +(define_insn "rep_movsi_rex64" + [(set (match_operand:DI 2 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (ashift:DI (match_operand:DI 5 "register_operand" "2") + (const_int 2)) + (match_operand:DI 3 "register_operand" "0"))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (ashift:DI (match_dup 5) (const_int 2)) + (match_operand:DI 4 "register_operand" "1"))) + (set (mem:BLK (match_dup 3)) + (mem:BLK (match_dup 4))) + (use (match_dup 5)) + (use (reg:SI 19))] + "TARGET_64BIT" "rep\;movsl|rep movsd" [(set_attr "type" "str") (set_attr "prefix_rep" "1") @@ -13672,7 +13716,25 @@ (mem:BLK (match_dup 4))) (use (match_dup 5)) (use (reg:SI 19))] - "" + "!TARGET_64BIT" + "rep\;movsb|rep movsb" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "both") + (set_attr "mode" "SI")]) + +(define_insn "rep_movqi_rex64" + [(set (match_operand:DI 2 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_operand:DI 3 "register_operand" "0") + (match_operand:DI 5 "register_operand" "2"))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (match_operand:DI 4 "register_operand" "1") (match_dup 5))) + (set (mem:BLK (match_dup 3)) + (mem:BLK (match_dup 4))) + (use (match_dup 5)) + (use (reg:SI 19))] + "TARGET_64BIT" "rep\;movsb|rep movsb" [(set_attr "type" "str") (set_attr "prefix_rep" "1") @@ -13682,187 +13744,47 @@ (define_expand "clrstrsi" [(use (match_operand:BLK 0 "memory_operand" "")) (use (match_operand:SI 1 "nonmemory_operand" "")) - (use (match_operand:SI 2 "const_int_operand" ""))] + (use (match_operand 2 "const_int_operand" ""))] "" " { - /* See comments in movstr expanders. The code is mostly identical. */ - - rtx destreg, zeroreg, countreg; - int align = 0; - int count = -1; - rtx insns; - - start_sequence (); - - if (GET_CODE (operands[2]) == CONST_INT) - align = INTVAL (operands[2]); - - /* This simple hack avoids all inlining code and simplifies code bellow. */ - if (!TARGET_ALIGN_STRINGOPS) - align = 32; - - if (GET_CODE (operands[1]) == CONST_INT) - count = INTVAL (operands[1]); - - destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); + if (ix86_expand_clrstr (operands[0], operands[1], operands[2])) + DONE; + else + FAIL; +}") - emit_insn (gen_cld ()); +(define_expand "clrstrdi" + [(use (match_operand:BLK 0 "memory_operand" "")) + (use (match_operand:DI 1 "nonmemory_operand" "")) + (use (match_operand 2 "const_int_operand" ""))] + "TARGET_64BIT" + " +{ + if (ix86_expand_clrstr (operands[0], operands[1], operands[2])) + DONE; + else + FAIL; +}") - /* When optimizing for size emit simple rep ; movsb instruction for - counts not divisible by 4. */ +;; Most CPUs don't like single string operations +;; Handle this case here to simplify previous expander. 
- if ((!optimize || optimize_size) - && (count < 0 || (count & 0x03))) - { - countreg = copy_to_mode_reg (SImode, operands[1]); - zeroreg = copy_to_mode_reg (QImode, const0_rtx); - emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg, - destreg, countreg)); - } - else if (count >= 0 - && (align >= 8 - || (!TARGET_PENTIUMPRO && align >= 4) - || optimize_size || count < 64)) - { - zeroreg = copy_to_mode_reg (SImode, const0_rtx); - if (INTVAL (operands[1]) & ~0x03) - { - countreg = copy_to_mode_reg (SImode, - GEN_INT ((INTVAL (operands[1]) >> 2) - & 0x3fffffff)); - emit_insn (gen_rep_stossi (destreg, countreg, zeroreg, - destreg, countreg)); - } - if (INTVAL (operands[1]) & 0x02) - emit_insn (gen_strsethi (destreg, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - if (INTVAL (operands[1]) & 0x01) - emit_insn (gen_strsetqi (destreg, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - } - else +(define_expand "strsetdi_rex64" + [(set (mem:DI (match_operand:DI 0 "register_operand" "")) + (match_operand:DI 1 "register_operand" "")) + (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 8))) + (clobber (reg:CC 17))])] + "TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) { - rtx countreg2; - rtx label = NULL; - - /* In case we don't know anything about the alignment, default to - library version, since it is usually equally fast and result in - shorter code. */ - if (!TARGET_INLINE_ALL_STRINGOPS && align < 4) - { - end_sequence (); - FAIL; - } - - if (TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - - countreg2 = gen_reg_rtx (SImode); - countreg = copy_to_mode_reg (SImode, operands[1]); - zeroreg = copy_to_mode_reg (SImode, const0_rtx); - - if (count < 0 - && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4)) - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (countreg, GEN_INT (3), - LEU, 0, SImode, 1, 0, label); - } - if (align <= 1) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strsetqi (destreg, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 2) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strsethi (destreg, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260)) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strsetsi (destreg, zeroreg)); - emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - - if (!TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld()); - emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); - emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg, - destreg, countreg2)); - - if (label) - { - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 2 && count > 0 && (count & 2)) - emit_insn (gen_strsethi (destreg, - gen_rtx_SUBREG 
(HImode, zeroreg, 0))); - if (align <= 2 || count < 0) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strsethi (destreg, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 1 && count > 0 && (count & 1)) - emit_insn (gen_strsetqi (destreg, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - if (align <= 1 || count < 0) - { - rtx label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1))); - emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, - SImode, 1, 0, label); - emit_insn (gen_strsetqi (destreg, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } + emit_insn (gen_strsetdi_rex_1 (operands[0], operands[0], operands[1])); + DONE; } - - insns = get_insns (); - end_sequence (); - - ix86_set_move_mem_attrs (insns, operands[0], operands[0], destreg, destreg); - emit_insns (insns); - - DONE; }") -;; Most CPUs don't like single string operations -;; Handle this case here to simplify previous expander. - (define_expand "strsetsi" [(set (mem:SI (match_operand:SI 0 "register_operand" "")) (match_operand:SI 1 "register_operand" "")) @@ -13871,13 +13793,33 @@ "" " { - if (TARGET_SINGLE_STRINGOP || optimize_size) + if (TARGET_64BIT) + { + emit_insn (gen_strsetsi_rex64 (operands[0], operands[1])); + DONE; + } + else if (TARGET_SINGLE_STRINGOP || optimize_size) { emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1])); DONE; } }") +(define_expand "strsetsi_rex64" + [(set (mem:SI (match_operand:DI 0 "register_operand" "")) + (match_operand:SI 1 "register_operand" "")) + (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 4))) + (clobber (reg:CC 17))])] + "TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strsetsi_rex_1 (operands[0], operands[0], operands[1])); + DONE; + } +}") + (define_expand "strsethi" [(set (mem:HI (match_operand:SI 0 "register_operand" "")) (match_operand:HI 1 "register_operand" "")) @@ -13886,13 +13828,33 @@ "" " { - if (TARGET_SINGLE_STRINGOP || optimize_size) + if (TARGET_64BIT) + { + emit_insn (gen_strsethi_rex64 (operands[0], operands[1])); + DONE; + } + else if (TARGET_SINGLE_STRINGOP || optimize_size) { emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1])); DONE; } }") +(define_expand "strsethi_rex64" + [(set (mem:HI (match_operand:DI 0 "register_operand" "")) + (match_operand:HI 1 "register_operand" "")) + (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 2))) + (clobber (reg:CC 17))])] + "TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strsethi_rex_1 (operands[0], operands[0], operands[1])); + DONE; + } +}") + (define_expand "strsetqi" [(set (mem:QI (match_operand:SI 0 "register_operand" "")) (match_operand:QI 1 "register_operand" "")) @@ -13901,13 +13863,46 @@ "" " { - if (TARGET_SINGLE_STRINGOP || optimize_size) + if (TARGET_64BIT) + { + emit_insn (gen_strsetqi_rex64 (operands[0], operands[1])); + DONE; + } + else if (TARGET_SINGLE_STRINGOP || optimize_size) { emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1])); DONE; } }") +(define_expand "strsetqi_rex64" + [(set (mem:QI (match_operand:DI 0 "register_operand" "")) + (match_operand:QI 1 "register_operand" "")) + (parallel [(set (match_dup 0) 
(plus:DI (match_dup 0) (const_int 1))) + (clobber (reg:CC 17))])] + "TARGET_64BIT" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strsetqi_rex_1 (operands[0], operands[0], operands[1])); + DONE; + } +}") + +(define_insn "strsetdi_rex_1" + [(set (mem:SI (match_operand:DI 1 "register_operand" "0")) + (match_operand:SI 2 "register_operand" "a")) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 1) + (const_int 8))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "stosq" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set_attr "mode" "DI")]) + (define_insn "strsetsi_1" [(set (mem:SI (match_operand:SI 1 "register_operand" "0")) (match_operand:SI 2 "register_operand" "a")) @@ -13915,8 +13910,21 @@ (plus:SI (match_dup 1) (const_int 4))) (use (reg:SI 19))] - "TARGET_SINGLE_STRINGOP || optimize_size" - "stosl" + "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "stosl|stosd" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set_attr "mode" "SI")]) + +(define_insn "strsetsi_rex_1" + [(set (mem:SI (match_operand:DI 1 "register_operand" "0")) + (match_operand:SI 2 "register_operand" "a")) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 1) + (const_int 4))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "stosl|stosd" [(set_attr "type" "str") (set_attr "memory" "store") (set_attr "mode" "SI")]) @@ -13928,7 +13936,20 @@ (plus:SI (match_dup 1) (const_int 2))) (use (reg:SI 19))] - "TARGET_SINGLE_STRINGOP || optimize_size" + "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "stosw" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set_attr "mode" "HI")]) + +(define_insn "strsethi_rex_1" + [(set (mem:HI (match_operand:DI 1 "register_operand" "0")) + (match_operand:HI 2 "register_operand" "a")) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 1) + (const_int 2))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" "stosw" [(set_attr "type" "str") (set_attr "memory" "store") @@ -13941,12 +13962,43 @@ (plus:SI (match_dup 1) (const_int 1))) (use (reg:SI 19))] - "TARGET_SINGLE_STRINGOP || optimize_size" + "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" "stosb" [(set_attr "type" "str") (set_attr "memory" "store") (set_attr "mode" "QI")]) +(define_insn "strsetqi_rex_1" + [(set (mem:QI (match_operand:DI 1 "register_operand" "0")) + (match_operand:QI 2 "register_operand" "a")) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 1) + (const_int 1))) + (use (reg:SI 19))] + "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)" + "stosb" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set_attr "mode" "QI")]) + +(define_insn "rep_stosdi_rex64" + [(set (match_operand:DI 1 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (ashift:DI (match_operand:DI 4 "register_operand" "1") + (const_int 3)) + (match_operand:DI 3 "register_operand" "0"))) + (set (mem:BLK (match_dup 3)) + (const_int 0)) + (use (match_operand:DI 2 "register_operand" "a")) + (use (match_dup 4)) + (use (reg:SI 19))] + "TARGET_64BIT" + "rep\;stosq|rep stosq" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "store") + (set_attr "mode" "DI")]) + (define_insn "rep_stossi" [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0)) (set (match_operand:SI 0 
"register_operand" "=D") @@ -13958,7 +14010,25 @@ (use (match_operand:SI 2 "register_operand" "a")) (use (match_dup 4)) (use (reg:SI 19))] - "" + "!TARGET_64BIT" + "rep\;stosl|rep stosd" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "store") + (set_attr "mode" "SI")]) + +(define_insn "rep_stossi_rex64" + [(set (match_operand:DI 1 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (ashift:DI (match_operand:DI 4 "register_operand" "1") + (const_int 2)) + (match_operand:DI 3 "register_operand" "0"))) + (set (mem:BLK (match_dup 3)) + (const_int 0)) + (use (match_operand:SI 2 "register_operand" "a")) + (use (match_dup 4)) + (use (reg:SI 19))] + "TARGET_64BIT" "rep\;stosl|rep stosd" [(set_attr "type" "str") (set_attr "prefix_rep" "1") @@ -13975,7 +14045,24 @@ (use (match_operand:QI 2 "register_operand" "a")) (use (match_dup 4)) (use (reg:SI 19))] - "" + "!TARGET_64BIT" + "rep\;stosb|rep stosb" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "store") + (set_attr "mode" "QI")]) + +(define_insn "rep_stosqi_rex64" + [(set (match_operand:DI 1 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_operand:DI 3 "register_operand" "0") + (match_operand:DI 4 "register_operand" "1"))) + (set (mem:BLK (match_dup 3)) + (const_int 0)) + (use (match_operand:QI 2 "register_operand" "a")) + (use (match_dup 4)) + (use (reg:DI 19))] + "TARGET_64BIT" "rep\;stosb|rep stosb" [(set_attr "type" "str") (set_attr "prefix_rep" "1") @@ -13986,8 +14073,8 @@ [(set (match_operand:SI 0 "register_operand" "") (compare:SI (match_operand:BLK 1 "general_operand" "") (match_operand:BLK 2 "general_operand" ""))) - (use (match_operand:SI 3 "general_operand" "")) - (use (match_operand:SI 4 "immediate_operand" ""))] + (use (match_operand 3 "general_operand" "")) + (use (match_operand 4 "immediate_operand" ""))] "" " { @@ -14001,7 +14088,7 @@ addr2 = copy_to_mode_reg (Pmode, XEXP (operands[2], 0)); count = operands[3]; - countreg = copy_to_mode_reg (SImode, count); + countreg = copy_to_mode_reg (Pmode, count); /* %%% Iff we are testing strict equality, we can use known alignment to good advantage. This may be possible with combine, particularly @@ -14016,14 +14103,27 @@ emit_move_insn (operands[0], const0_rtx); DONE; } - emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align, - addr1, addr2, countreg)); + if (TARGET_64BIT) + emit_insn (gen_cmpstrqi_nz_rex_1 (addr1, addr2, countreg, align, + addr1, addr2, countreg)); + else + emit_insn (gen_cmpstrqi_nz_1 (addr1, addr2, countreg, align, + addr1, addr2, countreg)); } else { - emit_insn (gen_cmpsi_1 (countreg, countreg)); - emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align, - addr1, addr2, countreg)); + if (TARGET_64BIT) + { + emit_insn (gen_cmpdi_1_rex64 (countreg, countreg)); + emit_insn (gen_cmpstrqi_rex_1 (addr1, addr2, countreg, align, + addr1, addr2, countreg)); + } + else + { + emit_insn (gen_cmpsi_1 (countreg, countreg)); + emit_insn (gen_cmpstrqi_1 (addr1, addr2, countreg, align, + addr1, addr2, countreg)); + } } outlow = gen_lowpart (QImode, out); @@ -14054,7 +14154,7 @@ ;; memcmp recognizers. The `cmpsb' opcode does nothing if the count is ;; zero. Emit extra code to make sure that a zero-length compare is EQ. 
-(define_insn "cmpstrsi_nz_1" +(define_insn "cmpstrqi_nz_1" [(set (reg:CC 17) (compare:CC (mem:BLK (match_operand:SI 4 "register_operand" "0")) (mem:BLK (match_operand:SI 5 "register_operand" "1")))) @@ -14064,7 +14164,23 @@ (clobber (match_operand:SI 0 "register_operand" "=S")) (clobber (match_operand:SI 1 "register_operand" "=D")) (clobber (match_operand:SI 2 "register_operand" "=c"))] - "" + "!TARGET_64BIT" + "repz{\;| }cmpsb" + [(set_attr "type" "str") + (set_attr "mode" "QI") + (set_attr "prefix_rep" "1")]) + +(define_insn "cmpstrqi_nz_rex_1" + [(set (reg:CC 17) + (compare:CC (mem:BLK (match_operand:DI 4 "register_operand" "0")) + (mem:BLK (match_operand:DI 5 "register_operand" "1")))) + (use (match_operand:DI 6 "register_operand" "2")) + (use (match_operand:SI 3 "immediate_operand" "i")) + (use (reg:SI 19)) + (clobber (match_operand:DI 0 "register_operand" "=S")) + (clobber (match_operand:DI 1 "register_operand" "=D")) + (clobber (match_operand:DI 2 "register_operand" "=c"))] + "TARGET_64BIT" "repz{\;| }cmpsb" [(set_attr "type" "str") (set_attr "mode" "QI") @@ -14072,7 +14188,7 @@ ;; The same, but the count is not known to not be zero. -(define_insn "cmpstrsi_1" +(define_insn "cmpstrqi_1" [(set (reg:CC 17) (if_then_else:CC (ne (match_operand:SI 6 "register_operand" "2") (const_int 0)) @@ -14085,7 +14201,26 @@ (clobber (match_operand:SI 0 "register_operand" "=S")) (clobber (match_operand:SI 1 "register_operand" "=D")) (clobber (match_operand:SI 2 "register_operand" "=c"))] - "" + "!TARGET_64BIT" + "repz{\;| }cmpsb" + [(set_attr "type" "str") + (set_attr "mode" "QI") + (set_attr "prefix_rep" "1")]) + +(define_insn "cmpstrqi_rex_1" + [(set (reg:CC 17) + (if_then_else:CC (ne (match_operand:DI 6 "register_operand" "2") + (const_int 0)) + (compare:CC (mem:BLK (match_operand:DI 4 "register_operand" "0")) + (mem:BLK (match_operand:DI 5 "register_operand" "1"))) + (const_int 0))) + (use (match_operand:SI 3 "immediate_operand" "i")) + (use (reg:CC 17)) + (use (reg:SI 19)) + (clobber (match_operand:DI 0 "register_operand" "=S")) + (clobber (match_operand:DI 1 "register_operand" "=D")) + (clobber (match_operand:DI 2 "register_operand" "=c"))] + "TARGET_64BIT" "repz{\;| }cmpsb" [(set_attr "type" "str") (set_attr "mode" "QI") @@ -14095,76 +14230,55 @@ [(set (match_operand:SI 0 "register_operand" "") (unspec:SI [(match_operand:BLK 1 "general_operand" "") (match_operand:QI 2 "immediate_operand" "") - (match_operand:SI 3 "immediate_operand" "")] 0))] + (match_operand 3 "immediate_operand" "")] 0))] "" " { - rtx out, addr, scratch1, scratch2, scratch3; - rtx eoschar = operands[2]; - rtx align = operands[3]; - - /* The generic case of strlen expander is long. Avoid it's - expanding unless TARGET_INLINE_ALL_STRINGOPS. */ - - if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 - && !TARGET_INLINE_ALL_STRINGOPS - && !optimize_size - && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)) - FAIL; - - out = operands[0]; - addr = force_reg (Pmode, XEXP (operands[1], 0)); - scratch1 = gen_reg_rtx (SImode); - - if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 - && !optimize_size) - { - /* Well it seems that some optimizer does not combine a call like - foo(strlen(bar), strlen(bar)); - when the move and the subtraction is done here. It does calculate - the length just once when these instructions are done inside of - output_strlen_unroll(). 
But I think since &bar[strlen(bar)] is - often used and I use one fewer register for the lifetime of - output_strlen_unroll() this is better. */ - - if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4) - emit_move_insn (scratch1, addr); - - emit_move_insn (out, addr); - - ix86_expand_strlensi_unroll_1 (out, align, scratch1); - - /* strlensi_unroll_1 returns the address of the zero at the end of - the string, like memchr(), so compute the length by subtracting - the start address. */ - emit_insn (gen_subsi3 (out, out, addr)); - } - else - { - scratch2 = gen_reg_rtx (SImode); - scratch3 = gen_reg_rtx (SImode); - - emit_move_insn (scratch3, addr); + if (ix86_expand_strlen (operands[0], operands[1], operands[2], operands[3])) + DONE; + else + FAIL; +}") - emit_insn (gen_cld ()); - emit_insn (gen_strlensi_1 (scratch1, scratch3, eoschar, - align, constm1_rtx, scratch3)); - emit_insn (gen_one_cmplsi2 (scratch2, scratch1)); - emit_insn (gen_addsi3 (out, scratch2, constm1_rtx)); - } - DONE; +(define_expand "strlendi" + [(set (match_operand:DI 0 "register_operand" "") + (unspec:DI [(match_operand:BLK 1 "general_operand" "") + (match_operand:QI 2 "immediate_operand" "") + (match_operand 3 "immediate_operand" "")] 0))] + "" + " +{ + if (ix86_expand_strlen (operands[0], operands[1], operands[2], operands[3])) + DONE; + else + FAIL; }") -(define_insn "strlensi_1" +(define_insn "strlenqi_1" [(set (match_operand:SI 0 "register_operand" "=&c") (unspec:SI [(mem:BLK (match_operand:SI 5 "register_operand" "1")) - (match_operand:QI 2 "general_operand" "a") + (match_operand:QI 2 "register_operand" "a") (match_operand:SI 3 "immediate_operand" "i") - (match_operand:SI 4 "immediate_operand" "0")] 0)) + (match_operand:SI 4 "register_operand" "0")] 0)) (use (reg:SI 19)) (clobber (match_operand:SI 1 "register_operand" "=D")) (clobber (reg:CC 17))] - "" + "!TARGET_64BIT" + "repnz{\;| }scasb" + [(set_attr "type" "str") + (set_attr "mode" "QI") + (set_attr "prefix_rep" "1")]) + +(define_insn "strlenqi_rex_1" + [(set (match_operand:DI 0 "register_operand" "=&c") + (unspec:DI [(mem:BLK (match_operand:DI 5 "register_operand" "1")) + (match_operand:QI 2 "register_operand" "a") + (match_operand:DI 3 "immediate_operand" "i") + (match_operand:DI 4 "register_operand" "0")] 0)) + (use (reg:SI 19)) + (clobber (match_operand:DI 1 "register_operand" "=D")) + (clobber (reg:CC 17))] + "TARGET_64BIT" "repnz{\;| }scasb" [(set_attr "type" "str") (set_attr "mode" "QI")
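A note on the unrolled strlen path: ix86_expand_strlensi_unroll_1 above checks four bytes per iteration using a formula that, per its comment, "yields a nonzero result iff one of the bytes is zero", saving three branches per word inside the loop. A minimal stand-alone illustration of one standard word-at-a-time test of this kind (the patch emits its own equivalent instruction sequence into tmpreg; has_zero_byte is a hypothetical helper, not the patch's code):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Nonzero iff some byte of W is zero: subtracting 0x01 from a zero
       byte borrows into its high bit, "& ~w" rejects bytes that already
       had the high bit set, and "& 0x80808080" keeps only those flags.  */
    static int
    has_zero_byte (uint32_t w)
    {
      return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
    }

    int
    main (void)
    {
      uint32_t w;
      memcpy (&w, "abcd", 4);           /* no terminator in this word */
      assert (!has_zero_byte (w));
      memcpy (&w, "ab\0d", 4);          /* terminator in byte 2 */
      assert (has_zero_byte (w));
      return 0;
    }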