From: Jakub Jelinek Date: Fri, 6 Aug 2004 10:17:14 +0000 (+0200) Subject: i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually... X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6b32b6286bcc158ec954b458576d50bcd7ed5df3;p=gcc.git i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually needed. * config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually needed. Don't use repz; stosb for -Os with sufficiently small constant sizes. For sufficiently small repz; stos{l,q} repeat counts use a sequence of stos{l,q} instructions instead. From-SVN: r85635 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d83bdbe2e66..eacfe7c2455 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2004-08-06 Jakub Jelinek + + * config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to + the places where it is actually needed. Don't use repz; stosb + for -Os with sufficiently small constant sizes. + For sufficiently small repz; stos{l,q} repeat counts use a sequence + of stos{l,q} instructions instead. + 2004-08-06 Zdenek Dvorak PR tree-optimization/16807 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 22de6e36217..951573415da 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -11508,13 +11508,20 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp) if (destreg != XEXP (dst, 0)) dst = replace_equiv_address_nv (dst, destreg); - emit_insn (gen_cld ()); /* When optimizing for size emit simple rep ; movsb instruction for - counts not divisible by 4. */ + counts not divisible by 4. The movl $N, %ecx; rep; stosb + sequence is 7 bytes long, so if optimizing for size and count is + small enough that some stosl, stosw and stosb instructions without + rep are shorter, fall back into the next if. */ - if ((!optimize || optimize_size) && (count == 0 || (count & 0x03))) + if ((!optimize || optimize_size) + && (count == 0 + || ((count & 0x03) + && (!optimize_size || (count & 0x03) + (count >> 2) > 7)))) { + emit_insn (gen_cld ()); + countreg = ix86_zero_extend_to_Pmode (count_exp); zeroreg = copy_to_mode_reg (QImode, const0_rtx); destexp = gen_rtx_PLUS (Pmode, destreg, countreg); @@ -11528,17 +11535,54 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp) int size = TARGET_64BIT && !optimize_size ? 8 : 4; unsigned HOST_WIDE_INT offset = 0; + emit_insn (gen_cld ()); + zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx); if (count & ~(size - 1)) { - countreg = copy_to_mode_reg (counter_mode, - GEN_INT ((count >> (size == 4 ? 2 : 3)) - & (TARGET_64BIT ? -1 : 0x3fffffff))); - countreg = ix86_zero_extend_to_Pmode (countreg); - destexp = gen_rtx_ASHIFT (Pmode, countreg, GEN_INT (size == 4 ? 2 : 3)); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp)); - offset = count & ~(size - 1); + unsigned HOST_WIDE_INT repcount; + unsigned int max_nonrep; + + repcount = count >> (size == 4 ? 2 : 3); + if (!TARGET_64BIT) + repcount &= 0x3fffffff; + + /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes. + movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN + bytes. In both cases the latter seems to be faster for small + values of N. */ + max_nonrep = size == 4 ? 7 : 4; + if (!optimize_size) + switch (ix86_tune) + { + case PROCESSOR_PENTIUM4: + case PROCESSOR_NOCONA: + max_nonrep = 3; + break; + default: + break; + } + + if (repcount <= max_nonrep) + while (repcount-- > 0) + { + rtx mem = adjust_automodify_address_nv (dst, + GET_MODE (zeroreg), + destreg, offset); + emit_insn (gen_strset (destreg, mem, zeroreg)); + offset += size; + } + else + { + countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount)); + countreg = ix86_zero_extend_to_Pmode (countreg); + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (size == 4 ? 2 : 3)); + destexp = gen_rtx_PLUS (Pmode, destexp, destreg); + emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, + destexp)); + offset = count & ~(size - 1); + } } if (size == 8 && (count & 0x04)) {