if (destreg != XEXP (dst, 0))
dst = replace_equiv_address_nv (dst, destreg);
- emit_insn (gen_cld ());
/* When optimizing for size emit simple rep ; movsb instruction for
- counts not divisible by 4. */
+ counts not divisible by 4. The movl $N, %ecx; rep; stosb
+ sequence is 7 bytes long, so if optimizing for size and count is
+ small enough that some stosl, stosw and stosb instructions without
+ rep are shorter, fall back into the next if. */
- if ((!optimize || optimize_size) && (count == 0 || (count & 0x03)))
+ if ((!optimize || optimize_size)
+ && (count == 0
+ || ((count & 0x03)
+ && (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
{
+ emit_insn (gen_cld ());
+
countreg = ix86_zero_extend_to_Pmode (count_exp);
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
int size = TARGET_64BIT && !optimize_size ? 8 : 4;
unsigned HOST_WIDE_INT offset = 0;
+ emit_insn (gen_cld ());
+
zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
if (count & ~(size - 1))
{
- countreg = copy_to_mode_reg (counter_mode,
- GEN_INT ((count >> (size == 4 ? 2 : 3))
- & (TARGET_64BIT ? -1 : 0x3fffffff)));
- countreg = ix86_zero_extend_to_Pmode (countreg);
- destexp = gen_rtx_ASHIFT (Pmode, countreg, GEN_INT (size == 4 ? 2 : 3));
- destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
- emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
- offset = count & ~(size - 1);
+ unsigned HOST_WIDE_INT repcount;
+ unsigned int max_nonrep;
+
+ repcount = count >> (size == 4 ? 2 : 3);
+ if (!TARGET_64BIT)
+ repcount &= 0x3fffffff;
+
+ /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
+ movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
+ bytes. In both cases the latter seems to be faster for small
+ values of N. */
+ max_nonrep = size == 4 ? 7 : 4;
+ if (!optimize_size)
+ switch (ix86_tune)
+ {
+ case PROCESSOR_PENTIUM4:
+ case PROCESSOR_NOCONA:
+ max_nonrep = 3;
+ break;
+ default:
+ break;
+ }
+
+ if (repcount <= max_nonrep)
+ while (repcount-- > 0)
+ {
+ rtx mem = adjust_automodify_address_nv (dst,
+ GET_MODE (zeroreg),
+ destreg, offset);
+ emit_insn (gen_strset (destreg, mem, zeroreg));
+ offset += size;
+ }
+ else
+ {
+ countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount));
+ countreg = ix86_zero_extend_to_Pmode (countreg);
+ destexp = gen_rtx_ASHIFT (Pmode, countreg,
+ GEN_INT (size == 4 ? 2 : 3));
+ destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
+ emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg,
+ destexp));
+ offset = count & ~(size - 1);
+ }
}
if (size == 8 && (count & 0x04))
{