/* Expand setmem/clrmem for a constant length operand without a
loop if it will be shorter that way.
- With a constant length and without pfd argument a
- clrmem loop is 32 bytes -> 5.3 * xc
- setmem loop is 36 bytes -> 3.6 * (mvi/stc + mvc) */
+ clrmem loop (with PFD) is 30 bytes -> 5 * xc
+ clrmem loop (without PFD) is 24 bytes -> 4 * xc
+ setmem loop (with PFD) is 38 bytes -> ~4 * (mvi/stc + mvc)
+ setmem loop (without PFD) is 32 bytes -> ~4 * (mvi/stc + mvc) */
if (GET_CODE (len) == CONST_INT
- && ((INTVAL (len) <= 256 * 5 && val == const0_rtx)
- || INTVAL (len) <= 257 * 3)
+ && ((val == const0_rtx
+ && (INTVAL (len) <= 256 * 4
+ || (INTVAL (len) <= 256 * 5 && TARGET_SETMEM_PFD(val,len))))
+ || (val != const0_rtx && INTVAL (len) <= 257 * 4))
&& (!TARGET_MVCLE || INTVAL (len) <= 256))
{
HOST_WIDE_INT o, l;
emit_label (loop_start_label);
- if (TARGET_Z10
- && (GET_CODE (len) != CONST_INT || INTVAL (len) > 1024))
+ if (TARGET_SETMEM_PFD (val, len))
{
- /* Issue a write prefetch for the +4 cache line. */
- rtx prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, dst_addr,
- GEN_INT (1024)),
+ /* Issue a write prefetch TARGET_SETMEM_PREFETCH_DISTANCE bytes
+ ahead of dst_addr. */
+ rtx distance = GEN_INT (TARGET_SETMEM_PREFETCH_DISTANCE);
+ rtx prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, dst_addr, distance),
const1_rtx, const0_rtx);
emit_insn (prefetch);
PREFETCH_SCHEDULE_BARRIER_P (prefetch) = true;
without loop statements. */
/* { dg-do compile } */
-/* { dg-options "-O3 -mzarch" } */
+/* { dg-options "-O3 -mzarch -march=z13" } */
-/* 1 mvc */
+/* 1 stc */
+void
+*memset0(void *s, int c)
+{
+ return __builtin_memset (s, c, 1);
+}
+
+/* 1 stc 1 mvc */
void
*memset1(void *s, int c)
{
return __builtin_memset (s, c, 42);
}
-/* 3 mvc */
+/* 3 stc 3 mvc */
void
*memset2(void *s, int c)
{
return __builtin_memset (s, c, 0);
}
-/* mvc */
+/* 1 stc 1 mvc */
void
*memset4(void *s, int c)
{
return __builtin_memset (s, c, 256);
}
-/* 2 mvc */
+/* 2 stc 2 mvc */
void
*memset5(void *s, int c)
{
return __builtin_memset (s, c, 512);
}
-/* still 2 mvc through the additional first byte */
+/* 2 stc 2 mvc - still two pairs since each stc covers one extra byte */
void
*memset6(void *s, int c)
{
return __builtin_memset (s, c, 514);
}
-/* 3 mvc */
+/* 3 stc 2 mvc */
void
*memset7(void *s, int c)
{
return __builtin_memset (s, c, 515);
}
-/* still 3 mvc through the additional first byte */
+/* 4 stc 4 mvc - 4 * 256 + 4 stc bytes */
void
*memset8(void *s, int c)
{
- return __builtin_memset (s, c, 771);
+ return __builtin_memset (s, c, 1028);
}
-/* Use mvc loop: 2 mvc */
+/* 2 stc 1 pfd 2 mvc - start using mvc loop */
void
*memset9(void *s, int c)
{
- return __builtin_memset (s, c, 772);
+ return __builtin_memset (s, c, 1029);
}
-/* 3 mvc with displacement overflow after the first */
+/* 2 stc 1 stcy 3 mvc - displacement overflow after the first */
void
*memset10(void *s, int c)
{
return __builtin_memset ((char*)s + 4000, c, 700);
}
+/* 1 mvi */
+void
+*clrmem0(void *s)
+{
+ return __builtin_memset (s, 0, 1);
+}
+
/* 1 xc */
void
*clrmem1(void *s)
return __builtin_memset (s, 0, 512);
}
-/* 3 xc */
+/* 4 xc */
void
*clrmem6(void *s)
{
- return __builtin_memset (s, 0, 768);
+ return __builtin_memset (s, 0, 1024);
}
-/* start using xc loop */
+/* 2 xc - start using xc loop */
void
*clrmem7(void *s)
+{
+ return __builtin_memset (s, 0, 1025);
+}
+
+/* 5 xc - on z10, PFD would be used in the loop body, so the unrolled
+ variant is still shorter. */
+__attribute__ ((target("tune=z10")))
+void
+*clrmem7_z10(void *s)
+{
+ return __builtin_memset (s, 0, 1025);
+}
+
+/* 5 xc */
+__attribute__ ((target("tune=z10")))
+void
+*clrmem8_z10(void *s)
+{
+ return __builtin_memset (s, 0, 1280);
+}
+
+/* 1 pfd 2 xc - start using xc loop also on z10 */
+__attribute__ ((target("tune=z10")))
+void
+*clrmem9_z10(void *s)
{
return __builtin_memset (s, 0, 1281);
}
-/* 3 xc with displacement overflow after the first */
+/* 3 xc - displacement overflow after the first */
void
-*clrmem8(void *s)
+*clrmem10(void *s)
{
return __builtin_memset (s + 4000, 0, 700);
}
-/* { dg-final { scan-assembler-times "mvc" 19 } } */
-/* { dg-final { scan-assembler-times "xc" 15 } } */
+/* { dg-final { scan-assembler-times "mvi\\s" 1 } } */
+/* { dg-final { scan-assembler-times "mvc\\s" 20 } } */
+/* { dg-final { scan-assembler-times "xc\\s" 28 } } */
+/* { dg-final { scan-assembler-times "stc\\s" 21 } } */
+/* { dg-final { scan-assembler-times "stcy\\s" 1 } } */
+/* { dg-final { scan-assembler-times "pfd\\s" 2 } } */