From: Andreas Krebbel Date: Tue, 31 Jul 2018 15:41:59 +0000 (+0000) Subject: S/390: Don't emit prefetch instructions for clrmem X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4de3a1e16a86d23dec7cea6e5f7f710cf5769e71;p=gcc.git S/390: Don't emit prefetch instructions for clrmem gcc/ChangeLog: 2018-07-31 Andreas Krebbel * config/s390/s390.c (s390_expand_setmem): Make the unrolling depend on whether prefetch instructions will be emitted or not. Use TARGET_SETMEM_PFD for checking whether prefetch instructions will be emitted or not. * config/s390/s390.h (TARGET_SETMEM_PREFETCH_DISTANCE) (TARGET_SETMEM_PFD): New macros. gcc/testsuite/ChangeLog: 2018-07-31 Andreas Krebbel * gcc.target/s390/memset-1.c: Improve testcase. From-SVN: r263165 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 133eaf39cdb..50bd14c7af1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2018-07-31 Andreas Krebbel + + * config/s390/s390.c (s390_expand_setmem): Make the unrolling + depend on whether prefetch instructions will be emitted or not. + Use TARGET_SETMEM_PFD for checking whether prefetch instructions + will be emitted or not. + * config/s390/s390.h (TARGET_SETMEM_PREFETCH_DISTANCE) + (TARGET_SETMEM_PFD): New macros. + 2018-07-31 Richard Sandiford * tree-vectorizer.h (stmt_vec_info): Turn back into a typedef. diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index a579e9d5a27..ec588a2e72e 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -5499,12 +5499,15 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) /* Expand setmem/clrmem for a constant length operand without a loop if it will be shorter that way. 
- With a constant length and without pfd argument a - clrmem loop is 32 bytes -> 5.3 * xc - setmem loop is 36 bytes -> 3.6 * (mvi/stc + mvc) */ + clrmem loop (with PFD) is 30 bytes -> 5 * xc + clrmem loop (without PFD) is 24 bytes -> 4 * xc + setmem loop (with PFD) is 38 bytes -> ~4 * (mvi/stc + mvc) + setmem loop (without PFD) is 32 bytes -> ~4 * (mvi/stc + mvc) */ if (GET_CODE (len) == CONST_INT - && ((INTVAL (len) <= 256 * 5 && val == const0_rtx) - || INTVAL (len) <= 257 * 3) + && ((val == const0_rtx + && (INTVAL (len) <= 256 * 4 + || (INTVAL (len) <= 256 * 5 && TARGET_SETMEM_PFD(val,len)))) + || (val != const0_rtx && INTVAL (len) <= 257 * 4)) && (!TARGET_MVCLE || INTVAL (len) <= 256)) { HOST_WIDE_INT o, l; @@ -5618,12 +5621,11 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) emit_label (loop_start_label); - if (TARGET_Z10 - && (GET_CODE (len) != CONST_INT || INTVAL (len) > 1024)) + if (TARGET_SETMEM_PFD (val, len)) { - /* Issue a write prefetch for the +4 cache line. */ - rtx prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, dst_addr, - GEN_INT (1024)), + /* Issue a write prefetch. */ + rtx distance = GEN_INT (TARGET_SETMEM_PREFETCH_DISTANCE); + rtx prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, dst_addr, distance), const1_rtx, const0_rtx); emit_insn (prefetch); PREFETCH_SCHEDULE_BARRIER_P (prefetch) = true; diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h index 71a12b8c92e..c6aedcd5eaa 100644 --- a/gcc/config/s390/s390.h +++ b/gcc/config/s390/s390.h @@ -181,6 +181,16 @@ enum processor_flags #define TARGET_AVOID_CMP_AND_BRANCH (s390_tune == PROCESSOR_2817_Z196) +/* Issue a write prefetch for the +4 cache line. */ +#define TARGET_SETMEM_PREFETCH_DISTANCE 1024 + +/* Expand to a C expression evaluating to true if a setmem to VAL of + length LEN should be emitted using prefetch instructions. 
*/ +#define TARGET_SETMEM_PFD(VAL,LEN) \ + (TARGET_Z10 \ + && (s390_tune < PROCESSOR_2964_Z13 || (VAL) != const0_rtx) \ + && (!CONST_INT_P (LEN) || INTVAL ((LEN)) > TARGET_SETMEM_PREFETCH_DISTANCE)) + /* Run-time target specification. */ /* Defaults for option flags defined only on some subtargets. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2cc69735ff5..f741d280fd5 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2018-07-31 Andreas Krebbel + + * gcc.target/s390/memset-1.c: Improve testcase. + 2018-07-31 Tom de Vries PR debug/86687 diff --git a/gcc/testsuite/gcc.target/s390/memset-1.c b/gcc/testsuite/gcc.target/s390/memset-1.c index 7b43b97c208..3e201df1aed 100644 --- a/gcc/testsuite/gcc.target/s390/memset-1.c +++ b/gcc/testsuite/gcc.target/s390/memset-1.c @@ -2,16 +2,23 @@ without loop statements. */ /* { dg-do compile } */ -/* { dg-options "-O3 -mzarch" } */ +/* { dg-options "-O3 -mzarch -march=z13" } */ -/* 1 mvc */ +/* 1 stc */ +void +*memset0(void *s, int c) +{ + return __builtin_memset (s, c, 1); +} + +/* 1 stc 1 mvc */ void *memset1(void *s, int c) { return __builtin_memset (s, c, 42); } -/* 3 mvc */ +/* 3 stc 3 mvc */ void *memset2(void *s, int c) { @@ -25,55 +32,62 @@ void return __builtin_memset (s, c, 0); } -/* mvc */ +/* 1 stc 1 mvc */ void *memset4(void *s, int c) { return __builtin_memset (s, c, 256); } -/* 2 mvc */ +/* 2 stc 2 mvc */ void *memset5(void *s, int c) { return __builtin_memset (s, c, 512); } -/* still 2 mvc through the additional first byte */ +/* 2 stc 2 mvc - still due to the stc bytes */ void *memset6(void *s, int c) { return __builtin_memset (s, c, 514); } -/* 3 mvc */ +/* 3 stc 2 mvc */ void *memset7(void *s, int c) { return __builtin_memset (s, c, 515); } -/* still 3 mvc through the additional first byte */ +/* 4 stc 4 mvc - 4 * 256 + 4 stc bytes */ void *memset8(void *s, int c) { - return __builtin_memset (s, c, 771); + return __builtin_memset (s, c, 1028); } -/* Use 
mvc loop: 2 mvc */ +/* 2 stc 1 pfd 2 mvc - start using mvc loop */ void *memset9(void *s, int c) { - return __builtin_memset (s, c, 772); + return __builtin_memset (s, c, 1029); } -/* 3 mvc with displacement overflow after the first */ +/* 2 stc 1 stcy 3 mvc - displacement overflow after the first */ void *memset10(void *s, int c) { return __builtin_memset ((char*)s + 4000, c, 700); } +/* 1 mvi */ +void +*clrmem0(void *s) +{ + return __builtin_memset (s, 0, 1); +} + /* 1 xc */ void *clrmem1(void *s) @@ -109,26 +123,55 @@ void return __builtin_memset (s, 0, 512); } -/* 3 xc */ +/* 4 xc */ void *clrmem6(void *s) { - return __builtin_memset (s, 0, 768); + return __builtin_memset (s, 0, 1024); } -/* start using xc loop */ +/* 2 xc - start using xc loop*/ void *clrmem7(void *s) +{ + return __builtin_memset (s, 0, 1025); +} + +/* 5 xc - on z10 PFD would be used in the loop body so the unrolled + variant would still be shorter. */ +__attribute__ ((target("tune=z10"))) +void +*clrmem7_z10(void *s) +{ + return __builtin_memset (s, 0, 1025); +} + +/* 5 xc */ +__attribute__ ((target("tune=z10"))) +void +*clrmem8_z10(void *s) +{ + return __builtin_memset (s, 0, 1280); +} + +/* 1 pfd 2 xc - start using xc loop also on z10 */ +__attribute__ ((target("tune=z10"))) +void +*clrmem9_z10(void *s) { return __builtin_memset (s, 0, 1281); } -/* 3 xc with displacement overflow after the first */ +/* 3 xc - displacement overflow after the first */ void -*clrmem8(void *s) +*clrmem10(void *s) { return __builtin_memset (s + 4000, 0, 700); } -/* { dg-final { scan-assembler-times "mvc" 19 } } */ -/* { dg-final { scan-assembler-times "xc" 15 } } */ +/* { dg-final { scan-assembler-times "mvi\\s" 1 } } */ +/* { dg-final { scan-assembler-times "mvc\\s" 20 } } */ +/* { dg-final { scan-assembler-times "xc\\s" 28 } } */ +/* { dg-final { scan-assembler-times "stc\\s" 21 } } */ +/* { dg-final { scan-assembler-times "stcy\\s" 1 } } */ +/* { dg-final { scan-assembler-times "pfd\\s" 2 } } */