From: Andreas Krebbel Date: Thu, 5 Jan 2017 10:00:34 +0000 (+0000) Subject: S/390: Unroll mvc/xc loop for memset with small constant X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8597cd335e507057e7df5dcc0c157cbd9a5bcbdd;p=gcc.git S/390: Unroll mvc/xc loop for memset with small constant lengths. When expanding a memset we emit a loop of MVC/XC instructions, each handling a 256-byte block. This loop used to get unrolled with older GCCs when using constant length operands. GCC probably lost this ability when more of the loop unrolling work was moved to the tree level. With this patch, the unrolling is done manually when emitting the RTL insns. gcc/testsuite/ChangeLog: 2017-01-05 Andreas Krebbel * gcc.target/s390/memset-1.c: New test. gcc/ChangeLog: 2017-01-05 Andreas Krebbel * config/s390/s390.c (s390_expand_setmem): Unroll the loop for small constant length operands. From-SVN: r244097 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6c47cb876ec..a3e3a6a7d8b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2017-01-05 Andreas Krebbel + + * config/s390/s390.c (s390_expand_setmem): Unroll the loop for + small constant length operands. + 2017-01-05 Andreas Krebbel * config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 257bce7fa09..1266f45fb1f 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -5348,34 +5348,46 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) { const int very_unlikely = REG_BR_PROB_BASE / 100 - 1; - if (GET_CODE (len) == CONST_INT && INTVAL (len) == 0) + if (GET_CODE (len) == CONST_INT && INTVAL (len) <= 0) return; gcc_assert (GET_CODE (val) == CONST_INT || GET_MODE (val) == QImode); - if (GET_CODE (len) == CONST_INT && INTVAL (len) > 0 && INTVAL (len) <= 257) + /* Expand setmem/clrmem for a constant length operand without a + loop if it will be shorter that way. 
+ With a constant length and without pfd argument a + clrmem loop is 32 bytes -> 5.3 * xc + setmem loop is 36 bytes -> 3.6 * (mvi/stc + mvc) */ + if (GET_CODE (len) == CONST_INT + && ((INTVAL (len) <= 256 * 5 && val == const0_rtx) + || INTVAL (len) <= 257 * 3) + && (!TARGET_MVCLE || INTVAL (len) <= 256)) { - if (val == const0_rtx && INTVAL (len) <= 256) - emit_insn (gen_clrmem_short (dst, GEN_INT (INTVAL (len) - 1))); - else - { - /* Initialize memory by storing the first byte. */ - emit_move_insn (adjust_address (dst, QImode, 0), val); + HOST_WIDE_INT o, l; - if (INTVAL (len) > 1) - { - /* Initiate 1 byte overlap move. - The first byte of DST is propagated through DSTP1. - Prepare a movmem for: DST+1 = DST (length = LEN - 1). - DST is set to size 1 so the rest of the memory location - does not count as source operand. */ - rtx dstp1 = adjust_address (dst, VOIDmode, 1); - set_mem_size (dst, 1); - - emit_insn (gen_movmem_short (dstp1, dst, - GEN_INT (INTVAL (len) - 2))); - } - } + if (val == const0_rtx) + /* clrmem: emit 256 byte blockwise XCs. */ + for (l = INTVAL (len), o = 0; l > 0; l -= 256, o += 256) + { + rtx newdst = adjust_address (dst, BLKmode, o); + emit_insn (gen_clrmem_short (newdst, + GEN_INT (l > 256 ? 255 : l - 1))); + } + else + /* setmem: emit 1(mvi) + 256(mvc) byte blockwise memsets by + setting first byte to val and using a 256 byte mvc with one + byte overlap to propagate the byte. */ + for (l = INTVAL (len), o = 0; l > 0; l -= 257, o += 257) + { + rtx newdst = adjust_address (dst, BLKmode, o); + emit_move_insn (adjust_address (dst, QImode, o), val); + if (l > 1) + { + rtx newdstp1 = adjust_address (dst, BLKmode, o + 1); + emit_insn (gen_movmem_short (newdstp1, newdst, + GEN_INT (l > 257 ? 
255 : l - 2))); + } + } } else if (TARGET_MVCLE) diff --git a/gcc/testsuite/gcc.target/s390/memset-1.c b/gcc/testsuite/gcc.target/s390/memset-1.c new file mode 100644 index 00000000000..7b43b97c208 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/memset-1.c @@ -0,0 +1,134 @@ +/* Make sure that short memset's with constant length are emitted + without loop statements. */ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch" } */ + +/* 1 mvc */ +void +*memset1(void *s, int c) +{ + return __builtin_memset (s, c, 42); +} + +/* 3 mvc */ +void +*memset2(void *s, int c) +{ + return __builtin_memset (s, c, 700); +} + +/* nop */ +void +*memset3(void *s, int c) +{ + return __builtin_memset (s, c, 0); +} + +/* mvc */ +void +*memset4(void *s, int c) +{ + return __builtin_memset (s, c, 256); +} + +/* 2 mvc */ +void +*memset5(void *s, int c) +{ + return __builtin_memset (s, c, 512); +} + +/* still 2 mvc through the additional first byte */ +void +*memset6(void *s, int c) +{ + return __builtin_memset (s, c, 514); +} + +/* 3 mvc */ +void +*memset7(void *s, int c) +{ + return __builtin_memset (s, c, 515); +} + +/* still 3 mvc through the additional first byte */ +void +*memset8(void *s, int c) +{ + return __builtin_memset (s, c, 771); +} + +/* Use mvc loop: 2 mvc */ +void +*memset9(void *s, int c) +{ + return __builtin_memset (s, c, 772); +} + +/* 3 mvc with displacement overflow after the first */ +void +*memset10(void *s, int c) +{ + return __builtin_memset ((char*)s + 4000, c, 700); +} + +/* 1 xc */ +void +*clrmem1(void *s) +{ + return __builtin_memset (s, 0, 42); +} + +/* 3 xc */ +void +*clrmem2(void *s) +{ + return __builtin_memset (s, 0, 700); +} + +/* nop */ +void +*clrmem3(void *s) +{ + return __builtin_memset (s, 0, 0); +} + +/* 1 xc */ +void +*clrmem4(void *s) +{ + return __builtin_memset (s, 0, 256); +} + +/* 2 xc */ +void +*clrmem5(void *s) +{ + return __builtin_memset (s, 0, 512); +} + +/* 3 xc */ +void +*clrmem6(void *s) +{ + return __builtin_memset (s, 0, 768); 
+} + +/* start using xc loop */ +void +*clrmem7(void *s) +{ + return __builtin_memset (s, 0, 1281); +} + +/* 3 xc with displacement overflow after the first */ +void +*clrmem8(void *s) +{ + return __builtin_memset (s + 4000, 0, 700); +} + +/* { dg-final { scan-assembler-times "mvc" 19 } } */ +/* { dg-final { scan-assembler-times "xc" 15 } } */