From 587790e60d22605b9b3aa73e7313cc55a6417c30 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Thu, 5 Jan 2017 09:59:32 +0000 Subject: [PATCH] S/390: memset: Avoid overlapping MVC operands between iterations. A memset with a value != 0 is currently implemented using the mvc instruction to propagate the first byte through 256 byte blocks. While for the first mvc the byte is written with a separate instruction, subsequent MVCs use the last byte of the previous 256 byte block. Starting with z13 this causes a major performance degradation. With this patch we always set the first byte with an mvi or stc in order to avoid the overlapping of the MVC operands between loop iterations. On older machines this basically makes no measurable difference, so the patch enables the new behavior for all machine levels in order to make sure that code built for older machine levels runs well when moved to a z13. Bootstrapped and regression tested on s390 and s390x using z900 and z13 as default -march level. No regressions. gcc/ChangeLog: 2017-01-05 Andreas Krebbel * config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes between loop iterations. From-SVN: r244096 --- gcc/ChangeLog | 5 +++ gcc/config/s390/s390.c | 95 ++++++++++++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 31 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b9d6cb4d0ff..6c47cb876ec 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2017-01-05 Andreas Krebbel + + * config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes + between loop iterations. 
+ 2017-01-05 Martin Liska PR sanitizer/78815 diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 2082cb5e8b8..257bce7fa09 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -5346,6 +5346,8 @@ s390_expand_movmem (rtx dst, rtx src, rtx len) void s390_expand_setmem (rtx dst, rtx len, rtx val) { + const int very_unlikely = REG_BR_PROB_BASE / 100 - 1; + if (GET_CODE (len) == CONST_INT && INTVAL (len) == 0) return; @@ -5391,13 +5393,14 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) { rtx dst_addr, count, blocks, temp, dstp1 = NULL_RTX; rtx_code_label *loop_start_label = gen_label_rtx (); - rtx_code_label *loop_end_label = gen_label_rtx (); - rtx_code_label *end_label = gen_label_rtx (); + rtx_code_label *onebyte_end_label = gen_label_rtx (); + rtx_code_label *zerobyte_end_label = gen_label_rtx (); + rtx_code_label *restbyte_end_label = gen_label_rtx (); machine_mode mode; mode = GET_MODE (len); if (mode == VOIDmode) - mode = Pmode; + mode = Pmode; dst_addr = gen_reg_rtx (Pmode); count = gen_reg_rtx (mode); @@ -5405,39 +5408,56 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) convert_move (count, len, 1); emit_cmp_and_jump_insns (count, const0_rtx, - EQ, NULL_RTX, mode, 1, end_label); + EQ, NULL_RTX, mode, 1, zerobyte_end_label, + very_unlikely); + /* We need to make a copy of the target address since memset is + supposed to return it unmodified. We have to make it here + already since the new reg is used at onebyte_end_label. */ emit_move_insn (dst_addr, force_operand (XEXP (dst, 0), NULL_RTX)); dst = change_address (dst, VOIDmode, dst_addr); - if (val == const0_rtx) - temp = expand_binop (mode, add_optab, count, constm1_rtx, count, 1, - OPTAB_DIRECT); - else + if (val != const0_rtx) { - dstp1 = adjust_address (dst, VOIDmode, 1); + /* When using the overlapping mvc the original target + address is only accessed as single byte entity (even by + the mvc reading this value). 
*/ set_mem_size (dst, 1); - - /* Initialize memory by storing the first byte. */ - emit_move_insn (adjust_address (dst, QImode, 0), val); - - /* If count is 1 we are done. */ - emit_cmp_and_jump_insns (count, const1_rtx, - EQ, NULL_RTX, mode, 1, end_label); - - temp = expand_binop (mode, add_optab, count, GEN_INT (-2), count, 1, - OPTAB_DIRECT); - } + dstp1 = adjust_address (dst, VOIDmode, 1); + emit_cmp_and_jump_insns (count, + const1_rtx, EQ, NULL_RTX, mode, 1, + onebyte_end_label, very_unlikely); + } + + /* There is one unconditional (mvi+mvc)/xc after the loop + dealing with the rest of the bytes, subtracting two (mvi+mvc) + or one (xc) here leaves this number of bytes to be handled by + it. */ + temp = expand_binop (mode, add_optab, count, + val == const0_rtx ? constm1_rtx : GEN_INT (-2), + count, 1, OPTAB_DIRECT); if (temp != count) - emit_move_insn (count, temp); + emit_move_insn (count, temp); temp = expand_binop (mode, lshr_optab, count, GEN_INT (8), blocks, 1, OPTAB_DIRECT); if (temp != blocks) - emit_move_insn (blocks, temp); + emit_move_insn (blocks, temp); emit_cmp_and_jump_insns (blocks, const0_rtx, - EQ, NULL_RTX, mode, 1, loop_end_label); + EQ, NULL_RTX, mode, 1, restbyte_end_label); + + emit_jump (loop_start_label); + + if (val != const0_rtx) + { + /* The 1 byte != 0 special case. Not handled efficiently + since we require two jumps for that. However, this + should be very rare. */ + emit_label (onebyte_end_label); + emit_move_insn (adjust_address (dst, QImode, 0), val); + emit_jump (zerobyte_end_label); + } emit_label (loop_start_label); @@ -5455,26 +5475,39 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) if (val == const0_rtx) emit_insn (gen_clrmem_short (dst, GEN_INT (255))); else - emit_insn (gen_movmem_short (dstp1, dst, GEN_INT (255))); + { + /* Set the first byte in the block to the value and use an + overlapping mvc for the block. 
*/ + emit_move_insn (adjust_address (dst, QImode, 0), val); + emit_insn (gen_movmem_short (dstp1, dst, GEN_INT (254))); + } s390_load_address (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (256))); temp = expand_binop (mode, add_optab, blocks, constm1_rtx, blocks, 1, OPTAB_DIRECT); if (temp != blocks) - emit_move_insn (blocks, temp); + emit_move_insn (blocks, temp); emit_cmp_and_jump_insns (blocks, const0_rtx, - EQ, NULL_RTX, mode, 1, loop_end_label); + NE, NULL_RTX, mode, 1, loop_start_label); - emit_jump (loop_start_label); - emit_label (loop_end_label); + emit_label (restbyte_end_label); if (val == const0_rtx) - emit_insn (gen_clrmem_short (dst, convert_to_mode (Pmode, count, 1))); + emit_insn (gen_clrmem_short (dst, convert_to_mode (Pmode, count, 1))); else - emit_insn (gen_movmem_short (dstp1, dst, convert_to_mode (Pmode, count, 1))); - emit_label (end_label); + { + /* Set the first byte in the block to the value and use an + overlapping mvc for the block. */ + emit_move_insn (adjust_address (dst, QImode, 0), val); + /* execute only uses the lowest 8 bits of count that's + exactly what we need here. */ + emit_insn (gen_movmem_short (dstp1, dst, + convert_to_mode (Pmode, count, 1))); + } + + emit_label (zerobyte_end_label); } } -- 2.30.2