From ef4adf1f79e380315124ed8bc98cd5b2a6f027ef Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Fri, 26 Oct 2018 19:33:31 +0000 Subject: [PATCH] rs6000-string.c (expand_strncmp_gpr_sequence): Change to a shorter sequence with fewer branches. 2018-10-26 Aaron Sawdey * config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Change to a shorter sequence with fewer branches. (emit_final_str_compare_gpr): Ditto. From-SVN: r265546 --- gcc/ChangeLog | 6 + gcc/config/rs6000/rs6000-string.c | 245 +++++++++++++++++------------- 2 files changed, 143 insertions(+), 108 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9e9058c4b62..ae4aa6ebb72 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2018-10-26 Aaron Sawdey + + * config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Change to + a shorter sequence with fewer branches. + (emit_final_str_compare_gpr): Ditto. + 2018-10-26 Paul A. Clarke * config/rs6000/tmmintrin.h: New file. diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c index 9c25bad97a1..96729d9663c 100644 --- a/gcc/config/rs6000/rs6000-string.c +++ b/gcc/config/rs6000/rs6000-string.c @@ -259,7 +259,7 @@ do_load_for_compare (rtx reg, rtx mem, machine_mode mode) gcc_assert (mode == E_QImode); emit_move_insn (reg, mem); break; - + default: gcc_unreachable (); break; @@ -726,7 +726,7 @@ expand_compare_loop (rtx operands[]) { if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) /* Do not expect length longer than word_mode. */ - return false; + return false; else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode)) { bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); @@ -770,7 +770,7 @@ expand_compare_loop (rtx operands[]) rtx j; /* Example of generated code for 35 bytes aligned 1 byte. - + mtctr 8 li 6,0 li 5,8 @@ -798,7 +798,7 @@ expand_compare_loop (rtx operands[]) popcntd 9,9 subfe 10,10,10 or 9,9,10 - + Compiled with -fno-reorder-blocks for clarity. */ /* Structure of what we're going to do: @@ -1041,7 +1041,7 @@ expand_compare_loop (rtx operands[]) if (!bytes_is_const) { /* If we're dealing with runtime length, we have to check if - it's zero after the loop. When length is known at compile + it's zero after the loop. When length is known at compile time the no-remainder condition is dealt with above. By doing this after cleanup_label, we also deal with the case where length is 0 at the start and we bypass the @@ -1411,7 +1411,7 @@ expand_block_compare (rtx operands[]) rtx tmp_reg_src1 = gen_reg_rtx (word_mode); rtx tmp_reg_src2 = gen_reg_rtx (word_mode); /* P7/P8 code uses cond for subfc. but P9 uses - it for cmpld which needs CCUNSmode. */ + it for cmpld which needs CCUNSmode. */ rtx cond; if (TARGET_P9_MISC) cond = gen_reg_rtx (CCUNSmode); @@ -1655,7 +1655,7 @@ expand_block_compare (rtx operands[]) emit_label (convert_label); /* We need to produce DI result from sub, then convert to target SI - while maintaining <0 / ==0 / >0 properties. This sequence works: + while maintaining <0 / ==0 / >0 properties. This sequence works: subfc L,A,B subfe H,H,H popcntd L,L @@ -1740,7 +1740,7 @@ expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes to strcmp/strncmp if we have equality at the end of the inline comparison. P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code to clean up and generate the final comparison result. - FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just + FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just set the final result. */ static void expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, @@ -1763,12 +1763,9 @@ expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, while (bytes_to_compare > 0) { /* GPR compare sequence: - check each 8B with: ld/ld cmpd bne - If equal, use rldicr/cmpb to check for zero byte. + check each 8B with: ld/ld/cmpb/cmpb/orc./bne + cleanup code at end: - cmpb get byte that differs - cmpb look for zero byte - orc combine cntlzd get bit of first zero/diff byte subfic convert for rldcl use rldcl rldcl extract diff/zero byte @@ -1776,7 +1773,7 @@ expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, The last compare can branch around the cleanup code if the result is zero because the strings are exactly equal. */ - + unsigned int align = compute_current_alignment (base_align, offset); load_mode = select_block_compare_mode (offset, bytes_to_compare, align); load_mode_size = GET_MODE_SIZE (load_mode); @@ -1801,34 +1798,49 @@ expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, rid of the extra bytes. */ cmp_bytes = bytes_to_compare; - rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)); + rtx offset_reg = gen_reg_rtx (Pmode); + emit_move_insn (offset_reg, GEN_INT (offset)); + + rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_reg); do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1); - rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)); + rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_reg); do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2); /* We must always left-align the data we read, and clear any bytes to the right that are beyond the string. Otherwise the cmpb sequence won't produce the correct - results. The beginning of the compare will be done - with word_mode so will not have any extra shifts or - clear rights. */ + results. However if there is only one byte left, we + can just subtract to get the final result so the shifts + and clears are not needed. */ - if (load_mode_size < word_mode_size) - { - /* Rotate left first. */ - rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size)); - do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh); - do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh); - } + unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; - if (cmp_bytes < word_mode_size) + /* Loading just a single byte is a special case. If we are + loading more than that, we have to check whether we are + looking at the entire chunk of data. If not, rotate left and + clear right so that bytes we aren't supposed to look at are + zeroed, and the first byte we are supposed to compare is + leftmost. */ + if (load_mode_size != 1) { - /* Now clear right. This plus the rotate can be - turned into a rldicr instruction. */ - HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes); - rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); - do_and3 (tmp_reg_src1, tmp_reg_src1, mask); - do_and3 (tmp_reg_src2, tmp_reg_src2, mask); + if (load_mode_size < word_mode_size) + { + /* Rotate left first. */ + rtx sh = GEN_INT (BITS_PER_UNIT + * (word_mode_size - load_mode_size)); + do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh); + do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh); + } + + if (cmp_bytes < word_mode_size) + { + /* Now clear right. This plus the rotate can be + turned into a rldicr instruction. */ + HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes); + rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); + do_and3 (tmp_reg_src1, tmp_reg_src1, mask); + do_and3 (tmp_reg_src2, tmp_reg_src2, mask); + } } /* Cases to handle. A and B are chunks of the two strings. @@ -1842,8 +1854,6 @@ expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, A == B: branch to result 0. A != B: cleanup code to compute result. */ - unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; - rtx dst_label; if (remain > 0 || equality_compare_rest) { @@ -1857,54 +1867,89 @@ expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, /* Branch to end and produce result of 0. */ dst_label = final_move_label; - rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); - rtx cond = gen_reg_rtx (CCmode); + if (load_mode_size == 1) + { + /* Special case for comparing just single byte. */ + if (equality_compare_rest) + { + /* Use subf./bne to branch to final_move_label if the + byte differs, otherwise fall through to the strncmp + call. We must also check for a zero byte here as we + must not make the library call if this is the end of + the string. */ + + rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label); + rtx cond = gen_reg_rtx (CCmode); + rtx diff_rtx = gen_rtx_MINUS (word_mode, + tmp_reg_src1, tmp_reg_src2); + rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond); + rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); + + rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, + lab_ref, pc_rtx); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); + JUMP_LABEL (j) = final_move_label; + LABEL_NUSES (final_move_label) += 1; - /* Always produce the 0 result, it is needed if - cmpb finds a 0 byte in this chunk. */ - rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); - rs6000_emit_dot_insn (result_reg, tmp, 1, cond); + /* Check for zero byte here before fall through to + library call. This catches the case where the + strings are equal and end in a zero byte at this + position. */ - rtx cmp_rtx; - if (remain == 0 && !equality_compare_rest) - cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx); - else - cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); + rtx cond0 = gen_reg_rtx (CCmode); + emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1, + const0_rtx)); - rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, - lab_ref, pc_rtx); - rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); - JUMP_LABEL (j) = dst_label; - LABEL_NUSES (dst_label) += 1; + rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx); - if (remain > 0 || equality_compare_rest) + rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx, + lab_ref, pc_rtx); + rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0)); + JUMP_LABEL (j0) = final_move_label; + LABEL_NUSES (final_move_label) += 1; + } + else + { + /* This is the last byte to be compared so we can use + subf to compute the final result and branch + unconditionally to final_move_label. */ + + do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2); + + rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); + JUMP_LABEL (j) = final_move_label; + LABEL_NUSES (final_move_label) += 1; + emit_barrier (); + } + } + else { - /* Generate a cmpb to test for a 0 byte and branch - to final result if found. */ rtx cmpb_zero = gen_reg_rtx (word_mode); - rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label); - rtx condz = gen_reg_rtx (CCmode); + rtx cmpb_diff = gen_reg_rtx (word_mode); rtx zero_reg = gen_reg_rtx (word_mode); + rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); + rtx cond = gen_reg_rtx (CCmode); + emit_move_insn (zero_reg, GEN_INT (0)); + do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2); do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg); + rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff); + rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero); - if (cmp_bytes < word_mode_size) - { - /* Don't want to look at zero bytes past end. */ - HOST_WIDE_INT mb = - BITS_PER_UNIT * (word_mode_size - cmp_bytes); - rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); - do_and3 (cmpb_zero, cmpb_zero, mask); - } + rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond); - emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg)); - rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx); - rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx, - lab_ref_fin, pc_rtx); - rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); - JUMP_LABEL (j2) = final_move_label; - LABEL_NUSES (final_move_label) += 1; + rtx cmp_rtx; + if (remain == 0 && !equality_compare_rest) + cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx); + else + cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); + rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, + lab_ref, pc_rtx); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); + JUMP_LABEL (j) = dst_label; + LABEL_NUSES (dst_label) += 1; } offset += cmp_bytes; @@ -1915,7 +1960,7 @@ expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, return; } -/* Generate the sequence of compares for strcmp/strncmp using vec/vsx +/* Generate the sequence of compares for strcmp/strncmp using vec/vsx instructions. BYTES_TO_COMPARE is the number of bytes to be compared. @@ -1931,7 +1976,7 @@ expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, to strcmp/strncmp if we have equality at the end of the inline comparison. P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code to clean up and generate the final comparison result. - FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just + FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just set the final result. */ static void expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, @@ -1982,12 +2027,12 @@ expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, bne 6,.Lmismatch Use the overlapping compare trick for the last block if it is - less than 16 bytes. + less than 16 bytes. */ load_mode = V16QImode; load_mode_size = GET_MODE_SIZE (load_mode); - + if (bytes_to_compare >= load_mode_size) cmp_bytes = load_mode_size; else @@ -2046,10 +2091,10 @@ expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, if (branch_to_cleanup) { /* Branch to cleanup code, otherwise fall through to do more - compares. P8 and P9 use different CR bits because on P8 + compares. P8 and P9 use different CR bits because on P8 we are looking at the result of a comparsion vs a register of zeroes so the all-true condition means no - difference or zero was found. On P9, vcmpnezb sets a byte + difference or zero was found. On P9, vcmpnezb sets a byte to 0xff if there is a mismatch or zero, so the all-false condition indicates we found no difference or zero. */ if (!cleanup_label) @@ -2062,7 +2107,7 @@ expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, } else { - /* Branch to final return or fall through to cleanup, + /* Branch to final return or fall through to cleanup, result is already set to 0. */ dst_label = final_move_label; if (TARGET_P9_VECTOR) @@ -2088,10 +2133,7 @@ expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, /* Generate the final sequence that identifies the differing byte and generates the final result, taking into account zero bytes: - - cmpb cmpb_result1, src1, src2 - cmpb cmpb_result2, src1, zero - orc cmpb_result1, cmp_result1, cmpb_result2 + cntlzd get bit of first zero/diff byte addi convert for rldcl use rldcl rldcl extract diff/zero byte @@ -2105,10 +2147,7 @@ static void emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) { machine_mode m = GET_MODE (str1); - rtx cmpb_diff = gen_reg_rtx (m); - rtx cmpb_zero = gen_reg_rtx (m); rtx rot_amt = gen_reg_rtx (m); - rtx zero_reg = gen_reg_rtx (m); rtx rot1_1 = gen_reg_rtx (m); rtx rot1_2 = gen_reg_rtx (m); @@ -2117,12 +2156,7 @@ emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) if (m == SImode) { - emit_insn (gen_cmpbsi3 (cmpb_diff, str1, str2)); - emit_insn (gen_movsi (zero_reg, GEN_INT (0))); - emit_insn (gen_cmpbsi3 (cmpb_zero, str1, zero_reg)); - emit_insn (gen_one_cmplsi2 (cmpb_diff,cmpb_diff)); - emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero)); - emit_insn (gen_clzsi2 (rot_amt, cmpb_diff)); + emit_insn (gen_clzsi2 (rot_amt, result)); emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8))); emit_insn (gen_rotlsi3 (rot1_1, str1, gen_lowpart (SImode, rot_amt))); @@ -2134,12 +2168,7 @@ emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) } else if (m == DImode) { - emit_insn (gen_cmpbdi3 (cmpb_diff, str1, str2)); - emit_insn (gen_movdi (zero_reg, GEN_INT (0))); - emit_insn (gen_cmpbdi3 (cmpb_zero, str1, zero_reg)); - emit_insn (gen_one_cmpldi2 (cmpb_diff,cmpb_diff)); - emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero)); - emit_insn (gen_clzdi2 (rot_amt, cmpb_diff)); + emit_insn (gen_clzdi2 (rot_amt, result)); emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8))); emit_insn (gen_rotldi3 (rot1_1, str1, gen_lowpart (SImode, rot_amt))); @@ -2151,7 +2180,7 @@ emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) } else gcc_unreachable (); - + return; } @@ -2169,10 +2198,10 @@ emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) lbzx 10,28,9 # use that offset to load differing byte lbzx 3,29,9 subf 3,3,10 # subtract for final result - + P9: vclzlsbb # counts trailing bytes with lsb=0 - vextublx # extract differing byte + vextublx # extract differing byte STR1 is the reg rtx for data from string 1. STR2 is the reg rtx for data from string 2. @@ -2208,7 +2237,7 @@ emit_final_str_compare_vec (rtx str1, rtx str2, rtx result, gcc_assert (TARGET_P8_VECTOR); rtx diffix = gen_reg_rtx (DImode); rtx result_gbbd = gen_reg_rtx (V16QImode); - /* Since each byte of the input is either 00 or FF, the bytes in + /* Since each byte of the input is either 00 or FF, the bytes in dw0 and dw1 after vgbbd are all identical to each other. */ emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result)); /* For LE, we shift by 9 and get BA in the low two bytes then CTZ. @@ -2226,7 +2255,7 @@ emit_final_str_compare_vec (rtx str1, rtx str2, rtx result, else emit_insn (gen_ctzdi2 (count, diffix)); - /* P8 doesn't have a good solution for extracting one byte from + /* P8 doesn't have a good solution for extracting one byte from a vsx reg like vextublx on P9 so we just compute the offset of the differing byte and load it from each string. */ do_add3 (off_reg, off_reg, count); @@ -2247,7 +2276,7 @@ emit_final_str_compare_vec (rtx str1, rtx str2, rtx result, } /* Expand a string compare operation with length, and return - true if successful. Return false if we should let the + true if successful. Return false if we should let the compiler generate normal code, probably a strncmp call. OPERANDS[0] is the target (result). @@ -2279,9 +2308,9 @@ expand_strn_compare (rtx operands[], int no_length) rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0)); rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0)); - /* If we have a length, it must be constant. This simplifies things + /* If we have a length, it must be constant. This simplifies things a bit as we don't have to generate code to check if we've exceeded - the length. Later this could be expanded to handle this case. */ + the length. Later this could be expanded to handle this case. */ if (!no_length && !CONST_INT_P (bytes_rtx)) return false; @@ -2311,7 +2340,7 @@ expand_strn_compare (rtx operands[], int no_length) else bytes = UINTVAL (bytes_rtx); - /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at + /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is at least POWER8. That way we can rely on overlapping compares to do the final comparison of less than 16 bytes. Also I do not @@ -2363,7 +2392,7 @@ expand_strn_compare (rtx operands[], int no_length) rtx final_move_label = gen_label_rtx (); rtx final_label = gen_label_rtx (); rtx begin_compare_label = NULL; - + if (base_align < required_align) { /* Generate code that checks distance to 4k boundary for this case. */ @@ -2472,7 +2501,7 @@ expand_strn_compare (rtx operands[], int no_length) &cleanup_label, final_move_label); offset = compare_length; - + if (equality_compare_rest) { /* Update pointers past what has been compared already. */ -- 2.30.2