From: Aaron Sawdey Date: Fri, 23 Sep 2016 20:34:26 +0000 (-0500) Subject: rs6000.md (cmpmemsi): New define_expand. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=9f38dde2306d9a482c03eeaa59688a30d566c8ff;p=gcc.git rs6000.md (cmpmemsi): New define_expand. * config/rs6000/rs6000.md (cmpmemsi): New define_expand. * config/rs6000/rs6000.c (expand_block_compare): New function used by cmpmemsi pattern to do builtin expansion of memcmp (). (compute_current_alignment): Add helper function for expand_block_compare used to compute alignment as the compare proceeds. (select_block_compare_mode): Used by expand_block_compare to select the mode used for reading the next chunk of bytes in the compare. (do_load_for_compare): Used by expand_block_compare to emit the load insns for the compare. (rs6000_emit_dot_insn): Moved this function to avoid a forward reference from expand_block_compare (). * config/rs6000/rs6000-protos.h (expand_block_compare): Add a prototype for this function. * config/rs6000/rs6000.opt (mblock-compare-inline-limit): Add a new target option for controlling how much code inline expansion of memcmp() will be allowed to generate. 
From-SVN: r240455 --- diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 6ed7024da16..8f07450d72b 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -77,6 +77,7 @@ extern void rs6000_expand_interleave (rtx, rtx, rtx, bool); extern void rs6000_scale_v2df (rtx, rtx, int); extern int expand_block_clear (rtx[]); extern int expand_block_move (rtx[]); +extern bool expand_block_compare (rtx[]); extern const char * rs6000_output_load_multiple (rtx[]); extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode); extern bool rs6000_is_valid_and_mask (rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 7dd7ee51903..7c8a82f2578 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -18423,6 +18423,461 @@ expand_block_clear (rtx operands[]) return 1; } +/* Emit a potentially record-form instruction, setting DST from SRC. + If DOT is 0, that is all; otherwise, set CCREG to the result of the + signed comparison of DST with zero. If DOT is 1, the generated RTL + doesn't care about the DST result; if DOT is 2, it does. If CCREG + is CR0 do a single dot insn (as a PARALLEL); otherwise, do a SET and + a separate COMPARE. 
*/ + +static void +rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg) +{ + if (dot == 0) + { + emit_move_insn (dst, src); + return; + } + + if (cc_reg_not_cr0_operand (ccreg, CCmode)) + { + emit_move_insn (dst, src); + emit_move_insn (ccreg, gen_rtx_COMPARE (CCmode, dst, const0_rtx)); + return; + } + + rtx ccset = gen_rtx_SET (ccreg, gen_rtx_COMPARE (CCmode, src, const0_rtx)); + if (dot == 1) + { + rtx clobber = gen_rtx_CLOBBER (VOIDmode, dst); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, clobber))); + } + else + { + rtx set = gen_rtx_SET (dst, src); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, set))); + } +} + + +/* Figure out the correct instructions to generate to load data for + block compare. MODE is used for the read from memory, and + data is zero extended if REG is wider than MODE. If LE code + is being generated, bswap loads are used. + + REG is the destination register to move the data into. + MEM is the memory block being read. + MODE is the mode of memory to use for the read. 
*/ +static void +do_load_for_compare (rtx reg, rtx mem, machine_mode mode) +{ + switch (GET_MODE (reg)) + { + case DImode: + switch (mode) + { + case QImode: + emit_insn (gen_zero_extendqidi2 (reg, mem)); + break; + case HImode: + { + rtx src = mem; + if (TARGET_LITTLE_ENDIAN) + { + src = gen_reg_rtx (HImode); + emit_insn (gen_bswaphi2 (src, mem)); + } + emit_insn (gen_zero_extendhidi2 (reg, src)); + break; + } + case SImode: + { + rtx src = mem; + if (TARGET_LITTLE_ENDIAN) + { + src = gen_reg_rtx (SImode); + emit_insn (gen_bswapsi2 (src, mem)); + } + emit_insn (gen_zero_extendsidi2 (reg, src)); + } + break; + case DImode: + if (TARGET_LITTLE_ENDIAN) + emit_insn (gen_bswapdi2 (reg, mem)); + else + emit_insn (gen_movdi (reg, mem)); + break; + default: + gcc_unreachable (); + } + break; + + case SImode: + switch (mode) + { + case QImode: + emit_insn (gen_zero_extendqisi2 (reg, mem)); + break; + case HImode: + { + rtx src = mem; + if (TARGET_LITTLE_ENDIAN) + { + src = gen_reg_rtx (HImode); + emit_insn (gen_bswaphi2 (src, mem)); + } + emit_insn (gen_zero_extendhisi2 (reg, src)); + break; + } + case SImode: + if (TARGET_LITTLE_ENDIAN) + emit_insn (gen_bswapsi2 (reg, mem)); + else + emit_insn (gen_movsi (reg, mem)); + break; + case DImode: + /* DImode is larger than the destination reg so is not expected. */ + gcc_unreachable (); + break; + default: + gcc_unreachable (); + } + break; + default: + gcc_unreachable (); + break; + } +} + +/* Select the mode to be used for reading the next chunk of bytes + in the compare. + + OFFSET is the current read offset from the beginning of the block. + BYTES is the number of bytes remaining to be read. + ALIGN is the minimum alignment of the memory blocks being compared in bytes. + WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is + the largest allowable mode. 
*/ +static machine_mode +select_block_compare_mode (HOST_WIDE_INT offset, HOST_WIDE_INT bytes, + HOST_WIDE_INT align, bool word_mode_ok) +{ + /* First see if we can do a whole load unit + as that will be more efficient than a larger load + shift. */ + + /* If big, use biggest chunk. + If exactly chunk size, use that size. + If remainder can be done in one piece with shifting, do that. + Do largest chunk possible without violating alignment rules. */ + + /* The most we can read without potential page crossing. */ + HOST_WIDE_INT maxread = ROUND_UP (bytes, align); + + if (word_mode_ok && bytes >= UNITS_PER_WORD) + return word_mode; + else if (bytes == GET_MODE_SIZE (SImode)) + return SImode; + else if (bytes == GET_MODE_SIZE (HImode)) + return HImode; + else if (bytes == GET_MODE_SIZE (QImode)) + return QImode; + else if (bytes < GET_MODE_SIZE (SImode) + && offset >= GET_MODE_SIZE (SImode) - bytes) + /* This matches the case were we have SImode and 3 bytes + and offset >= 1 and permits us to move back one and overlap + with the previous read, thus avoiding having to shift + unwanted bytes off of the input. */ + return SImode; + else if (word_mode_ok && bytes < UNITS_PER_WORD + && offset >= UNITS_PER_WORD-bytes) + /* Similarly, if we can use DImode it will get matched here and + can do an overlapping read that ends at the end of the block. */ + return word_mode; + else if (word_mode_ok && maxread >= UNITS_PER_WORD) + /* It is safe to do all remaining in one load of largest size, + possibly with a shift to get rid of unwanted bytes. */ + return word_mode; + else if (maxread >= GET_MODE_SIZE (SImode)) + /* It is safe to do all remaining in one SImode load, + possibly with a shift to get rid of unwanted bytes. 
 */ + return SImode; + else if (bytes > GET_MODE_SIZE (SImode)) + return SImode; + else if (bytes > GET_MODE_SIZE (HImode)) + return HImode; + + /* final fallback is do one byte */ + return QImode; +} + +/* Compute the alignment of pointer+OFFSET where the original alignment + of pointer was BASE_ALIGN. */ +static HOST_WIDE_INT +compute_current_alignment (HOST_WIDE_INT base_align, HOST_WIDE_INT offset) +{ + if (offset == 0) + return base_align; + return MIN (base_align, offset & -offset); +} + +/* Expand a block compare operation, and return true if successful. + Return false if we should let the compiler generate normal code, + probably a memcmp call. + + OPERANDS[0] is the target (result). + OPERANDS[1] is the first source. + OPERANDS[2] is the second source. + OPERANDS[3] is the length. + OPERANDS[4] is the alignment. */ +bool +expand_block_compare (rtx operands[]) +{ + rtx target = operands[0]; + rtx orig_src1 = operands[1]; + rtx orig_src2 = operands[2]; + rtx bytes_rtx = operands[3]; + rtx align_rtx = operands[4]; + HOST_WIDE_INT cmp_bytes = 0; + rtx src1 = orig_src1; + rtx src2 = orig_src2; + + /* If this is not a fixed size compare, just call memcmp */ + if (!CONST_INT_P (bytes_rtx)) + return false; + + /* This must be a fixed size alignment */ + if (!CONST_INT_P (align_rtx)) + return false; + + int base_align = INTVAL (align_rtx) / BITS_PER_UNIT; + + /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff */ + if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1)) + || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2))) + return false; + + gcc_assert (GET_MODE (target) == SImode); + + /* Anything to move? */ + HOST_WIDE_INT bytes = INTVAL (bytes_rtx); + if (bytes <= 0) + return true; + + rtx tmp_reg_src1 = gen_reg_rtx (word_mode); + rtx tmp_reg_src2 = gen_reg_rtx (word_mode); + + /* If we have an LE target without ldbrx and word_mode is DImode, + then we must avoid using word_mode. 
*/ + int word_mode_ok = !(TARGET_LITTLE_ENDIAN && !TARGET_LDBRX + && word_mode == DImode); + + /* Strategy phase. How many ops will this take and should we expand it? */ + + int offset = 0; + machine_mode load_mode = + select_block_compare_mode (offset, bytes, base_align, word_mode_ok); + int load_mode_size = GET_MODE_SIZE (load_mode); + + /* We don't want to generate too much code. */ + if (ROUND_UP (bytes, load_mode_size) / load_mode_size + > rs6000_block_compare_inline_limit) + return false; + + bool generate_6432_conversion = false; + rtx convert_label = NULL; + rtx final_label = NULL; + + /* Example of generated code for 11 bytes aligned 1 byte: + .L10: + ldbrx 10,6,9 + ldbrx 9,7,9 + subf. 9,9,10 + bne 0,.L8 + addi 9,4,7 + lwbrx 10,0,9 + addi 9,5,7 + lwbrx 9,0,9 + subf 9,9,10 + b .L9 + .L8: # convert_label + cntlzd 9,9 + addi 9,9,-1 + xori 9,9,0x3f + .L9: # final_label + + We start off with DImode and have a compare/branch to something + with a smaller mode then we will need a block with the DI->SI conversion + that may or may not be executed. */ + + while (bytes > 0) + { + int align = compute_current_alignment (base_align, offset); + load_mode = select_block_compare_mode(offset, bytes, align, word_mode_ok); + load_mode_size = GET_MODE_SIZE (load_mode); + if (bytes >= load_mode_size) + cmp_bytes = load_mode_size; + else + { + /* Move this load back so it doesn't go past the end. 
*/ + int extra_bytes = load_mode_size - bytes; + cmp_bytes = bytes; + if (extra_bytes < offset) + { + offset -= extra_bytes; + cmp_bytes = load_mode_size; + bytes = cmp_bytes; + } + } + + src1 = adjust_address (orig_src1, load_mode, offset); + src2 = adjust_address (orig_src2, load_mode, offset); + + if (!REG_P (XEXP (src1, 0))) + { + rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); + src1 = replace_equiv_address (src1, src1_reg); + } + set_mem_size (src1, cmp_bytes); + + if (!REG_P (XEXP (src2, 0))) + { + rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); + src2 = replace_equiv_address (src2, src2_reg); + } + set_mem_size (src2, cmp_bytes); + + do_load_for_compare (tmp_reg_src1, src1, load_mode); + do_load_for_compare (tmp_reg_src2, src2, load_mode); + + if (cmp_bytes < load_mode_size) + { + /* Shift unneeded bytes off. */ + rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes)); + if (word_mode == DImode) + { + emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh)); + emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh)); + } + else + { + emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh)); + emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh)); + } + } + + /* We previously did a block that need 64->32 conversion but + the current block does not, so a label is needed to jump + to the end. */ + if (generate_6432_conversion && !final_label + && GET_MODE_SIZE (GET_MODE (target)) >= load_mode_size) + final_label = gen_label_rtx (); + + /* Do we need a 64->32 conversion block? */ + int remain = bytes - cmp_bytes; + if (GET_MODE_SIZE (GET_MODE (target)) < GET_MODE_SIZE (load_mode)) + { + generate_6432_conversion = true; + if (remain > 0 && !convert_label) + convert_label = gen_label_rtx (); + } + + if (GET_MODE_SIZE (GET_MODE (target)) >= GET_MODE_SIZE (load_mode)) + { + /* Target is larger than load size so we don't need to + reduce result size. 
*/ + if (remain > 0) + { + /* This is not the last block, branch to the end if the result + of this subtract is not zero. */ + if (!final_label) + final_label = gen_label_rtx (); + rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); + rtx cond = gen_reg_rtx (CCmode); + rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); + rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cond); + emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2))); + rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); + rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, + fin_ref, pc_rtx); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); + JUMP_LABEL (j) = final_label; + LABEL_NUSES (final_label) += 1; + } + else + { + if (word_mode == DImode) + { + emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1, + tmp_reg_src2)); + emit_insn (gen_movsi (target, + gen_lowpart (SImode, tmp_reg_src2))); + } + else + emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2)); + + if (final_label) + { + rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); + JUMP_LABEL(j) = final_label; + LABEL_NUSES (final_label) += 1; + emit_barrier (); + } + } + } + else + { + generate_6432_conversion = true; + if (remain > 0) + { + if (!convert_label) + convert_label = gen_label_rtx (); + + /* Compare to zero and branch to convert_label if not zero. */ + rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label); + rtx cond = gen_reg_rtx (CCmode); + rtx tmp = gen_rtx_MINUS (DImode, tmp_reg_src1, tmp_reg_src2); + rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cond); + rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); + rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, + cvt_ref, pc_rtx); + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); + JUMP_LABEL(j) = convert_label; + LABEL_NUSES (convert_label) += 1; + } + else + { + /* Just do the subtract. 
Since this is the last block the + convert code will be generated immediately following. */ + emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1, + tmp_reg_src2)); + } + } + + offset += cmp_bytes; + bytes -= cmp_bytes; + } + + if (generate_6432_conversion) + { + if (convert_label) + emit_label (convert_label); + + /* We need to produce DI result from sub, then convert to target SI + while maintaining <0 / ==0 / >0 properties. + Segher's sequence: cntlzd 3,3 ; addi 3,3,-1 ; xori 3,3,63 */ + emit_insn (gen_clzdi2 (tmp_reg_src2, tmp_reg_src2)); + emit_insn (gen_adddi3 (tmp_reg_src2, tmp_reg_src2, GEN_INT (-1))); + emit_insn (gen_xordi3 (tmp_reg_src2, tmp_reg_src2, GEN_INT (63))); + emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2))); + } + + if (final_label) + emit_label (final_label); + + gcc_assert (bytes == 0); + return true; +} + /* Expand a block move operation, and return 1 if successful. Return 0 if we should let the compiler generate normal code. @@ -19104,42 +19559,6 @@ rs6000_is_valid_2insn_and (rtx c, machine_mode mode) return rs6000_is_valid_and_mask (GEN_INT (val + bit3 - bit2), mode); } -/* Emit a potentially record-form instruction, setting DST from SRC. - If DOT is 0, that is all; otherwise, set CCREG to the result of the - signed comparison of DST with zero. If DOT is 1, the generated RTL - doesn't care about the DST result; if DOT is 2, it does. If CCREG - is CR0 do a single dot insn (as a PARALLEL); otherwise, do a SET and - a separate COMPARE. 
*/ - -static void -rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg) -{ - if (dot == 0) - { - emit_move_insn (dst, src); - return; - } - - if (cc_reg_not_cr0_operand (ccreg, CCmode)) - { - emit_move_insn (dst, src); - emit_move_insn (ccreg, gen_rtx_COMPARE (CCmode, dst, const0_rtx)); - return; - } - - rtx ccset = gen_rtx_SET (ccreg, gen_rtx_COMPARE (CCmode, src, const0_rtx)); - if (dot == 1) - { - rtx clobber = gen_rtx_CLOBBER (VOIDmode, dst); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, clobber))); - } - else - { - rtx set = gen_rtx_SET (dst, src); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, set))); - } -} - /* Emit the two insns to do an AND in mode MODE, with operands OPERANDS. If EXPAND is true, split rotate-and-mask instructions we generate to their constituent parts as well (this is used during expand); if DOT diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 007d3e66ec9..7b995ac0f25 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -8614,6 +8614,27 @@ FAIL; }") +;; String/block compare insn. +;; Argument 0 is the target (result) +;; Argument 1 is the destination +;; Argument 2 is the source +;; Argument 3 is the length +;; Argument 4 is the alignment + +(define_expand "cmpmemsi" + [(parallel [(set (match_operand:SI 0) + (compare:SI (match_operand:BLK 1) + (match_operand:BLK 2))) + (use (match_operand:SI 3)) + (use (match_operand:SI 4))])] + "" +{ + if (expand_block_compare (operands)) + DONE; + else + FAIL; +}) + ;; String/block move insn. 
;; Argument 0 is the destination ;; Argument 1 is the source diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index e47f47fb509..c79a439803c 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -333,6 +333,10 @@ mblock-move-inline-limit= Target Report Var(rs6000_block_move_inline_limit) Init(0) RejectNegative Joined UInteger Save Specify how many bytes should be moved inline before calling out to memcpy/memmove. +mblock-compare-inline-limit= +Target Report Var(rs6000_block_compare_inline_limit) Init(5) RejectNegative Joined UInteger Save +Specify the maximum number of pairs of load instructions that should be generated inline for the compare. If the number needed exceeds the limit, a call to memcmp will be generated instead. + misel Target Report Mask(ISEL) Var(rs6000_isa_flags) Generate isel instructions.