From: Bin Cheng Date: Fri, 5 Sep 2014 03:45:57 +0000 (+0000) Subject: re PR target/55701 (Inline some instances of memset for ARM) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ad4211596510c04caeee5404d7b066871114b2bb;p=gcc.git re PR target/55701 (Inline some instances of memset for ARM) PR target/55701 * config/arm/arm.md (setmem): New pattern. * config/arm/arm-protos.h (struct tune_params): New fields. (arm_gen_setmem): New prototype. * config/arm/arm.c (arm_slowmul_tune): Initialize new fields. (arm_fastmul_tune, arm_strongarm_tune, arm_xscale_tune): Ditto. (arm_9e_tune, arm_v6t2_tune, arm_cortex_tune): Ditto. (arm_cortex_a8_tune, arm_cortex_a7_tune): Ditto. (arm_cortex_a15_tune, arm_cortex_a53_tune): Ditto. (arm_cortex_a57_tune, arm_cortex_a5_tune): Ditto. (arm_cortex_a9_tune, arm_cortex_a12_tune): Ditto. (arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune): Ditto. (arm_const_inline_cost): New function. (arm_block_set_max_insns): New function. (arm_block_set_non_vect_profit_p): New function. (arm_block_set_vect_profit_p): New function. (arm_block_set_unaligned_vect): New function. (arm_block_set_aligned_vect): New function. (arm_block_set_unaligned_non_vect): New function. (arm_block_set_aligned_non_vect): New function. (arm_block_set_vect, arm_gen_setmem): New functions. testsuite * gcc.target/arm/memset-inline-1.c: New test. * gcc.target/arm/memset-inline-2.c: New test. * gcc.target/arm/memset-inline-3.c: New test. * gcc.target/arm/memset-inline-4.c: New test. * gcc.target/arm/memset-inline-5.c: New test. * gcc.target/arm/memset-inline-6.c: New test. * gcc.target/arm/memset-inline-7.c: New test. * gcc.target/arm/memset-inline-8.c: New test. * gcc.target/arm/memset-inline-9.c: New test. * gcc.target/arm/memset-inline-10.c: New test. 
From-SVN: r214937 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 1e7d9e15dbd..84d7b0b8b5b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,27 @@ +2014-09-05 Bin Cheng + + PR target/55701 + * config/arm/arm.md (setmem): New pattern. + * config/arm/arm-protos.h (struct tune_params): New fields. + (arm_gen_setmem): New prototype. + * config/arm/arm.c (arm_slowmul_tune): Initialize new fields. + (arm_fastmul_tune, arm_strongarm_tune, arm_xscale_tune): Ditto. + (arm_9e_tune, arm_v6t2_tune, arm_cortex_tune): Ditto. + (arm_cortex_a8_tune, arm_cortex_a7_tune): Ditto. + (arm_cortex_a15_tune, arm_cortex_a53_tune): Ditto. + (arm_cortex_a57_tune, arm_cortex_a5_tune): Ditto. + (arm_cortex_a9_tune, arm_cortex_a12_tune): Ditto. + (arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune): Ditto. + (arm_const_inline_cost): New function. + (arm_block_set_max_insns): New function. + (arm_block_set_non_vect_profit_p): New function. + (arm_block_set_vect_profit_p): New function. + (arm_block_set_unaligned_vect): New function. + (arm_block_set_aligned_vect): New function. + (arm_block_set_unaligned_non_vect): New function. + (arm_block_set_aligned_non_vect): New function. + (arm_block_set_vect, arm_gen_setmem): New functions. + 2014-09-05 Bin Cheng * config/arm/arm.md (arm_movqi_insn): Use Uh instead of m constraint. diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index d3540c779c0..3bd7d1c086c 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -278,6 +278,10 @@ struct tune_params /* Prefer 32-bit encoding instead of 16-bit encoding where subset of flags would be set. */ bool disparage_partial_flag_setting_t16_encodings; + /* Prefer to inline string operations like memset by using Neon. */ + bool string_ops_prefer_neon; + /* Maximum number of instructions to inline calls to memset. 
*/ + int max_insns_inline_memset; }; extern const struct tune_params *current_tune; @@ -290,6 +294,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx, extern bool arm_validize_comparison (rtx *, rtx *, rtx *); #endif /* RTX_CODE */ +extern bool arm_gen_setmem (rtx *); extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel); extern bool arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 15c634196a0..88f91e6f373 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -1698,7 +1698,9 @@ const struct tune_params arm_slowmul_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_fastmul_tune = @@ -1715,7 +1717,9 @@ const struct tune_params arm_fastmul_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; /* StrongARM has early execution of branches, so a sequence that is worth @@ -1735,7 +1739,9 @@ const struct tune_params arm_strongarm_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. 
*/ }; const struct tune_params arm_xscale_tune = @@ -1752,7 +1758,9 @@ const struct tune_params arm_xscale_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_9e_tune = @@ -1769,7 +1777,9 @@ const struct tune_params arm_9e_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_v6t2_tune = @@ -1786,7 +1796,9 @@ const struct tune_params arm_v6t2_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; /* Generic Cortex tuning. Use more specific tunings if appropriate. */ @@ -1804,7 +1816,9 @@ const struct tune_params arm_cortex_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_cortex_a8_tune = @@ -1821,7 +1835,9 @@ const struct tune_params arm_cortex_a8_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. 
*/ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + true, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_cortex_a7_tune = @@ -1838,7 +1854,9 @@ const struct tune_params arm_cortex_a7_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + true, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_cortex_a15_tune = @@ -1855,7 +1873,9 @@ const struct tune_params arm_cortex_a15_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - true, true /* Prefer 32-bit encodings. */ + true, true, /* Prefer 32-bit encodings. */ + true, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_cortex_a53_tune = @@ -1872,7 +1892,9 @@ const struct tune_params arm_cortex_a53_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_cortex_a57_tune = @@ -1889,7 +1911,9 @@ const struct tune_params arm_cortex_a57_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - true, true /* Prefer 32-bit encodings. */ + true, true, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. 
*/ }; /* Branches can be dual-issued on Cortex-A5, so conditional execution is @@ -1909,7 +1933,9 @@ const struct tune_params arm_cortex_a5_tune = {false, false}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + true, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_cortex_a9_tune = @@ -1926,7 +1952,9 @@ const struct tune_params arm_cortex_a9_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_cortex_a12_tune = @@ -1943,7 +1971,9 @@ const struct tune_params arm_cortex_a12_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + true, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; /* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single @@ -1967,7 +1997,9 @@ const struct tune_params arm_v7m_tune = {false, false}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than @@ -1986,7 +2018,9 @@ const struct tune_params arm_v6m_tune = {false, false}, /* Prefer non short circuit. 
*/ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; const struct tune_params arm_fa726te_tune = @@ -2003,7 +2037,9 @@ const struct tune_params arm_fa726te_tune = {true, true}, /* Prefer non short circuit. */ &arm_default_vec_cost, /* Vectorizer costs. */ false, /* Prefer Neon for 64-bits bitops. */ - false, false /* Prefer 32-bit encodings. */ + false, false, /* Prefer 32-bit encodings. */ + false, /* Prefer Neon for stringops. */ + 8 /* Maximum insns to inline memset. */ }; @@ -16903,6 +16939,14 @@ arm_const_double_inline_cost (rtx val) NULL_RTX, NULL_RTX, 0, 0)); } +/* Cost of loading a SImode constant. */ +static inline int +arm_const_inline_cost (enum rtx_code code, rtx val) +{ + return arm_gen_constant (code, SImode, NULL_RTX, INTVAL (val), + NULL_RTX, NULL_RTX, 1, 0); +} + /* Return true if it is worthwhile to split a 64-bit constant into two 32-bit operations. This is the case if optimizing for size, or if we have load delay slots, or if one 32-bit part can be done with @@ -31587,6 +31631,519 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2) } +/* Maximum number of instructions to set block of memory. */ +static int +arm_block_set_max_insns (void) +{ + if (optimize_function_for_size_p (cfun)) + return 4; + else + return current_tune->max_insns_inline_memset; +} + +/* Return TRUE if it's profitable to set block of memory for + non-vectorized case. VAL is the value to set the memory + with. LENGTH is the number of bytes to set. ALIGN is the + alignment of the destination memory in bytes. UNALIGNED_P + is TRUE if we can only set the memory with instructions + meeting alignment requirements. USE_STRD_P is TRUE if we + can use strd to set the memory. 
*/ +static bool +arm_block_set_non_vect_profit_p (rtx val, + unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT align, + bool unaligned_p, bool use_strd_p) +{ + int num = 0; + /* For leftovers in bytes of 0-7, we can set the memory block using + strb/strh/str with minimum instruction number. */ + const int leftover[8] = {0, 1, 1, 2, 1, 2, 2, 3}; + + if (unaligned_p) + { + num = arm_const_inline_cost (SET, val); + num += length / align + length % align; + } + else if (use_strd_p) + { + num = arm_const_double_inline_cost (val); + num += (length >> 3) + leftover[length & 7]; + } + else + { + num = arm_const_inline_cost (SET, val); + num += (length >> 2) + leftover[length & 3]; + } + + /* We may be able to combine last pair STRH/STRB into a single STR + by shifting one byte back. */ + if (unaligned_access && length > 3 && (length & 3) == 3) + num--; + + return (num <= arm_block_set_max_insns ()); +} + +/* Return TRUE if it's profitable to set block of memory for + vectorized case. LENGTH is the number of bytes to set. + ALIGN is the alignment of destination memory in bytes. + MODE is the vector mode used to set the memory. */ +static bool +arm_block_set_vect_profit_p (unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT align, + enum machine_mode mode) +{ + int num; + bool unaligned_p = ((align & 3) != 0); + unsigned int nelt = GET_MODE_NUNITS (mode); + + /* Instruction loading constant value. */ + num = 1; + /* Instructions storing the memory. */ + num += (length + nelt - 1) / nelt; + /* Instructions adjusting the address expression. Only need to + adjust address expression if it's 4 bytes aligned and bytes + leftover can only be stored by mis-aligned store instruction. */ + if (!unaligned_p && (length & 3) != 0) + num++; + + /* Store the first 16 bytes using vst1:v16qi for the aligned case. 
*/ + if (!unaligned_p && mode == V16QImode) + num--; + + return (num <= arm_block_set_max_insns ()); +} + +/* Set a block of memory using vectorization instructions for the + unaligned case. We fill the first LENGTH bytes of the memory + area starting from DSTBASE with byte constant VALUE. ALIGN is + the alignment requirement of memory. Return TRUE if succeeded. */ +static bool +arm_block_set_unaligned_vect (rtx dstbase, + unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT value, + unsigned HOST_WIDE_INT align) +{ + unsigned int i, j, nelt_v16, nelt_v8, nelt_mode; + rtx dst, mem; + rtx val_elt, val_vec, reg; + rtx rval[MAX_VECT_LEN]; + rtx (*gen_func) (rtx, rtx); + enum machine_mode mode; + unsigned HOST_WIDE_INT v = value; + + gcc_assert ((align & 0x3) != 0); + nelt_v8 = GET_MODE_NUNITS (V8QImode); + nelt_v16 = GET_MODE_NUNITS (V16QImode); + if (length >= nelt_v16) + { + mode = V16QImode; + gen_func = gen_movmisalignv16qi; + } + else + { + mode = V8QImode; + gen_func = gen_movmisalignv8qi; + } + nelt_mode = GET_MODE_NUNITS (mode); + gcc_assert (length >= nelt_mode); + /* Skip if it isn't profitable. */ + if (!arm_block_set_vect_profit_p (length, align, mode)) + return false; + + dst = copy_addr_to_reg (XEXP (dstbase, 0)); + mem = adjust_automodify_address (dstbase, mode, dst, 0); + + v = sext_hwi (v, BITS_PER_WORD); + val_elt = GEN_INT (v); + for (j = 0; j < nelt_mode; j++) + rval[j] = val_elt; + + reg = gen_reg_rtx (mode); + val_vec = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt_mode, rval)); + /* Emit instruction loading the constant value. */ + emit_move_insn (reg, val_vec); + + /* Handle nelt_mode bytes in a vector. */ + for (i = 0; (i + nelt_mode <= length); i += nelt_mode) + { + emit_insn ((*gen_func) (mem, reg)); + if (i + 2 * nelt_mode <= length) + emit_insn (gen_add2_insn (dst, GEN_INT (nelt_mode))); + } + + /* If there are not less than nelt_v8 bytes leftover, we must be in + V16QI mode. 
*/ + gcc_assert ((i + nelt_v8) > length || mode == V16QImode); + + /* Handle (8, 16) bytes leftover. */ + if (i + nelt_v8 < length) + { + emit_insn (gen_add2_insn (dst, GEN_INT (length - i))); + /* We are shifting bytes back, set the alignment accordingly. */ + if ((length & 1) != 0 && align >= 2) + set_mem_align (mem, BITS_PER_UNIT); + + emit_insn (gen_movmisalignv16qi (mem, reg)); + } + /* Handle (0, 8] bytes leftover. */ + else if (i < length && i + nelt_v8 >= length) + { + if (mode == V16QImode) + { + reg = gen_lowpart (V8QImode, reg); + mem = adjust_automodify_address (dstbase, V8QImode, dst, 0); + } + emit_insn (gen_add2_insn (dst, GEN_INT ((length - i) + + (nelt_mode - nelt_v8)))); + /* We are shifting bytes back, set the alignment accordingly. */ + if ((length & 1) != 0 && align >= 2) + set_mem_align (mem, BITS_PER_UNIT); + + emit_insn (gen_movmisalignv8qi (mem, reg)); + } + + return true; +} + +/* Set a block of memory using vectorization instructions for the + aligned case. We fill the first LENGTH bytes of the memory area + starting from DSTBASE with byte constant VALUE. ALIGN is the + alignment requirement of memory. Return TRUE if succeeded. */ +static bool +arm_block_set_aligned_vect (rtx dstbase, + unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT value, + unsigned HOST_WIDE_INT align) +{ + unsigned int i, j, nelt_v8, nelt_v16, nelt_mode; + rtx dst, addr, mem; + rtx val_elt, val_vec, reg; + rtx rval[MAX_VECT_LEN]; + enum machine_mode mode; + unsigned HOST_WIDE_INT v = value; + + gcc_assert ((align & 0x3) == 0); + nelt_v8 = GET_MODE_NUNITS (V8QImode); + nelt_v16 = GET_MODE_NUNITS (V16QImode); + if (length >= nelt_v16 && unaligned_access && !BYTES_BIG_ENDIAN) + mode = V16QImode; + else + mode = V8QImode; + + nelt_mode = GET_MODE_NUNITS (mode); + gcc_assert (length >= nelt_mode); + /* Skip if it isn't profitable. 
*/ + if (!arm_block_set_vect_profit_p (length, align, mode)) + return false; + + dst = copy_addr_to_reg (XEXP (dstbase, 0)); + + v = sext_hwi (v, BITS_PER_WORD); + val_elt = GEN_INT (v); + for (j = 0; j < nelt_mode; j++) + rval[j] = val_elt; + + reg = gen_reg_rtx (mode); + val_vec = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt_mode, rval)); + /* Emit instruction loading the constant value. */ + emit_move_insn (reg, val_vec); + + i = 0; + /* Handle first 16 bytes specially using vst1:v16qi instruction. */ + if (mode == V16QImode) + { + mem = adjust_automodify_address (dstbase, mode, dst, 0); + emit_insn (gen_movmisalignv16qi (mem, reg)); + i += nelt_mode; + /* Handle (8, 16) bytes leftover using vst1:v16qi again. */ + if (i + nelt_v8 < length && i + nelt_v16 > length) + { + emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode))); + mem = adjust_automodify_address (dstbase, mode, dst, 0); + /* We are shifting bytes back, set the alignment accordingly. */ + if ((length & 0x3) == 0) + set_mem_align (mem, BITS_PER_UNIT * 4); + else if ((length & 0x1) == 0) + set_mem_align (mem, BITS_PER_UNIT * 2); + else + set_mem_align (mem, BITS_PER_UNIT); + + emit_insn (gen_movmisalignv16qi (mem, reg)); + return true; + } + /* Fall through for bytes leftover. */ + mode = V8QImode; + nelt_mode = GET_MODE_NUNITS (mode); + reg = gen_lowpart (V8QImode, reg); + } + + /* Handle 8 bytes in a vector. */ + for (; (i + nelt_mode <= length); i += nelt_mode) + { + addr = plus_constant (Pmode, dst, i); + mem = adjust_automodify_address (dstbase, mode, addr, i); + emit_move_insn (mem, reg); + } + + /* Handle single word leftover by shifting 4 bytes back. We can + use aligned access for this case. */ + if (i + UNITS_PER_WORD == length) + { + addr = plus_constant (Pmode, dst, i - UNITS_PER_WORD); + mem = adjust_automodify_address (dstbase, mode, + addr, i - UNITS_PER_WORD); + /* We are shifting 4 bytes back, set the alignment accordingly. 
*/ + if (align > UNITS_PER_WORD) + set_mem_align (mem, BITS_PER_UNIT * UNITS_PER_WORD); + + emit_move_insn (mem, reg); + } + /* Handle (0, 4), (4, 8) bytes leftover by shifting bytes back. + We have to use unaligned access for this case. */ + else if (i < length) + { + emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode))); + mem = adjust_automodify_address (dstbase, mode, dst, 0); + /* We are shifting bytes back, set the alignment accordingly. */ + if ((length & 1) == 0) + set_mem_align (mem, BITS_PER_UNIT * 2); + else + set_mem_align (mem, BITS_PER_UNIT); + + emit_insn (gen_movmisalignv8qi (mem, reg)); + } + + return true; +} + +/* Set a block of memory using plain strh/strb instructions, only + using instructions allowed by ALIGN on processor. We fill the + first LENGTH bytes of the memory area starting from DSTBASE + with byte constant VALUE. ALIGN is the alignment requirement + of memory. */ +static bool +arm_block_set_unaligned_non_vect (rtx dstbase, + unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT value, + unsigned HOST_WIDE_INT align) +{ + unsigned int i; + rtx dst, addr, mem; + rtx val_exp, val_reg, reg; + enum machine_mode mode; + HOST_WIDE_INT v = value; + + gcc_assert (align == 1 || align == 2); + + if (align == 2) + v |= (value << BITS_PER_UNIT); + + v = sext_hwi (v, BITS_PER_WORD); + val_exp = GEN_INT (v); + /* Skip if it isn't profitable. */ + if (!arm_block_set_non_vect_profit_p (val_exp, length, + align, true, false)) + return false; + + dst = copy_addr_to_reg (XEXP (dstbase, 0)); + mode = (align == 2 ? HImode : QImode); + val_reg = force_reg (SImode, val_exp); + reg = gen_lowpart (mode, val_reg); + + for (i = 0; (i + GET_MODE_SIZE (mode) <= length); i += GET_MODE_SIZE (mode)) + { + addr = plus_constant (Pmode, dst, i); + mem = adjust_automodify_address (dstbase, mode, addr, i); + emit_move_insn (mem, reg); + } + + /* Handle single byte leftover. 
*/ + if (i + 1 == length) + { + reg = gen_lowpart (QImode, val_reg); + addr = plus_constant (Pmode, dst, i); + mem = adjust_automodify_address (dstbase, QImode, addr, i); + emit_move_insn (mem, reg); + i++; + } + + gcc_assert (i == length); + return true; +} + +/* Set a block of memory using plain strd/str/strh/strb instructions, + to permit unaligned copies on processors which support unaligned + semantics for those instructions. We fill the first LENGTH bytes + of the memory area starting from DSTBASE with byte constant VALUE. + ALIGN is the alignment requirement of memory. */ +static bool +arm_block_set_aligned_non_vect (rtx dstbase, + unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT value, + unsigned HOST_WIDE_INT align) +{ + unsigned int i; + rtx dst, addr, mem; + rtx val_exp, val_reg, reg; + unsigned HOST_WIDE_INT v; + bool use_strd_p; + + use_strd_p = (length >= 2 * UNITS_PER_WORD && (align & 3) == 0 + && TARGET_LDRD && current_tune->prefer_ldrd_strd); + + v = (value | (value << 8) | (value << 16) | (value << 24)); + if (length < UNITS_PER_WORD) + v &= (0xFFFFFFFF >> (UNITS_PER_WORD - length) * BITS_PER_UNIT); + + if (use_strd_p) + v |= (v << BITS_PER_WORD); + else + v = sext_hwi (v, BITS_PER_WORD); + + val_exp = GEN_INT (v); + /* Skip if it isn't profitable. */ + if (!arm_block_set_non_vect_profit_p (val_exp, length, + align, false, use_strd_p)) + { + if (!use_strd_p) + return false; + + /* Try without strd. */ + v = (v >> BITS_PER_WORD); + v = sext_hwi (v, BITS_PER_WORD); + val_exp = GEN_INT (v); + use_strd_p = false; + if (!arm_block_set_non_vect_profit_p (val_exp, length, + align, false, use_strd_p)) + return false; + } + + i = 0; + dst = copy_addr_to_reg (XEXP (dstbase, 0)); + /* Handle double words using strd if possible. 
*/ + if (use_strd_p) + { + val_reg = force_reg (DImode, val_exp); + reg = val_reg; + for (; (i + 8 <= length); i += 8) + { + addr = plus_constant (Pmode, dst, i); + mem = adjust_automodify_address (dstbase, DImode, addr, i); + emit_move_insn (mem, reg); + } + } + else + val_reg = force_reg (SImode, val_exp); + + /* Handle words. */ + reg = (use_strd_p ? gen_lowpart (SImode, val_reg) : val_reg); + for (; (i + 4 <= length); i += 4) + { + addr = plus_constant (Pmode, dst, i); + mem = adjust_automodify_address (dstbase, SImode, addr, i); + if ((align & 3) == 0) + emit_move_insn (mem, reg); + else + emit_insn (gen_unaligned_storesi (mem, reg)); + } + + /* Merge last pair of STRH and STRB into a STR if possible. */ + if (unaligned_access && i > 0 && (i + 3) == length) + { + addr = plus_constant (Pmode, dst, i - 1); + mem = adjust_automodify_address (dstbase, SImode, addr, i - 1); + /* We are shifting one byte back, set the alignment accordingly. */ + if ((align & 1) == 0) + set_mem_align (mem, BITS_PER_UNIT); + + /* Most likely this is an unaligned access, and we can't tell at + compilation time. */ + emit_insn (gen_unaligned_storesi (mem, reg)); + return true; + } + + /* Handle half word leftover. */ + if (i + 2 <= length) + { + reg = gen_lowpart (HImode, val_reg); + addr = plus_constant (Pmode, dst, i); + mem = adjust_automodify_address (dstbase, HImode, addr, i); + if ((align & 1) == 0) + emit_move_insn (mem, reg); + else + emit_insn (gen_unaligned_storehi (mem, reg)); + + i += 2; + } + + /* Handle single byte leftover. */ + if (i + 1 == length) + { + reg = gen_lowpart (QImode, val_reg); + addr = plus_constant (Pmode, dst, i); + mem = adjust_automodify_address (dstbase, QImode, addr, i); + emit_move_insn (mem, reg); + } + + return true; +} + +/* Set a block of memory using vectorization instructions for both + aligned and unaligned cases. We fill the first LENGTH bytes of + the memory area starting from DSTBASE with byte constant VALUE. 
+ ALIGN is the alignment requirement of memory. */ +static bool +arm_block_set_vect (rtx dstbase, + unsigned HOST_WIDE_INT length, + unsigned HOST_WIDE_INT value, + unsigned HOST_WIDE_INT align) +{ + /* Check whether we need to use unaligned store instruction. */ + if (((align & 3) != 0 || (length & 3) != 0) + /* Check whether unaligned store instruction is available. */ + && (!unaligned_access || BYTES_BIG_ENDIAN)) + return false; + + if ((align & 3) == 0) + return arm_block_set_aligned_vect (dstbase, length, value, align); + else + return arm_block_set_unaligned_vect (dstbase, length, value, align); +} + +/* Expand string store operation. Firstly we try to do that by using + vectorization instructions, then try with ARM unaligned access and + double-word store if profitable. OPERANDS[0] is the destination, + OPERANDS[1] is the number of bytes, operands[2] is the value to + initialize the memory, OPERANDS[3] is the known alignment of the + destination. */ +bool +arm_gen_setmem (rtx *operands) +{ + rtx dstbase = operands[0]; + unsigned HOST_WIDE_INT length; + unsigned HOST_WIDE_INT value; + unsigned HOST_WIDE_INT align; + + if (!CONST_INT_P (operands[2]) || !CONST_INT_P (operands[1])) + return false; + + length = UINTVAL (operands[1]); + if (length > 64) + return false; + + value = (UINTVAL (operands[2]) & 0xFF); + align = UINTVAL (operands[3]); + if (TARGET_NEON && length >= 8 + && current_tune->string_ops_prefer_neon + && arm_block_set_vect (dstbase, length, value, align)) + return true; + + if (!unaligned_access && (align & 3) != 0) + return arm_block_set_unaligned_non_vect (dstbase, length, value, align); + + return arm_block_set_aligned_non_vect (dstbase, length, value, align); +} + /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. 
*/ static unsigned HOST_WIDE_INT diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index f394855d60e..0e43dd21bff 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -6716,6 +6716,20 @@ }) +(define_expand "setmemsi" + [(match_operand:BLK 0 "general_operand" "") + (match_operand:SI 1 "const_int_operand" "") + (match_operand:SI 2 "const_int_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_32BIT" +{ + if (arm_gen_setmem (operands)) + DONE; + + FAIL; +}) + + ;; Move a block of memory if it is word aligned and MORE than 2 words long. ;; We could let this apply for blocks of less than this, but it clobbers so ;; many registers that there is then probably a better way. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index cf7c849a32a..31c4d21d11b 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,17 @@ +2014-09-05 Bin Cheng + + PR target/55701 + * gcc.target/arm/memset-inline-1.c: New test. + * gcc.target/arm/memset-inline-2.c: New test. + * gcc.target/arm/memset-inline-3.c: New test. + * gcc.target/arm/memset-inline-4.c: New test. + * gcc.target/arm/memset-inline-5.c: New test. + * gcc.target/arm/memset-inline-6.c: New test. + * gcc.target/arm/memset-inline-7.c: New test. + * gcc.target/arm/memset-inline-8.c: New test. + * gcc.target/arm/memset-inline-9.c: New test. + * gcc.target/arm/memset-inline-10.c: New test. + 2014-09-04 Kaz Kojima * gcc.c-torture/execute/pr44683.c: Remove dg-options for sh*-*-*. 
diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-1.c b/gcc/testsuite/gcc.target/arm/memset-inline-1.c new file mode 100644 index 00000000000..1fe760c1b73 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-1.c @@ -0,0 +1,39 @@ +/* { dg-do run } */ +/* { dg-options "-save-temps -O2 -fno-inline" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +short a[LEN]; +void +foo (void) +{ + memset (a, -1, 14); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +int +main(void) +{ + foo (); + check ((signed char *)a, 14, sizeof (a), -1); + + return 0; +} + +/* { dg-final { scan-assembler-not "bl?\[ \t\]*memset" { target { arm_thumb2_ok } } } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-10.c b/gcc/testsuite/gcc.target/arm/memset-inline-10.c new file mode 100644 index 00000000000..d3b777c3eaf --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-10.c @@ -0,0 +1,95 @@ +/* { dg-do compile } */ +/* { dg-options "-march=armv7-a -mfloat-abi=hard -mfpu=neon -O2" } */ + +#define BUF 100 +long a[BUF]; + +typedef unsigned int size_t; +typedef unsigned int wchar_t; +void *memset (void *s, int c, size_t n); +struct printf_info +{ + int prec; + int width; + wchar_t spec; + unsigned int is_long_double:1; + unsigned int is_short:1; + unsigned int is_long:1; + unsigned int alt:1; + unsigned int space:1; + unsigned int left:1; + unsigned int showsign:1; + unsigned int group:1; + unsigned int extra:1; + unsigned int is_char:1; + unsigned int wide:1; + unsigned int i18n:1; + unsigned int __pad:4; + unsigned short int user; + wchar_t pad; +}; + +void bar (int *alt, int *space, int *left, int *showsign, + int *group, + int *is_long_double, + int *is_short, + int *is_long, + int *width, + int *prec, + int *use_outdigits, + unsigned int *pad, + wchar_t *spec); 
+void __printf_fp (char *s, struct printf_info *pinfo); +int foo(char *s) +{ + int alt = 0; + int space = 0; + int left = 0; + int showsign = 0; + int group = 0; + int is_long_double = 0; + int is_short = 0; + int is_long = 0; + int width = 0; + int prec = -1; + int use_outdigits = 0; + unsigned int pad = L' '; + wchar_t spec; + + bar (&alt, &space, &left, &showsign, &group, &is_long_double, + &is_short, &is_long, &width, &prec, &use_outdigits, &pad, &spec); + + a[1] = a[0] + a[2] + a[3] + a[4] + a[5] + a[6]; + a[2] = a[1] + a[3] + a[5] + a[5] + a[6] + a[7]; + a[3] = a[2] + a[5] + a[7] + a[6] + a[7] + a[8]; + a[4] = a[3] + a[7] + a[11] + a[7] + a[8] + a[9]; + a[5] = a[5] + a[11] + a[13] + a[8] + a[9] + a[10]; + a[6] = a[7] + a[13] + a[17] + a[9] + a[10] + a[11]; + a[7] = a[11] + a[17] + a[19] + a[10] + a[11] + a[12]; + a[8] = a[17] + a[19] + a[23] + a[29] + a[31] + a[37]; + + { + struct printf_info info; + memset (&info, 0, sizeof (struct printf_info)); + info.prec = prec; + info.width = width; + info.spec = spec; + info.is_long_double = is_long_double; + info.is_short = is_short; + info.is_long = is_long; + info.alt = alt; + info.space = space; + info.left = left; + info.showsign = showsign; + info.group = group; + info.pad = pad; + info.extra = 0; + info.i18n = use_outdigits; + info.wide = sizeof (wchar_t) != 1; + + __printf_fp (s, &info); + } + + return 0; +} + diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-2.c b/gcc/testsuite/gcc.target/arm/memset-inline-2.c new file mode 100644 index 00000000000..6deaffe232d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-2.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-options "-save-temps -Os -fno-inline" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +short a[LEN]; +void +foo (void) +{ + memset (a, -1, 14); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) 
+ abort (); +} + +int +main(void) +{ + foo (); + check ((signed char *)a, 14, sizeof (a), -1); + + return 0; +} +/* { dg-final { scan-assembler "bl?\[ \t\]*memset" { target { ! arm_neon } } } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-3.c b/gcc/testsuite/gcc.target/arm/memset-inline-3.c new file mode 100644 index 00000000000..0cb0ccd8eec --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-3.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-options "-save-temps -O2 -fno-inline" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +short a[LEN]; +void +foo (void) +{ + memset (a, -1, 7); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +int +main(void) +{ + foo (); + check ((signed char *)a, 7, sizeof (a), -1); + + return 0; +} + +/* { dg-final { scan-assembler-not "bl?\[ \t\]*memset" { target { ! 
arm_thumb1_ok } } } } */ +/* { dg-final { scan-assembler-not "strh" { target { arm_unaligned } } } } */ +/* { dg-final { scan-assembler-not "strb" { target { arm_unaligned } } } } */ diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-4.c b/gcc/testsuite/gcc.target/arm/memset-inline-4.c new file mode 100644 index 00000000000..381a2c2099b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-4.c @@ -0,0 +1,68 @@ +/* { dg-do run } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */ +/* { dg-options "-save-temps -O2 -fno-inline" } */ +/* { dg-add-options "arm_neon" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +int a[LEN]; +int b[LEN]; +int c[LEN]; +void +foo1 (void) +{ + memset (a, -1, 8); + return; +} + +void +foo2 (void) +{ + memset (b, 1, 12); + return; +} + +void +foo3 (void) +{ + memset (c, 1, 13); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +int +main(void) +{ + int i; + + foo1 (); + check ((signed char *)a, 8, sizeof (a), -1); + + foo2 (); + check ((signed char *)b, 12, sizeof (b), 1); + + foo3 (); + check ((signed char *)c, 13, sizeof (c), 1); + + return 0; +} + +/* { dg-final { scan-assembler-not "bl?\[ \t\]+memset" { target { ! 
arm_thumb1_ok } } } } */ +/* { dg-final { scan-assembler-times "vst1\.8" 1 { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { scan-assembler "vstr" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-5.c b/gcc/testsuite/gcc.target/arm/memset-inline-5.c new file mode 100644 index 00000000000..9107d811a94 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-5.c @@ -0,0 +1,78 @@ +/* { dg-do run } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */ +/* { dg-options "-save-temps -O2 -fno-inline" } */ +/* { dg-add-options "arm_neon" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +int a[LEN]; +int b[LEN]; +int c[LEN]; +int d[LEN]; +void +foo1 (void) +{ + memset (a, -1, 16); + return; +} + +void +foo2 (void) +{ + memset (b, 1, 25); + return; +} + +void +foo3 (void) +{ + memset (c, -1, 19); + return; +} + +void +foo4 (void) +{ + memset (d, 1, 23); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +int +main(void) +{ + foo1 (); + check ((signed char *)a, 16, sizeof (a), -1); + + foo2 (); + check ((signed char *)b, 25, sizeof (b), 1); + + foo3 (); + check ((signed char *)c, 19, sizeof (c), -1); + + foo4 (); + check ((signed char *)d, 23, sizeof (d), 1); + + return 0; +} + +/* { dg-final { scan-assembler-not "bl?\[ \t\]+memset" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { scan-assembler "vst1" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { scan-assembler-not "vstr" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { cleanup-saved-temps } } */ 
+ diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-6.c b/gcc/testsuite/gcc.target/arm/memset-inline-6.c new file mode 100644 index 00000000000..fcb2e26a95d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-6.c @@ -0,0 +1,68 @@ +/* { dg-do run } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */ +/* { dg-options "-save-temps -O2 -fno-inline" } */ +/* { dg-add-options "arm_neon" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +int a[LEN]; +int b[LEN]; +int c[LEN]; +void +foo1 (void) +{ + memset (a, -1, 20); + return; +} + +void +foo2 (void) +{ + memset (b, 1, 24); + return; +} + +void +foo3 (void) +{ + memset (c, -1, 32); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +int +main(void) +{ + foo1 (); + check ((signed char *)a, 20, sizeof (a), -1); + + foo2 (); + check ((signed char *)b, 24, sizeof (b), 1); + + foo3 (); + check ((signed char *)c, 32, sizeof (c), -1); + + return 0; +} + +/* { dg-final { scan-assembler-not "bl?\[ \t\]+memset" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { scan-assembler-times "vst1" 3 { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { scan-assembler-times "vstr" 4 { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { cleanup-saved-temps } } */ + + diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-7.c b/gcc/testsuite/gcc.target/arm/memset-inline-7.c new file mode 100644 index 00000000000..7326c5f857c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-7.c @@ -0,0 +1,171 @@ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +short a[LEN]; +int b[LEN]; + +void 
+init (signed char *arr, int len) +{ + int i; + for (i = 0; i < len; i++) + arr[i] = 0; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +#define TEST(a,l,v) \ + init ((signed char*)(a), sizeof (a)); \ + memset ((a), (v), (l)); \ + check ((signed char *)(a), (l), sizeof (a), (v)); +int +main(void) +{ + TEST (a, 1, -1); + TEST (a, 2, -1); + TEST (a, 3, -1); + TEST (a, 4, -1); + TEST (a, 5, -1); + TEST (a, 6, -1); + TEST (a, 7, -1); + TEST (a, 8, -1); + TEST (a, 9, 1); + TEST (a, 10, -1); + TEST (a, 11, 1); + TEST (a, 12, -1); + TEST (a, 13, 1); + TEST (a, 14, -1); + TEST (a, 15, 1); + TEST (a, 16, -1); + TEST (a, 17, 1); + TEST (a, 18, -1); + TEST (a, 19, 1); + TEST (a, 20, -1); + TEST (a, 21, 1); + TEST (a, 22, -1); + TEST (a, 23, 1); + TEST (a, 24, -1); + TEST (a, 25, 1); + TEST (a, 26, -1); + TEST (a, 27, 1); + TEST (a, 28, -1); + TEST (a, 29, 1); + TEST (a, 30, -1); + TEST (a, 31, 1); + TEST (a, 32, -1); + TEST (a, 33, 1); + TEST (a, 34, -1); + TEST (a, 35, 1); + TEST (a, 36, -1); + TEST (a, 37, 1); + TEST (a, 38, -1); + TEST (a, 39, 1); + TEST (a, 40, -1); + TEST (a, 41, 1); + TEST (a, 42, -1); + TEST (a, 43, 1); + TEST (a, 44, -1); + TEST (a, 45, 1); + TEST (a, 46, -1); + TEST (a, 47, 1); + TEST (a, 48, -1); + TEST (a, 49, 1); + TEST (a, 50, -1); + TEST (a, 51, 1); + TEST (a, 52, -1); + TEST (a, 53, 1); + TEST (a, 54, -1); + TEST (a, 55, 1); + TEST (a, 56, -1); + TEST (a, 57, 1); + TEST (a, 58, -1); + TEST (a, 59, 1); + TEST (a, 60, -1); + TEST (a, 61, 1); + TEST (a, 62, -1); + TEST (a, 63, 1); + TEST (a, 64, -1); + + TEST (b, 1, -1); + TEST (b, 2, -1); + TEST (b, 3, -1); + TEST (b, 4, -1); + TEST (b, 5, -1); + TEST (b, 6, -1); + TEST (b, 7, -1); + TEST (b, 8, -1); + TEST (b, 9, 1); + TEST (b, 10, -1); + TEST (b, 11, 1); + TEST (b, 12, -1); + TEST (b, 13, 1); + TEST (b, 14, -1); + TEST (b, 15, 1); + TEST (b, 
16, -1); + TEST (b, 17, 1); + TEST (b, 18, -1); + TEST (b, 19, 1); + TEST (b, 20, -1); + TEST (b, 21, 1); + TEST (b, 22, -1); + TEST (b, 23, 1); + TEST (b, 24, -1); + TEST (b, 25, 1); + TEST (b, 26, -1); + TEST (b, 27, 1); + TEST (b, 28, -1); + TEST (b, 29, 1); + TEST (b, 30, -1); + TEST (b, 31, 1); + TEST (b, 32, -1); + TEST (b, 33, 1); + TEST (b, 34, -1); + TEST (b, 35, 1); + TEST (b, 36, -1); + TEST (b, 37, 1); + TEST (b, 38, -1); + TEST (b, 39, 1); + TEST (b, 40, -1); + TEST (b, 41, 1); + TEST (b, 42, -1); + TEST (b, 43, 1); + TEST (b, 44, -1); + TEST (b, 45, 1); + TEST (b, 46, -1); + TEST (b, 47, 1); + TEST (b, 48, -1); + TEST (b, 49, 1); + TEST (b, 50, -1); + TEST (b, 51, 1); + TEST (b, 52, -1); + TEST (b, 53, 1); + TEST (b, 54, -1); + TEST (b, 55, 1); + TEST (b, 56, -1); + TEST (b, 57, 1); + TEST (b, 58, -1); + TEST (b, 59, 1); + TEST (b, 60, -1); + TEST (b, 61, 1); + TEST (b, 62, -1); + TEST (b, 63, 1); + TEST (b, 64, -1); + + return 0; +} + diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-8.c b/gcc/testsuite/gcc.target/arm/memset-inline-8.c new file mode 100644 index 00000000000..b6e04773ffc --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-8.c @@ -0,0 +1,44 @@ +/* { dg-do run } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */ +/* { dg-options "-save-temps -O2 -fno-inline" } */ +/* { dg-add-options "arm_neon" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +short a[LEN]; +void +foo (void) +{ + memset (a, -1, 14); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +int +main(void) +{ + foo (); + check ((signed char *)a, 14, sizeof (a), -1); + + return 0; +} + +/* { dg-final { 
scan-assembler-not "bl?\[ \t\]*memset" { target { arm_thumb2_ok } } } } */ +/* { dg-final { scan-assembler "vst1" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { scan-assembler-not "vstr" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/arm/memset-inline-9.c b/gcc/testsuite/gcc.target/arm/memset-inline-9.c new file mode 100644 index 00000000000..be9323aae51 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/memset-inline-9.c @@ -0,0 +1,42 @@ +/* { dg-do run } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */ +/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */ +/* { dg-options "-save-temps -Os -fno-inline" } */ +/* { dg-add-options "arm_neon" } */ + +#include <string.h> +#include <stdlib.h> + +#define LEN (100) +short a[LEN]; +void +foo (void) +{ + memset (a, -1, 14); + return; +} + +void +check (signed char *arr, int idx, int len, int v) +{ + int i; + for (i = 0; i < idx; i++) + if (arr[i] != v) + abort (); + + for (i = idx; i < len; i++) + if (arr[i] != 0) + abort (); +} + +int +main(void) +{ + foo (); + check ((signed char *)a, 14, sizeof (a), -1); + + return 0; +} +/* { dg-final { scan-assembler-not "bl?\[ \t\]*memset" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { scan-assembler "vst1" { target { arm_little_endian && arm_neon } } } } */ +/* { dg-final { cleanup-saved-temps } } */