bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
void aarch64_expand_call (rtx, rtx, rtx, bool);
bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_setmem (rtx *);
bool aarch64_float_const_zero_rtx_p (rtx);
bool aarch64_float_const_rtx_p (rtx);
bool aarch64_function_arg_regno_p (unsigned);
case E_V4SImode:
return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+ case E_V16QImode:
+ return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
+
default:
gcc_unreachable ();
}
return true;
}
+/* Like aarch64_copy_one_block_and_progress_pointers, except for memset, where
+ SRC is a register we have created that holds the duplicated value to be set. */
+static void
+aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
+ machine_mode mode)
+{
+ /* If we are setting 128 bits or 256 bits, we can do that straight from
+ the SIMD register we prepared. */
+ if (known_eq (GET_MODE_BITSIZE (mode), 256))
+ {
+ mode = GET_MODE (src);
+ /* "Cast" the *dst to the correct mode. */
+ *dst = adjust_address (*dst, mode, 0);
+ /* Emit the memset. */
+ emit_insn (aarch64_gen_store_pair (mode, *dst, src,
+ aarch64_progress_pointer (*dst), src));
+
+ /* Move the pointer forward. */
+ *dst = aarch64_move_pointer (*dst, 32);
+ return;
+ }
+ if (known_eq (GET_MODE_BITSIZE (mode), 128))
+ {
+ /* "Cast" the *dst to the correct mode. */
+ *dst = adjust_address (*dst, GET_MODE (src), 0);
+ /* Emit the memset. */
+ emit_move_insn (*dst, src);
+ /* Move the pointer forward. */
+ *dst = aarch64_move_pointer (*dst, 16);
+ return;
+ }
+ /* For smaller sizes, we have to extract the right amount from SRC. */
+ rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
+
+ /* "Cast" the *dst to the correct mode. */
+ *dst = adjust_address (*dst, mode, 0);
+ /* Emit the memset. */
+ emit_move_insn (*dst, reg);
+ /* Move the pointer forward. */
+ *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Expand setmem, as if from a __builtin_memset. Return true if
+ we succeed, otherwise return false. */
+
+bool
+aarch64_expand_setmem (rtx *operands)
+{
+ int n, mode_bits;
+ unsigned HOST_WIDE_INT len;
+ rtx dst = operands[0];
+ rtx val = operands[2], src;
+ rtx base;
+ machine_mode cur_mode = BLKmode, next_mode;
+
+ /* We can't do anything smart if the amount to set is not constant. */
+ if (!CONST_INT_P (operands[1]))
+ return false;
+
+ bool speed_p = !optimize_function_for_size_p (cfun);
+
+ /* Default the maximum to 256 bytes. */
+ unsigned max_set_size = 256;
+
+ /* If we are optimizing for size, or if the core does not want to use
+ STP Q registers, halve max_set_size. */
+ max_set_size = (!speed_p
+ || (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+ ? max_set_size / 2 : max_set_size;
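+ /* For example, at -O2 with the default tuning max_set_size stays at 256,
+ so a 256-byte memset is still expanded inline (see the tests added
+ below), while at -Os it is halved to 128. */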
+
+ len = INTVAL (operands[1]);
+
+ /* Upper bound check. */
+ if (len > max_set_size)
+ return false;
+
+ base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+ dst = adjust_automodify_address (dst, VOIDmode, base, 0);
+
+ /* Duplicate VAL into each byte of a V16QI register (DUP/MOVI v0.16b, val). */
+ src = expand_vector_broadcast (V16QImode, val);
+ src = force_reg (V16QImode, src);
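+ /* For a variable value this becomes "dup v0.16b, w1"; for a constant it
+ can be emitted as "movi v0.16b, #imm" instead, as the tests added below
+ check. */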
+
+ /* Convert len to bits to make the rest of the code simpler. */
+ n = len * BITS_PER_UNIT;
+
+ /* Maximum amount to set in one go. We allow 256-bit chunks (an STP of two
+ Q registers) unless we are optimizing for speed and the
+ AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter asks us to avoid
+ them. The setmem expand pattern is only turned on for TARGET_SIMD. */
+ const int copy_limit = (speed_p
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+ ? GET_MODE_BITSIZE (TImode) : 256;
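+ /* For instance, with the default tuning at -O2 copy_limit is 256 bits and
+ the loop below prefers 32-byte STP Q stores; when optimizing for speed
+ on a core that sets the flag above it drops to 128 bits, i.e. single
+ Q-register stores. */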
+
+ while (n > 0)
+ {
+ /* Find the largest mode in which to do the set without
+ overwriting. */
+ opt_scalar_int_mode mode_iter;
+ FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+ if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+ cur_mode = mode_iter.require ();
+
+ gcc_assert (cur_mode != BLKmode);
+
+ mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+ aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
+
+ n -= mode_bits;
+
+ /* Do certain trailing stores as overlapping if it's going to be
+ cheaper, i.e. fewer instructions. For instance, for a 15-byte set it
+ is more efficient to do two overlapping 8-byte stores than an 8-byte,
+ a 4-byte, a 2-byte and a 1-byte store. */
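+ /* Likewise for the 35-byte memset in the tests added below: after one
+ 256-bit store 3 bytes remain, so DST is moved back one byte and a
+ single overlapping 4-byte store finishes the set instead of a 2-byte
+ plus a 1-byte store. */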
+ if (n > 0 && n < copy_limit / 2)
+ {
+ next_mode = smallest_mode_for_size (n, MODE_INT);
+ int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+ gcc_assert (n_bits <= mode_bits);
+ dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+ n = n_bits;
+ }
+ }
+
+ return true;
+}
+
+
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
SImode stores. Handle the case when the constant has identical
bottom and top halves. This is beneficial when the two stores can be
#define MOVE_RATIO(speed) \
(!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
-/* For CLEAR_RATIO, when optimizing for size, give a better estimate
- of the length of a memset call, but use the default otherwise. */
+/* Like MOVE_RATIO, without -mstrict-align let the "setmem" expander handle
+ the clear once it would take more than 3 scalar instructions.
+ Otherwise follow a sensible default: when optimizing for size, give a better
+ estimate of the length of a memset call, but use the default otherwise. */
#define CLEAR_RATIO(speed) \
- ((speed) ? 15 : AARCH64_CALL_RATIO)
+ (!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
-/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant, so when
- optimizing for size adjust the ratio to account for the overhead of loading
- the constant. */
+/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant. Without
+ -mstrict-align, hand all such decisions to "setmem". Otherwise follow a
+ sensible default: when optimizing for size, adjust the ratio to account for
+ the overhead of loading the constant. */
#define SET_RATIO(speed) \
- ((speed) ? 15 : AARCH64_CALL_RATIO - 2)
+ (!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
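+/* With these ratios a 35-byte clear, for example, would need more than 3
+ scalar stores, so it is handed to the setmem expander, which can use NEON
+ (see the 35-byte case in the tests added below). */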
/* Disable auto-increment in move_by_pieces et al. Use of auto-increment is
rarely a good idea in straight-line code since it adds an extra address
}
)
+;; 0 is dst
+;; 1 is size of the set in bytes
+;; 2 is value to set
+;; 3 is alignment
+
+(define_expand "setmemdi"
+ [(set (match_operand:BLK 0 "memory_operand") ;; Dest
+ (match_operand:QI 2 "nonmemory_operand")) ;; Value
+ (use (match_operand:DI 1 "immediate_operand")) ;; Length
+ (match_operand 3 "immediate_operand")] ;; Align
+ "TARGET_SIMD"
+{
+ if (aarch64_expand_setmem (operands))
+ DONE;
+
+ FAIL;
+})
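+;; For example, __builtin_memset (p, c, 35) reaches this pattern with
+;; operand 0 the destination block, operand 1 the length (35), operand 2
+;; the value C and operand 3 the known alignment; aarch64_expand_setmem
+;; then decides whether to expand it inline.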
+
;; Operands 1 and 3 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
(define_insn "load_pair_sw_<SX:mode><SX2:mode>"
// We want to match enough here to capture that we deleted an empty
// constructor store
-// aarch64 and mips will expand to loop to clear because CLEAR_RATIO.
-// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { aarch64-*-* mips*-*-* } } } }
+// mips will expand to a loop to clear because of CLEAR_RATIO.
+// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { mips*-*-* } } } }
return 0;
}
/* autofdo doesn't support value profiling for now: */
-/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile"} } */
+/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile" { target { ! aarch64*-*-* } } } } */
/* The versioned memset of size 4 should be optimized to an assignment.
- { dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" } } */
+ { dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" { target { ! aarch64*-*-* } } } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/* A one-byte variable set should be a scalar store.
+**set1byte:
+** strb w1, \[x0\]
+** ret
+*/
+void __attribute__((__noinline__))
+set1byte (int64_t *src, char c)
+{
+ __builtin_memset (src, c, 1);
+}
+
+/* Special cases for setting 0. */
+/* A 1-byte clear should be an STRB with wzr.
+**set0byte:
+** strb wzr, \[x0\]
+** ret
+*/
+void __attribute__((__noinline__))
+set0byte (int64_t *src)
+{
+ __builtin_memset (src, 0, 1);
+}
+
+/* 35 bytes would become 4 scalar instructions, so favour NEON.
+**set0neon:
+** movi v0.4s, 0
+** stp q0, q0, \[x0\]
+** str wzr, \[x0, 31\]
+** ret
+*/
+void __attribute__((__noinline__))
+set0neon (int64_t *src)
+{
+ __builtin_memset (src, 0, 35);
+}
+
+/* 36 bytes, however, should be scalar.
+**set0scalar:
+** stp xzr, xzr, \[x0\]
+** stp xzr, xzr, \[x0, 16\]
+** str wzr, \[x0, 32\]
+** ret
+*/
+void __attribute__((__noinline__))
+set0scalar (int64_t *src)
+{
+ __builtin_memset (src, 0, 36);
+}
+
+
+/* 256 bytes is expanded inline.
+**set256byte:
+** dup v0.16b, w1
+** stp q0, q0, \[x0\]
+** stp q0, q0, \[x0, 32\]
+** stp q0, q0, \[x0, 64\]
+** stp q0, q0, \[x0, 96\]
+** stp q0, q0, \[x0, 128\]
+** stp q0, q0, \[x0, 160\]
+** stp q0, q0, \[x0, 192\]
+** stp q0, q0, \[x0, 224\]
+** ret
+*/
+void __attribute__((__noinline__))
+set256byte (int64_t *src, char c)
+{
+ __builtin_memset (src, c, 256);
+}
+
+/* More than 256 bytes falls back to a memset call.
+**set257byte:
+** mov x2, 257
+** mov w1, 99
+** b memset
+*/
+void __attribute__((__noinline__))
+set257byte (int64_t *src)
+{
+ __builtin_memset (src, 'c', 257);
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/*
+**set128bits:
+** dup v0.16b, w1
+** str q0, \[x0\]
+** ret
+*/
+void __attribute__((__noinline__))
+set128bits (int64_t *src, char c)
+{
+ __builtin_memset (src, c, 2*sizeof(int64_t));
+}
+
+/*
+**set128bitszero:
+** stp xzr, xzr, \[x0\]
+** ret
+*/
+void __attribute__((__noinline__))
+set128bitszero (int64_t *src)
+{
+ __builtin_memset (src, 0, 2*sizeof(int64_t));
+}
+
+/*
+** set128bitsplus:
+** dup v0.16b, w1
+** str q0, \[x0\]
+** str q0, \[x0, 12\]
+** ret
+*/
+void __attribute__((__noinline__))
+set128bitsplus (int64_t *src, char c)
+{
+ __builtin_memset (src, c, 7*sizeof(int32_t));
+}
+
+/*
+** set256bits:
+** movi v0.16b, 0x63
+** stp q0, q0, \[x0\]
+** ret
+*/
+void __attribute__((__noinline__))
+set256bits (int64_t *src)
+{
+ __builtin_memset (src, 'c', 4*sizeof(int64_t));
+}
+
+/*
+**set256bitszero:
+** stp xzr, xzr, \[x0\]
+** stp xzr, xzr, \[x0, 16\]
+** ret
+*/
+void __attribute__((__noinline__))
+set256bitszero (int64_t *src)
+{
+ __builtin_memset (src, 0, 4*sizeof(int64_t));
+}
+
+/*
+** set256bitsplus:
+** movi v0.16b, 0x63
+** stp q0, q0, \[x0\]
+** str q0, \[x0, 32\]
+** str d0, \[x0, 48\]
+** ret
+*/
+void __attribute__((__noinline__))
+set256bitsplus (int64_t *src)
+{
+ __builtin_memset (src, 'c', 7*sizeof(int64_t));
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */