From: Sudakshina Das <sudi.das@arm.com>
Date: Fri, 13 Nov 2020 10:48:27 +0000 (+0000)
Subject: aarch64: Add backend support for expanding __builtin_memset
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=54bbde550ec557e48a67ca1f4036e46710bcfeda;p=gcc.git

aarch64: Add backend support for expanding __builtin_memset

This patch implements aarch64 backend expansion for __builtin_memset. Most of
the implementation is based on the expansion of __builtin_memcpy. We change the
values of CLEAR_RATIO and SET_RATIO for cases where we do not have to strictly
align and where we can benefit from NEON instructions in the backend.

gcc/ChangeLog:

	* config/aarch64/aarch64-protos.h (aarch64_expand_setmem): New
	declaration.
	* config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case for
	E_V16QImode.
	(aarch64_set_one_block_and_progress_pointer): New helper for
	aarch64_expand_setmem.
	(aarch64_expand_setmem): Define the expansion for memset.
	* config/aarch64/aarch64.h (CLEAR_RATIO): Tweak to favor
	aarch64_expand_setmem when allowed and profitable.
	(SET_RATIO): Likewise.
	* config/aarch64/aarch64.md: Define pattern for setmemdi.

gcc/testsuite/ChangeLog:

	* g++.dg/tree-ssa/pr90883.C: Remove xfail for aarch64.
	* gcc.dg/tree-prof/stringop-2.c: Add xfail for aarch64.
	* gcc.target/aarch64/memset-corner-cases.c: New test.
	* gcc.target/aarch64/memset-q-reg.c: New test.
---

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 7a34c841355..2aa3f1fddaa 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -510,6 +510,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
 bool aarch64_function_arg_regno_p (unsigned);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 97cb68980e9..0e572bac94d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7030,6 +7030,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
     case E_V4SImode:
       return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
 
+    case E_V16QImode:
+      return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
+
     default:
       gcc_unreachable ();
     }
@@ -21276,6 +21279,135 @@ aarch64_expand_cpymem (rtx *operands)
   return true;
 }
 
+/* Memset analogue of aarch64_copy_one_block_and_progress_pointers: store one
+   MODE-sized block of SRC (the duplicated value) to *DST and advance *DST.  */
+static void
+aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
+					    machine_mode mode)
+{
+  /* If we are setting 128 bits or 256 bits, we can do that straight from
+     the SIMD register we prepared.  */
+  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+    {
+      mode = GET_MODE (src);
+      /* Re-type *dst as a MEM in SRC's own mode.  */
+      *dst = adjust_address (*dst, mode, 0);
+      /* Store SRC twice with a single store-pair instruction.  */
+      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
+					 aarch64_progress_pointer (*dst), src));
+
+      /* Move the pointer past the 32 bytes just written.  */
+      *dst = aarch64_move_pointer (*dst, 32);
+      return;
+    }
+  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+    {
+      /* Re-type *dst as a MEM in SRC's own mode.  */
+      *dst = adjust_address (*dst, GET_MODE (src), 0);
+      /* Store the whole of SRC with a single move.  */
+      emit_move_insn (*dst, src);
+      /* Move the pointer past the 16 bytes just written.  */
+      *dst = aarch64_move_pointer (*dst, 16);
+      return;
+    }
+  /* For setting less, take the low MODE-sized part of SRC.  */
+  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
+
+  /* Re-type *dst as a MEM in MODE.  */
+  *dst = adjust_address (*dst, mode, 0);
+  /* Store the extracted part.  */
+  emit_move_insn (*dst, reg);
+  /* Move the pointer forward.  */
+  *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Expand setmem, as if from a __builtin_memset.  OPERANDS are DST, LENGTH and
+   VALUE (in that order).  Return true if we succeed, otherwise false.  */
+
+bool
+aarch64_expand_setmem (rtx *operands)
+{
+  int n, mode_bits;
+  unsigned HOST_WIDE_INT len;
+  rtx dst = operands[0];
+  rtx val = operands[2], src;
+  rtx base;
+  machine_mode cur_mode = BLKmode, next_mode;
+
+  /* We can't do anything smart if the amount to set is not constant.  */
+  if (!CONST_INT_P (operands[1]))
+    return false;
+
+  bool speed_p = !optimize_function_for_size_p (cfun);
+
+  /* Default the maximum to 256 bytes.  */
+  unsigned max_set_size = 256;
+
+  /* In case we are optimizing for size or if the core does not
+     want to use STP Q regs, halve the max_set_size.  */
+  max_set_size = (!speed_p
+		  || (aarch64_tune_params.extra_tuning_flags
+		      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+		  ? max_set_size / 2 : max_set_size;
+
+  len = INTVAL (operands[1]);
+
+  /* Give up if LEN exceeds the inline expansion limit.  */
+  if (len > max_set_size)
+    return false;
+
+  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
+
+  /* Broadcast VAL into a V16QI register (DUP/MOVI v0.16B, val).  */
+  src = expand_vector_broadcast (V16QImode, val);
+  src = force_reg (V16QImode, src);
+
+  /* Convert len to bits to make the rest of the code simpler.  */
+  n = len * BITS_PER_UNIT;
+
+  /* Maximum amount to set in one go, in bits.  Restrict it to TImode's size
+     only when optimizing for speed with AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
+     set; otherwise allow 256.  setmem is only turned on for TARGET_SIMD.  */
+  const int copy_limit = (speed_p
+			  && (aarch64_tune_params.extra_tuning_flags
+			      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+			  ? GET_MODE_BITSIZE (TImode) : 256;
+
+  while (n > 0)
+    {
+      /* Find the largest integer mode that fits in both the remaining bits
+	 and copy_limit, so that no store writes past the end.  */
+      opt_scalar_int_mode mode_iter;
+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	  cur_mode = mode_iter.require ();
+
+      gcc_assert (cur_mode != BLKmode);
+
+      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
+
+      n -= mode_bits;
+
+      /* Do certain trailing sets as overlapping if it's going to be
+	 cheaper, i.e. fewer instructions.  For instance, when setting 15
+	 bytes it's more efficient to do two overlapping 8-byte stores than
+	 8 + 4 + 2 + 1.  */
+      if (n > 0 && n < copy_limit / 2)
+	{
+	  next_mode = smallest_mode_for_size (n, MODE_INT);
+	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+	  gcc_assert (n_bits <= mode_bits);
+	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+	  n = n_bits;
+	}
+    }
+
+  return true;
+}
+
+
 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
    SImode stores.  Handle the case when the constant has identical
    bottom and top halves.  This is beneficial when the two stores can be
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 00b5f843886..d241c5b873d 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1024,16 +1024,19 @@ typedef struct
 #define MOVE_RATIO(speed) \
   (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
 
-/* For CLEAR_RATIO, when optimizing for size, give a better estimate
-   of the length of a memset call, but use the default otherwise.  */
+/* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
+   we would use more than 3 scalar instructions.
+   Otherwise follow a sensible default: when optimizing for size, give a better
+   estimate of the length of a memset call, but use the default otherwise.  */
 #define CLEAR_RATIO(speed) \
-  ((speed) ? 15 : AARCH64_CALL_RATIO)
+  (!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
 
-/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant, so when
-   optimizing for size adjust the ratio to account for the overhead of loading
-   the constant.  */
+/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant.  Without
+   -mstrict-align, make decisions in "setmem".  Otherwise follow a sensible
+   default: when optimizing for size adjust the ratio to account for the
+   overhead of loading the constant.  */
 #define SET_RATIO(speed) \
-  ((speed) ? 15 : AARCH64_CALL_RATIO - 2)
+  (!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
 
 /* Disable auto-increment in move_by_pieces et al.  Use of auto-increment is
    rarely a good idea in straight-line code since it adds an extra address
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 11e0f4612be..eed06de3240 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1571,6 +1571,24 @@
 }
 )
 
+;; 0 is dst
+;; 1 is size of set in bytes
+;; 2 is val
+;; 3 is alignment
+
+(define_expand "setmemdi"
+  [(set (match_operand:BLK 0 "memory_operand")     ;; Dest
+        (match_operand:QI  2 "nonmemory_operand")) ;; Value
+   (use (match_operand:DI  1 "immediate_operand")) ;; Length
+   (match_operand          3 "immediate_operand")] ;; Align
+  "TARGET_SIMD"
+{
+  if (aarch64_expand_setmem (operands))
+    DONE;
+
+  FAIL;
+})
+
 ;; Operands 1 and 3 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
 (define_insn "load_pair_sw_<SX:mode><SX2:mode>"
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr90883.C b/gcc/testsuite/g++.dg/tree-ssa/pr90883.C
index 0e622f263d2..37df17d0b16 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr90883.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr90883.C
@@ -15,6 +15,6 @@
 
 // We want to match enough here to capture that we deleted an empty
 // constructor store
-// aarch64 and mips will expand to loop to clear because CLEAR_RATIO.
-// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { aarch64-*-* mips*-*-* } } } }
+// mips will expand to loop to clear because CLEAR_RATIO.
+// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { mips*-*-* } } } }
 
diff --git a/gcc/testsuite/gcc.dg/tree-prof/stringop-2.c b/gcc/testsuite/gcc.dg/tree-prof/stringop-2.c
index b7471bffd91..e8b1644e2ba 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/stringop-2.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/stringop-2.c
@@ -20,6 +20,6 @@ main()
    return 0;
 }
 /* autofdo doesn't support value profiling for now: */
-/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile"} } */
+/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile" { target { ! aarch64*-*-* } } } } */
 /* The versioned memset of size 4 should be optimized to an assignment.
-   { dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" } } */
+   { dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" { target { ! aarch64*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-corner-cases.c b/gcc/testsuite/gcc.target/aarch64/memset-corner-cases.c
new file mode 100644
index 00000000000..c43f0199adc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memset-corner-cases.c
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/* One byte variable set should be scalar
+**set1byte:
+**	strb	w1, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set1byte (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 1);
+}
+
+/* Special cases for setting 0.  */
+/* 1-byte should be STRB with wzr
+**set0byte:
+**	strb	wzr, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set0byte (int64_t *src)
+{
+  __builtin_memset (src, 0, 1);
+}
+
+/* 35bytes would become 4 scalar instructions.  So favour NEON.
+**set0neon:
+**	movi	v0.4s, 0
+**	stp	q0, q0, \[x0\]
+**	str	wzr, \[x0, 31\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set0neon (int64_t *src)
+{
+  __builtin_memset (src, 0, 35);
+}
+
+/* 36bytes should be scalar however.
+**set0scalar:
+**	stp	xzr, xzr, \[x0\]
+**	stp	xzr, xzr, \[x0, 16\]
+**	str	wzr, \[x0, 32\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set0scalar (int64_t *src)
+{
+  __builtin_memset (src, 0, 36);
+}
+
+
+/* 256-bytes expanded
+**set256byte:
+**	dup	v0.16b, w1
+**	stp	q0, q0, \[x0\]
+**	stp	q0, q0, \[x0, 32\]
+**	stp	q0, q0, \[x0, 64\]
+**	stp	q0, q0, \[x0, 96\]
+**	stp	q0, q0, \[x0, 128\]
+**	stp	q0, q0, \[x0, 160\]
+**	stp	q0, q0, \[x0, 192\]
+**	stp	q0, q0, \[x0, 224\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256byte (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 256);
+}
+
+/* More than 256 bytes goes to memset
+**set257byte:
+**	mov	x2, 257
+**	mov	w1, 99
+**	b	memset
+*/
+void __attribute__((__noinline__))
+set257byte (int64_t *src)
+{
+  __builtin_memset (src, 'c', 257);
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-q-reg.c b/gcc/testsuite/gcc.target/aarch64/memset-q-reg.c
new file mode 100644
index 00000000000..156146badbc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memset-q-reg.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/*
+**set128bits:
+**	dup	v0.16b, w1
+**	str	q0, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set128bits (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 2*sizeof(int64_t));
+}
+
+/*
+**set128bitszero:
+**	stp	xzr, xzr, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set128bitszero (int64_t *src)
+{
+  __builtin_memset (src, 0, 2*sizeof(int64_t));
+}
+
+/*
+** set128bitsplus:
+**	dup	v0.16b, w1
+**	str	q0, \[x0\]
+**	str	q0, \[x0, 12\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set128bitsplus (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 7*sizeof(int32_t));
+}
+
+/*
+** set256bits:
+**	movi	v0.16b, 0x63
+**	stp	q0, q0, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256bits (int64_t *src)
+{
+  __builtin_memset (src, 'c', 4*sizeof(int64_t));
+}
+
+/*
+**set256bitszero:
+**	stp	xzr, xzr, \[x0\]
+**	stp	xzr, xzr, \[x0, 16\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256bitszero (int64_t *src)
+{
+  __builtin_memset (src, 0, 4*sizeof(int64_t));
+}
+
+/*
+** set256bitsplus:
+**	movi	v0.16b, 0x63
+**	stp	q0, q0, \[x0\]
+**	str	q0, \[x0, 32\]
+**	str	d0, \[x0, 48\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256bitsplus (int64_t *src)
+{
+  __builtin_memset (src, 'c', 7*sizeof(int64_t));
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */