From: Tamar Christina <tamar.christina@arm.com>
Date: Mon, 1 Oct 2018 12:56:40 +0000 (+0000)
Subject: Add support for SVE stack clash probing.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=eb471ba379597d73fcd79986cca5b8351a32770a;p=gcc.git

Add support for SVE stack clash probing.

This patch adds basic support for SVE stack clash protection.
It is a first implementation and will use a loop to do the
probing and stack adjustments.

An example sequence is:

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
.SVLPSPL0:
        cmp     x16, 61440
        b.lt    .SVLPEND0
        sub     sp, sp, 61440
        str     xzr, [sp, 0]
        sub     x16, x16, 61440
        b      .SVLPSPL0
.SVLPEND0:
        sub     sp, sp, x16
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22

for a 64KB guard size, and for a 4KB guard size

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
.SVLPSPL0:
        cmp     x16, 3072
        b.lt    .SVLPEND0
        sub     sp, sp, 3072
        str     xzr, [sp, 0]
        sub     x16, x16, 3072
        b       .SVLPSPL0
.SVLPEND0:
        sub     sp, sp, x16
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22

This has about the same semantics as alloca, except we prioritize the common case
where no probe is required.  We also change the amount we adjust the stack and
the probing interval to be the nearest value to `guard size - abi buffer` that
fits in the 12-bit shifted immediate used by cmp.

While this would mean we probe a bit more often than we require, in practice the
amount of SVE vectors you'd need to spill is significant. Even more so to enter the
loop more than once.


gcc/

	PR target/86486
	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash,
	aarch64_clamp_to_uimm12_shift): New.
	(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.

gcc/testsuite/

	PR target/86486
	* gcc.target/aarch64/stack-check-prologue-16.c: New test
	* gcc.target/aarch64/stack-check-cfa-3.c: New test.
	* gcc.target/aarch64/sve/struct_vect_24.c: New test.
	* gcc.target/aarch64/sve/struct_vect_24_run.c: New test.

From-SVN: r264749
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 0d02631f38f..4da1c622752 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,12 @@
+2018-10-01  Tamar Christina  <tamar.christina@arm.com>
+
+	PR target/86486
+	* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
+	* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash,
+	aarch64_clamp_to_uimm12_shift): New.
+	(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
+	* config/aarch64/aarch64.md (probe_sve_stack_clash): New.
+
 2018-10-01  Tamar Christina  <tamar.christina@arm.com>
 
 	PR target/86486
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index caf1d2041f0..5f18837418e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -497,6 +497,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
 const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 9507608da9a..1351caa6141 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -166,6 +166,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -4020,6 +4021,84 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+   of GUARD_SIZE.  When a probe is emitted it is done at most
+   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
+   at most MIN_PROBE_THRESHOLD.  By the end of this function
+   BASE = BASE - ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+				      rtx min_probe_threshold, rtx guard_size)
+{
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+  /* The minimum required allocation before the residual requires probing.  */
+  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+  /* Clamp the value down to the nearest value that can be used with a cmp.  */
+  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
+  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end if not enough adjustment to probe.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = base;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE.  */
+  xops[1] = const0_rtx;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Branch to start if still more bytes to allocate.  */
+  fputs ("\tb\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* No probe leave.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+  return "";
+}
+
 /* Determine whether a frame chain needs to be generated.  */
 static bool
 aarch64_needs_frame_chain (void)
@@ -4877,21 +4956,73 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
     }
 
-  HOST_WIDE_INT size;
   /* If SIZE is not large enough to require probing, just adjust the stack and
      exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
       return;
     }
 
+  HOST_WIDE_INT size;
+  /* Handle the SVE non-constant case first.  */
+  if (!poly_size.is_constant (&size))
+    {
+     if (dump_file)
+      {
+	fprintf (dump_file, "Stack clash SVE prologue: ");
+	print_dec (poly_size, dump_file);
+	fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+      }
+
+      /* First calculate the amount of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+			  poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do, however to prevent the optimizers
+	     from removing the R15 move and leaving the CFA note (which would be
+	     very wrong) we tie the old and new stack pointer together.
+	     The tie will expand to nothing but the optimizers will not touch
+	     the instruction.  */
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
+      rtx guard_const = gen_int_mode (guard_size, Pmode);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
+						   stack_pointer_rtx, temp1,
+						   probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      gen_int_mode (poly_size, Pmode)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      return;
+    }
+
   if (dump_file)
     fprintf (dump_file,
-	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
-	     ", probing will be required.\n", size);
+	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+	     " bytes, probing will be required.\n", size);
 
   /* Round size to the nearest multiple of guard_size, and calculate the
      residual as the difference between the original size and the rounded
@@ -5494,6 +5625,20 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
 	  );
 }
 
+/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
+   that can be created with a left shift of 0 or 12.  */
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
+{
+  /* Check to see if the value fits in 24 bits, as that is the maximum we can
+     handle correctly.  */
+  gcc_assert ((val & 0xffffff) == val);
+
+  if (((val & 0xfff) << 0) == val)
+    return val;
+
+  return val & (0xfff << 12);
+}
 
 /* Return true if val is an immediate that can be loaded into a
    register by a MOVZ instruction.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 77c949738c5..b4a4315ff4b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6514,6 +6514,25 @@
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "@probe_sve_stack_clash_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=rk")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+			    (match_operand:P 2 "register_operand" "r")
+			    (match_operand:P 3 "const_int_operand" "n")
+			    (match_operand:P 4 "aarch64_plus_immediate" "L")]
+			     UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "28")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 9d7f2460645..84c4fa95245 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2018-10-01  Tamar Christina  <tamar.christina@arm.com>
+
+	PR target/86486
+	* gcc.target/aarch64/stack-check-prologue-16.c: New test
+	* gcc.target/aarch64/stack-check-cfa-3.c: New test.
+	* gcc.target/aarch64/sve/struct_vect_24.c: New test.
+	* gcc.target/aarch64/sve/struct_vect_24_run.c: New test.
+
 2018-10-01  Jeff Law  <law@redhat.com>
 	    Richard Sandiford <richard.sandiford@linaro.org>
 	    Tamar Christina  <tamar.christina@arm.com>
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 00000000000..41579f26ba9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include "stack-check-prologue-16.c"
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting r15 with a copy of sp an setting the CFA there.  */
+
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,.*} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
new file mode 100644
index 00000000000..d92ef47a57d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+/* Invoke X (P##n) for n in [0, 7].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
+
+/* Invoke X (n) for all octal n in [0, 39].  */
+#define REPEAT40(X) \
+  REPEAT8 (X, 0) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+/* Expect vector work to be done, with spilling of vector registers.  */
+void
+f2 (int x[40][100], int *y)
+{
+  /* Try to force some spilling.  */
+#define DECLARE(N) int y##N = y[N];
+  REPEAT40 (DECLARE);
+#pragma omp simd
+  for (int i = 0; i < 100; ++i)
+    {
+#define INC(N) x[N][i] += y##N;
+      REPEAT40 (INC);
+    }
+}
+
+/* SVE spill, requires probing as vector size is unknown at compile time.  */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
new file mode 100644
index 00000000000..68a9d5e3d2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include <stdint.h>
+
+#define N 50
+#define S 2 * 64 * 1024
+
+/* Invoke X (P##n) for n in [0, 9].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
+  X (P##8)  X (P##9)
+
+/* Invoke X (n) for all n in [0, 49].  */
+#define REPEAT50(X) \
+  REPEAT8 (X, ) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+  /* Try to force some spilling.  */
+#define DECLARE(N) int src##N = src[N * 4];
+#define INC(N) dest[i] += src##N;
+
+#define TEST_LOOP(NAME, TYPE)				\
+  void __attribute__ ((noinline, noclone, simd))	\
+  NAME (TYPE *restrict dest, TYPE *restrict src)	\
+  {							\
+    REPEAT50 (DECLARE);					\
+    volatile char foo[S];				\
+    foo[S-1]=1;						\
+    for (int i = 0; i < N; i++)				\
+      {							\
+	REPEAT50 (INC);					\
+      }							\
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i32, int32_t) \
+  TEST_LOOP (NAME##_i64, int64_t) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing.  */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 4 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 4 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
new file mode 100644
index 00000000000..e764476facc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include "struct_vect_24.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)				\
+  {							\
+    TYPE out[N];					\
+    TYPE in[N * 4];					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	out[i] = i * 7 / 2;				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+    for (int i = 0; i < N * 4; ++i)			\
+      {							\
+	in[i] = i * 9 / 2;				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+    NAME (out, in);					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	TYPE expected = i * 7 / 2;			\
+	if (out[i] != out[0] + expected)		\
+	  __builtin_abort ();				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+  }
+
+int __attribute__ ((optimize (0)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}