Add support for SVE stack clash probing.

author Tamar Christina <tamar.christina@arm.com>

Mon, 1 Oct 2018 12:56:40 +0000 (12:56 +0000)

committer Tamar Christina <tnfchris@gcc.gnu.org>

Mon, 1 Oct 2018 12:56:40 +0000 (12:56 +0000)
author Tamar Christina <tamar.christina@arm.com>
Mon, 1 Oct 2018 12:56:40 +0000 (12:56 +0000)
committer Tamar Christina <tnfchris@gcc.gnu.org>
Mon, 1 Oct 2018 12:56:40 +0000 (12:56 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 0d02631f38f4f035faf377584767e41e0c75a9f7..4da1c622752099faa3fda34e65a3373b0b067c5f 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,12 @@
+2018-10-01  Tamar Christina  <tamar.christina@arm.com>
+
+       PR target/86486
+       * config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
+       * config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash,
+       aarch64_clamp_to_uimm12_shift): New.
+       (aarch64_allocate_and_probe_stack_space): Add SVE specific section.
+       * config/aarch64/aarch64.md (probe_sve_stack_clash): New.
+
  2018-10-01  Tamar Christina  <tamar.christina@arm.com>
  
         PR target/86486
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h

index caf1d2041f0cac8e3f975f8384a167a90dc638e5..5f18837418e1c7950ccf74af0be7e3ae7763ee28 100644 (file)
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -497,6 +497,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
  void aarch64_cpu_cpp_builtins (cpp_reader *);
  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
  const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
  void aarch64_err_no_fpadvsimd (machine_mode);
  void aarch64_expand_epilogue (bool);
  void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index 9507608da9a8d625d04703a180ff9e35f021c666..1351caa61415918e1fd704dc541dd8489723d7bb 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -166,6 +166,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
  static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
                                             aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
  
  /* Major revision number of the ARM Architecture implemented by the target.  */
  unsigned aarch64_architecture_version;
@@ -4020,6 +4021,84 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
    return "";
  }
  
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+   of GUARD_SIZE.  When a probe is emitted it is done at most
+   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
+   at most MIN_PROBE_THRESHOLD.  By the end of this function
+   BASE = BASE - ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+                                     rtx min_probe_threshold, rtx guard_size)
+{
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+  /* The minimum required allocation before the residual requires probing.  */
+  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+  /* Clamp the value down to the nearest value that can be used with a cmp.  */
+  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
+  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end if not enough adjustment to probe.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = base;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE.  */
+  xops[1] = const0_rtx;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
+  xops[0] = adjustment;
+  xops[1] = probe_offset_value_rtx;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Branch back to the loop start to re-check the remaining adjustment.  */
+  fputs ("\tb\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Loop exit: the remaining adjustment is too small to need a probe.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+  return "";
+}
+
  /* Determine whether a frame chain needs to be generated.  */
  static bool
  aarch64_needs_frame_chain (void)
@@ -4877,21 +4956,73 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
         }
      }
  
-  HOST_WIDE_INT size;
    /* If SIZE is not large enough to require probing, just adjust the stack and
       exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if (known_lt (poly_size, min_probe_threshold)
        || !flag_stack_clash_protection)
      {
        aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
        return;
      }
  
+  HOST_WIDE_INT size;
+  /* Handle the SVE non-constant case first.  */
+  if (!poly_size.is_constant (&size))
+    {
+     if (dump_file)
+      {
+       fprintf (dump_file, "Stack clash SVE prologue: ");
+       print_dec (poly_size, dump_file);
+       fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+      }
+
+      /* First calculate the number of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+                         poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+       {
+         /* This is done to provide unwinding information for the stack
+            adjustments we're about to do, however to prevent the optimizers
+            from removing the R15 move and leaving the CFA note (which would be
+            very wrong) we tie the old and new stack pointer together.
+            The tie will expand to nothing but the optimizers will not touch
+            the instruction.  */
+         rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+         emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+         emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+         /* We want the CFA independent of the stack pointer for the
+            duration of the loop.  */
+         add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+         RTX_FRAME_RELATED_P (insn) = 1;
+       }
+
+      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
+      rtx guard_const = gen_int_mode (guard_size, Pmode);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
+                                                  stack_pointer_rtx, temp1,
+                                                  probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+       {
+         add_reg_note (insn, REG_CFA_DEF_CFA,
+                       gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+                                     gen_int_mode (poly_size, Pmode)));
+         RTX_FRAME_RELATED_P (insn) = 1;
+       }
+
+      return;
+    }
+
    if (dump_file)
      fprintf (dump_file,
-            "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
-            ", probing will be required.\n", size);
+            "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+            " bytes, probing will be required.\n", size);
  
    /* Round size to the nearest multiple of guard_size, and calculate the
       residual as the difference between the original size and the rounded
@@ -5494,6 +5625,20 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
           );
  }
  
+/* Returns the largest value not exceeding VAL that can be encoded as a 12-bit
+   unsigned immediate with a left shift of 0 or 12.  */
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
+{
+  /* Check to see if the value fits in 24 bits, as that is the maximum we can
+     handle correctly.  */
+  gcc_assert ((val & 0xffffff) == val);
+
+  if (((val & 0xfff) << 0) == val)
+    return val;
+
+  return val & (0xfff << 12);
+}
  
  /* Return true if val is an immediate that can be loaded into a
     register by a MOVZ instruction.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md

index 77c949738c5dabcb2cbb42815120425899f8433a..b4a4315ff4b3b603299a5718ad5a489a75a8c5c9 100644 (file)
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6514,6 +6514,25 @@
    [(set_attr "length" "32")]
  )
  
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "@probe_sve_stack_clash_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=rk")
+       (unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+                           (match_operand:P 2 "register_operand" "r")
+                           (match_operand:P 3 "const_int_operand" "n")
+                           (match_operand:P 4 "aarch64_plus_immediate" "L")]
+                            UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+                                              operands[3], operands[4]);
+}
+  [(set_attr "length" "28")]
+)
+
  ;; Named pattern for expanding thread pointer reference.
  (define_expand "get_thread_pointerdi"
    [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 9d7f24606451d0cba26d6de061c3a1fbce64f4a7..84c4fa9524572315f58f708a9a2e67cc6352d202 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2018-10-01  Tamar Christina  <tamar.christina@arm.com>
+
+       PR target/86486
+       * gcc.target/aarch64/stack-check-prologue-16.c: New test.
+       * gcc.target/aarch64/stack-check-cfa-3.c: New test.
+       * gcc.target/aarch64/sve/struct_vect_24.c: New test.
+       * gcc.target/aarch64/sve/struct_vect_24_run.c: New test.
+
  2018-10-01  Jeff Law  <law@redhat.com>
             Richard Sandiford <richard.sandiford@linaro.org>
             Tamar Christina  <tamar.christina@arm.com>
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c

new file mode 100644 (file)

index 0000000..41579f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include "stack-check-prologue-16.c"
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
+
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,.*} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c

new file mode 100644 (file)

index 0000000..d92ef47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+/* Invoke X (P##n) for n in [0, 7].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
+
+/* Invoke X (n) for all octal n in [0, 39].  */
+#define REPEAT40(X) \
+  REPEAT8 (X, 0) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+/* Expect vector work to be done, with spilling of vector registers.  */
+void
+f2 (int x[40][100], int *y)
+{
+  /* Try to force some spilling.  */
+#define DECLARE(N) int y##N = y[N];
+  REPEAT40 (DECLARE);
+#pragma omp simd
+  for (int i = 0; i < 100; ++i)
+    {
+#define INC(N) x[N][i] += y##N;
+      REPEAT40 (INC);
+    }
+}
+
+/* SVE spill, requires probing as vector size is unknown at compile time.  */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c

new file mode 100644 (file)

index 0000000..68a9d5e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include <stdint.h>
+
+#define N 50
+#define S 2 * 64 * 1024
+
+/* Invoke X (P##n) for n in [0, 9].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
+  X (P##8)  X (P##9)
+
+/* Invoke X (n) for all n in [0, 49].  */
+#define REPEAT50(X) \
+  REPEAT8 (X, ) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+  /* Try to force some spilling.  */
+#define DECLARE(N) int src##N = src[N * 4];
+#define INC(N) dest[i] += src##N;
+
+#define TEST_LOOP(NAME, TYPE)                          \
+  void __attribute__ ((noinline, noclone, simd))       \
+  NAME (TYPE *restrict dest, TYPE *restrict src)       \
+  {                                                    \
+    REPEAT50 (DECLARE);                                        \
+    volatile char foo[S];                              \
+    foo[S-1]=1;                                                \
+    for (int i = 0; i < N; i++)                                \
+      {                                                        \
+       REPEAT50 (INC);                                 \
+      }                                                        \
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i32, int32_t) \
+  TEST_LOOP (NAME##_i64, int64_t) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing.  */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 4 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 4 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c

new file mode 100644 (file)

index 0000000..e764476
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include "struct_vect_24.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)                          \
+  {                                                    \
+    TYPE out[N];                                       \
+    TYPE in[N * 4];                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       out[i] = i * 7 / 2;                             \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    for (int i = 0; i < N * 4; ++i)                    \
+      {                                                        \
+       in[i] = i * 9 / 2;                              \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    NAME (out, in);                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       TYPE expected = i * 7 / 2;                      \
+       if (out[i] != out[0] + expected)                \
+         __builtin_abort ();                           \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+  }
+
+int __attribute__ ((optimize (0)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
author	Tamar Christina <tamar.christina@arm.com>
	Mon, 1 Oct 2018 12:56:40 +0000 (12:56 +0000)
committer	Tamar Christina <tnfchris@gcc.gnu.org>
	Mon, 1 Oct 2018 12:56:40 +0000 (12:56 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64-protos.h		patch \| blob \| history
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/config/aarch64/aarch64.md		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c	[new file with mode: 0644]	patch \| blob