[AARCH64] Use STLUR for atomic_store
author:    Matthew Malcomson <matthew.malcomson@arm.com>
           Wed, 19 Sep 2018 10:24:59 +0000 (10:24 +0000)
committer: Matthew Malcomson <matmal01@gcc.gnu.org>
           Wed, 19 Sep 2018 10:24:59 +0000 (10:24 +0000)
Use the STLUR instruction introduced in Armv8.4-a.
This instruction has the store-release semantic like STLR but can take a
9-bit unscaled signed immediate offset.

Example test case:
```
void
foo ()
{
    int32_t *atomic_vals = calloc (4, sizeof (int32_t));
    atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
}
```

Before patch generates
```
foo:
stp x29, x30, [sp, -16]!
mov x1, 4
mov x0, x1
mov x29, sp
bl calloc
mov w1, 2
add x0, x0, 4
stlr w1, [x0]
ldp x29, x30, [sp], 16
ret
```

After patch generates
```
foo:
stp x29, x30, [sp, -16]!
mov x1, 4
mov x0, x1
mov x29, sp
bl calloc
mov w1, 2
stlur w1, [x0, 4]
ldp x29, x30, [sp], 16
ret
```

We introduce a new feature flag to indicate the presence of this instruction.
The feature flag is called AARCH64_ISA_RCPC8_4 and is included when targeting
armv8.4 architecture.

We also introduce a new value for the "arch" insn attribute, "rcpc8_4", named
after this feature flag, which alternatives can use to gate on its presence.

gcc/

2018-09-19  Matthew Malcomson  <matthew.malcomson@arm.com>

* config/aarch64/aarch64-protos.h
(aarch64_offset_9bit_signed_unscaled_p): New declaration.
* config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value.
(arch_enabled): Add check for "rcpc8_4" attribute value of "arch".
* config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield.
(AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4.
(AARCH64_FL_PROFILE): Move index so flags are ordered.
(AARCH64_ISA_RCPC8_4): New flag.
* config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed
to aarch64_offset_9bit_signed_unscaled_p.
* config/aarch64/atomics.md (atomic_store<mode>): Allow offset
and use stlur.
* config/aarch64/constraints.md (Ust): New constraint.
* config/aarch64/predicates.md
(aarch64_9bit_offset_memory_operand): New predicate.
(aarch64_rcpc_memory_operand): New predicate.

gcc/testsuite/

2018-09-19  Matthew Malcomson  <matthew.malcomson@arm.com>

* gcc.target/aarch64/atomic-store.c: New.

From-SVN: r264421

gcc/ChangeLog
gcc/config/aarch64/aarch64-protos.h
gcc/config/aarch64/aarch64.c
gcc/config/aarch64/aarch64.h
gcc/config/aarch64/aarch64.md
gcc/config/aarch64/atomics.md
gcc/config/aarch64/constraints.md
gcc/config/aarch64/predicates.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/atomic-store.c [new file with mode: 0644]

index 1ad3fd05002aefcc3e77a69307452785c2c1687b..619019d454c9e79f0d2ab2890ab13c6c40e4b739 100644 (file)
@@ -1,3 +1,22 @@
+2018-09-19  Matthew Malcomson  <matthew.malcomson@arm.com>
+
+       * config/aarch64/aarch64-protos.h
+       (aarch64_offset_9bit_signed_unscaled_p): New declaration.
+       * config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value.
+       (arch_enabled): Add check for "rcpc8_4" attribute value of "arch".
+       * config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield.
+       (AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4.
+       (AARCH64_FL_PROFILE): Move index so flags are ordered.
+       (AARCH64_ISA_RCPC8_4): New flag.
+       * config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed
+       to aarch64_offset_9bit_signed_unscaled_p.
+       * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
+       and use stlur.
+       * config/aarch64/constraints.md (Ust): New constraint.
+       * config/aarch64/predicates.md
+       (aarch64_9bit_offset_memory_operand): New predicate.
+       (aarch64_rcpc_memory_operand): New predicate.
+
 2018-09-19  Eric Botcazou  <ebotcazou@adacore.com>
 
        PR rtl-optimization/87361
index b26e46f81a414bf71762527f84fd9ac38b81b829..caf1d2041f0cac8e3f975f8384a167a90dc638e5 100644 (file)
@@ -436,6 +436,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
 bool aarch64_mov_operand_p (rtx, machine_mode);
 rtx aarch64_reverse_mask (machine_mode, unsigned int);
 bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
+bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
 char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
index 8cc738c11f0581fdad0c14385826392eeb1816ad..cbf9d0c09b23712a67a5f0781c247cc859ade18d 100644 (file)
@@ -4452,9 +4452,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
 
 /* Return true if OFFSET is a signed 9-bit value.  */
 
-static inline bool
-offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
-                              poly_int64 offset)
+bool
+aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
+                                      poly_int64 offset)
 {
   HOST_WIDE_INT const_offset;
   return (offset.is_constant (&const_offset)
@@ -5721,7 +5721,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
             instruction memory accesses.  */
          if (mode == TImode || mode == TFmode)
            return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
-                   && (offset_9bit_signed_unscaled_p (mode, offset)
+                   && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
                        || offset_12bit_unsigned_scaled_p (mode, offset)));
 
          /* A 7bit offset check because OImode will emit a ldp/stp
@@ -5735,7 +5735,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
             ldr/str instructions (only big endian will get here).  */
          if (mode == CImode)
            return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
-                   && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+                   && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
+                                                              offset + 32)
                        || offset_12bit_unsigned_scaled_p (V16QImode,
                                                           offset + 32)));
 
@@ -5775,7 +5776,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
                     || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
-           return (offset_9bit_signed_unscaled_p (mode, offset)
+           return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
                    || offset_12bit_unsigned_scaled_p (mode, offset));
        }
 
@@ -5828,7 +5829,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
           */
          if (mode == TImode || mode == TFmode)
            return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
-                   && offset_9bit_signed_unscaled_p (mode, offset));
+                   && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
 
          if (load_store_pair_p)
            return ((known_eq (GET_MODE_SIZE (mode), 4)
@@ -5836,7 +5837,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
                     || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
-           return offset_9bit_signed_unscaled_p (mode, offset);
+           return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
        }
       return false;
 
index 5c5a2268de98f4cbaaba7e198209d8949ad15062..e5cdb1d54f4ee96140202ea21a9478438d208f45 100644 (file)
@@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_SM4       (1 << 17)  /* Has ARMv8.4-A SM3 and SM4.  */
 #define AARCH64_FL_SHA3              (1 << 18)  /* Has ARMv8.4-a SHA3 and SHA512.  */
 #define AARCH64_FL_F16FML     (1 << 19)  /* Has ARMv8.4-a FP16 extensions.  */
+#define AARCH64_FL_RCPC8_4    (1 << 20)  /* Has ARMv8.4-a RCPC extensions.  */
 
 /* Statistical Profiling extensions.  */
-#define AARCH64_FL_PROFILE    (1 << 20)
+#define AARCH64_FL_PROFILE    (1 << 21)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version;
   (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
 #define AARCH64_FL_FOR_ARCH8_4                 \
   (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
-   | AARCH64_FL_DOTPROD)
+   | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
 
 /* Macros to test ISA flags.  */
 
@@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_SM4                   (aarch64_isa_flags & AARCH64_FL_SM4)
 #define AARCH64_ISA_SHA3          (aarch64_isa_flags & AARCH64_FL_SHA3)
 #define AARCH64_ISA_F16FML        (aarch64_isa_flags & AARCH64_FL_F16FML)
+#define AARCH64_ISA_RCPC8_4       (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
index 2c0dbab92aaf2aa97096d1f1438354a03b5f2149..7e7ca15c088f902dd20ad810f743b6e87828cf2c 100644 (file)
 ;; alternative). This attribute is used to compute attribute "enabled", use type
 ;; "any" to enable an alternative in all cases.
 
-(define_enum "arches" [ any fp simd sve fp16])
+(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
 
 (define_enum_attr "arch" "arches" (const_string "any"))
 
     (ior
        (eq_attr "arch" "any")
 
+       (and (eq_attr "arch" "rcpc8_4")
+            (match_test "AARCH64_ISA_RCPC8_4"))
+
        (and (eq_attr "arch" "fp")
             (match_test "TARGET_FLOAT"))
 
index 36c06756a1f94cadae097b3aad654fbeba1cf2f3..bba8e9e9c8e61d95fcfb61e650e7e76671c8f996 100644 (file)
 )
 
 (define_insn "atomic_store<mode>"
-  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
+  [(set (match_operand:ALLI 0 "aarch64_rcpc_memory_operand" "=Q,Ust")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "general_operand" "rZ")
+      [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
        (match_operand:SI 2 "const_int_operand")]                       ;; model
       UNSPECV_STL))]
   ""
     enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
     if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
       return "str<atomic_sfx>\t%<w>1, %0";
-    else
+    else if (which_alternative == 0)
       return "stlr<atomic_sfx>\t%<w>1, %0";
+    else
+      return "stlur<atomic_sfx>\t%<w>1, %0";
   }
+  [(set_attr "arch" "*,rcpc8_4")]
 )
 
 (define_insn "@aarch64_load_exclusive<mode>"
index 31fc3eafd8bba03cc773e226223a6293c6dde8d4..99dac3be807e6e0f399b3e273df6d54e95ef5a2e 100644 (file)
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
 
+(define_memory_constraint "Ust"
+  "@internal
+  A memory address with a 9-bit unscaled offset.
+  (match_operand 0 "aarch64_9bit_offset_memory_operand"))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
index d8f377b9603e76a29dd92f95e9905121eaf7b800..5b08b03c5868c7aa86f8844e3219a6e82717d4f0 100644 (file)
   (and (match_operand 0 "memory_operand")
        (match_code "reg" "0")))
 
+(define_predicate "aarch64_9bit_offset_memory_operand"
+  (and (match_operand 0 "memory_operand")
+       (ior (match_code "reg" "0")
+           (and (match_code "plus" "0")
+                (match_code "reg"  "00")
+                (match_code "const_int" "01"))))
+{
+  rtx mem_op = XEXP (op, 0);
+
+  if (REG_P (mem_op))
+    return GET_MODE (mem_op) == DImode;
+
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  if (GET_MODE (plus_op0) != DImode)
+    return false;
+
+  poly_int64 offset;
+  if (!poly_int_rtx_p (plus_op1, &offset))
+    gcc_unreachable ();
+
+  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
+})
+
+(define_predicate "aarch64_rcpc_memory_operand"
+  (if_then_else (match_test "AARCH64_ISA_RCPC8_4")
+    (match_operand 0 "aarch64_9bit_offset_memory_operand")
+    (match_operand 0 "aarch64_sync_memory_operand")))
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
index f26e1e29fc21771bb85007067c6aa80ef50206f2..c2739e8b7bbe4bcd96686a55cc0c46f813eb46ac 100644 (file)
@@ -1,3 +1,7 @@
+2018-09-19  Matthew Malcomson  <matthew.malcomson@arm.com>
+
+       * gcc.target/aarch64/atomic-store.c: New.
+
 2018-09-19  Richard Biener  <rguenther@suse.de>
 
        PR tree-optimization/87349
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
new file mode 100644 (file)
index 0000000..8cabc05
--- /dev/null
@@ -0,0 +1,75 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2" } */
+
+#include <stdatomic.h>
+
+typedef __INT8_TYPE__ int8_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __INT32_TYPE__ int32_t;
+typedef __INT64_TYPE__ int64_t;
+
+#define STORE_TESTS(size) \
+  void \
+  foo##size (int##size##_t *atomic_vals) \
+{ \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+}
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1  { target { ! ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2  { target { ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+
+void
+foo_toolarge_offset (int64_t *atomic_vals)
+{
+  /* 9bit signed unscaled immediate =>
+       largest representable value +255.
+       smallest representable value -256.  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+  atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
+
+#pragma GCC target ("arch=armv8.3-a")
+void
+foo_older_arch (int64_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+
+/* Four matches: one from foo64, two from foo_toolarge_offset, one from foo_older_arch.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */