From 3c5af60836eed835b818f2a87480155a497139a4 Mon Sep 17 00:00:00 2001
From: Matthew Malcomson
Date: Wed, 19 Sep 2018 10:24:59 +0000
Subject: [PATCH] [AARCH64] Use STLUR for atomic_store

Use the STLUR instruction introduced in Armv8.4-A.
This instruction has the same store-release semantics as STLR but can
take a signed 9-bit unscaled immediate offset.

Example test case:
```
void
foo ()
{
  int32_t *atomic_vals = calloc (4, sizeof (int32_t));
  atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
}
```

Before the patch GCC generates:
```
foo:
	stp	x29, x30, [sp, -16]!
	mov	x1, 4
	mov	x0, x1
	mov	x29, sp
	bl	calloc
	mov	w1, 2
	add	x0, x0, 4
	stlr	w1, [x0]
	ldp	x29, x30, [sp], 16
	ret
```

After the patch it generates:
```
foo:
	stp	x29, x30, [sp, -16]!
	mov	x1, 4
	mov	x0, x1
	mov	x29, sp
	bl	calloc
	mov	w1, 2
	stlur	w1, [x0, 4]
	ldp	x29, x30, [sp], 16
	ret
```

We introduce a new feature flag, AARCH64_FL_RCPC8_4, to indicate the
presence of this instruction; it is tested through the AARCH64_ISA_RCPC8_4
macro and is enabled when targeting the Armv8.4-A architecture.  We also
introduce an "arch" attribute value, "rcpc8_4", named after this feature
flag, against which instruction alternatives can be gated.
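For reference, the operand legality test this patch adds boils down to a
signed 9-bit range check on the byte offset.  Below is a minimal standalone
sketch of that check in plain C; the helper name is hypothetical, and the
in-tree aarch64_offset_9bit_signed_unscaled_p works on poly_int64 offsets
and takes a mode argument it ignores:
```
#include <stdbool.h>
#include <stdint.h>

/* STLUR accepts an unscaled byte offset in [-256, 255], independent of
   the access size, so the same bound applies to every mode.  */
static bool
offset_fits_stlur (int64_t byte_offset)
{
  return byte_offset >= -256 && byte_offset <= 255;
}
```
Because the offset is unscaled, a 4-byte element at index 1 is addressed
as [x0, 4], not [x0, 1].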
gcc/
2018-09-19  Matthew Malcomson

	* config/aarch64/aarch64-protos.h
	(aarch64_offset_9bit_signed_unscaled_p): New declaration.
	* config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value.
	(arch_enabled): Add check for "rcpc8_4" attribute value of "arch".
	* config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield.
	(AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4.
	(AARCH64_FL_PROFILE): Move index so flags are ordered.
	(AARCH64_ISA_RCPC8_4): New flag.
	* config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed
	to aarch64_offset_9bit_signed_unscaled_p.
	* config/aarch64/atomics.md (atomic_store<mode>): Allow offset
	and use stlur.
	* config/aarch64/constraints.md (Ust): New constraint.
	* config/aarch64/predicates.md
	(aarch64_9bit_offset_memory_operand): New predicate.
	(aarch64_rcpc_memory_operand): New predicate.

gcc/testsuite/
2018-09-19  Matthew Malcomson

	* gcc.target/aarch64/atomic-store.c: New.

From-SVN: r264421
---
 gcc/ChangeLog                                 | 19 +++++
 gcc/config/aarch64/aarch64-protos.h           |  1 +
 gcc/config/aarch64/aarch64.c                  | 17 +++--
 gcc/config/aarch64/aarch64.h                  |  6 +-
 gcc/config/aarch64/aarch64.md                 |  5 +-
 gcc/config/aarch64/atomics.md                 |  9 ++-
 gcc/config/aarch64/constraints.md             |  5 ++
 gcc/config/aarch64/predicates.md              | 30 ++++++++
 gcc/testsuite/ChangeLog                       |  4 +
 .../gcc.target/aarch64/atomic-store.c         | 75 +++++++++++++++++++
 10 files changed, 157 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/atomic-store.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1ad3fd05002..619019d454c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,22 @@
+2018-09-19  Matthew Malcomson
+
+	* config/aarch64/aarch64-protos.h
+	(aarch64_offset_9bit_signed_unscaled_p): New declaration.
+	* config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value.
+	(arch_enabled): Add check for "rcpc8_4" attribute value of "arch".
+	* config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield.
+	(AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4.
+	(AARCH64_FL_PROFILE): Move index so flags are ordered.
+	(AARCH64_ISA_RCPC8_4): New flag.
+	* config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed
+	to aarch64_offset_9bit_signed_unscaled_p.
+	* config/aarch64/atomics.md (atomic_store<mode>): Allow offset
+	and use stlur.
+	* config/aarch64/constraints.md (Ust): New constraint.
+	* config/aarch64/predicates.md
+	(aarch64_9bit_offset_memory_operand): New predicate.
+	(aarch64_rcpc_memory_operand): New predicate.
+
 2018-09-19  Eric Botcazou
 
 	PR rtl-optimization/87361
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b26e46f81a4..caf1d2041f0 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -436,6 +436,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
 bool aarch64_mov_operand_p (rtx, machine_mode);
 rtx aarch64_reverse_mask (machine_mode, unsigned int);
 bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
+bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
 char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 8cc738c11f0..cbf9d0c09b2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4452,9 +4452,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
 
 /* Return true if OFFSET is a signed 9-bit value.  */
 
-static inline bool
-offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
-			       poly_int64 offset)
+bool
+aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
+				       poly_int64 offset)
 {
   HOST_WIDE_INT const_offset;
   return (offset.is_constant (&const_offset)
@@ -5721,7 +5721,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	 instruction memory accesses.  */
       if (mode == TImode || mode == TFmode)
 	return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
-		&& (offset_9bit_signed_unscaled_p (mode, offset)
+		&& (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 		    || offset_12bit_unsigned_scaled_p (mode, offset)));
 
       /* A 7bit offset check because OImode will emit a ldp/stp
@@ -5735,7 +5735,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	 ldr/str instructions (only big endian will get here).
 	 */
       if (mode == CImode)
 	return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
-		&& (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+		&& (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
+							   offset + 32)
 		    || offset_12bit_unsigned_scaled_p (V16QImode,
 						       offset + 32)));
 
@@ -5775,7 +5776,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 		|| known_eq (GET_MODE_SIZE (mode), 16))
 	       && aarch64_offset_7bit_signed_scaled_p (mode, offset));
       else
-	return (offset_9bit_signed_unscaled_p (mode, offset)
+	return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 		|| offset_12bit_unsigned_scaled_p (mode, offset));
     }
 
@@ -5828,7 +5829,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	 */
       if (mode == TImode || mode == TFmode)
 	return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
-		&& offset_9bit_signed_unscaled_p (mode, offset));
+		&& aarch64_offset_9bit_signed_unscaled_p (mode, offset));
 
       if (load_store_pair_p)
 	return ((known_eq (GET_MODE_SIZE (mode), 4)
@@ -5836,7 +5837,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 		|| known_eq (GET_MODE_SIZE (mode), 16))
 	       && aarch64_offset_7bit_signed_scaled_p (mode, offset));
       else
-	return offset_9bit_signed_unscaled_p (mode, offset);
+	return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
     }
 
   return false;
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 5c5a2268de9..e5cdb1d54f4 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_SM4	      (1 << 17)  /* Has ARMv8.4-A SM3 and SM4.  */
 #define AARCH64_FL_SHA3	      (1 << 18)  /* Has ARMv8.4-a SHA3 and SHA512.  */
 #define AARCH64_FL_F16FML     (1 << 19)  /* Has ARMv8.4-a FP16 extensions.  */
+#define AARCH64_FL_RCPC8_4    (1 << 20)  /* Has ARMv8.4-a RCPC extensions.  */
 
 /* Statistical Profiling extensions.  */
-#define AARCH64_FL_PROFILE    (1 << 20)
+#define AARCH64_FL_PROFILE    (1 << 21)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version;
   (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
 #define AARCH64_FL_FOR_ARCH8_4			\
   (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
-   | AARCH64_FL_DOTPROD)
+   | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
 
 /* Macros to test ISA flags.  */
 
@@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_SM4		   (aarch64_isa_flags & AARCH64_FL_SM4)
 #define AARCH64_ISA_SHA3	   (aarch64_isa_flags & AARCH64_FL_SHA3)
 #define AARCH64_ISA_F16FML	   (aarch64_isa_flags & AARCH64_FL_F16FML)
+#define AARCH64_ISA_RCPC8_4	   (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 2c0dbab92aa..7e7ca15c088 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -263,7 +263,7 @@
 ;; alternative). This attribute is used to compute attribute "enabled", use type
 ;; "any" to enable an alternative in all cases.
 
-(define_enum "arches" [ any fp simd sve fp16])
+(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
 
 (define_enum_attr "arch" "arches" (const_string "any"))
 
@@ -285,6 +285,9 @@
 	(ior
 	 (eq_attr "arch" "any")
 
+	 (and (eq_attr "arch" "rcpc8_4")
+	      (match_test "AARCH64_ISA_RCPC8_4"))
+
 	 (and (eq_attr "arch" "fp")
 	      (match_test "TARGET_FLOAT"))
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 36c06756a1f..bba8e9e9c8e 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -481,9 +481,9 @@
 )
 
 (define_insn "atomic_store<mode>"
-  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
+  [(set (match_operand:ALLI 0 "aarch64_rcpc_memory_operand" "=Q,Ust")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "general_operand" "rZ")
+      [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
       (match_operand:SI 2 "const_int_operand")]	;; model
       UNSPECV_STL))]
   ""
@@ -491,9 +491,12 @@
    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
      return "str<atomic_sfx>\t%<w>1, %0";
-   else
+   else if (which_alternative == 0)
      return "stlr<atomic_sfx>\t%<w>1, %0";
+   else
+     return "stlur<atomic_sfx>\t%<w>1, %0";
  }
+  [(set_attr "arch" "*,rcpc8_4")]
 )
 
 (define_insn "@aarch64_load_exclusive<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 31fc3eafd8b..99dac3be807 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -225,6 +225,11 @@
   (and (match_code "mem")
        (match_test "REG_P (XEXP (op, 0))")))
 
+(define_memory_constraint "Ust"
+  "@internal
+  A memory address with 9bit unscaled offset."
+  (match_operand 0 "aarch64_9bit_offset_memory_operand"))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index d8f377b9603..5b08b03c586 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -359,6 +359,36 @@
   (and (match_operand 0 "memory_operand")
        (match_code "reg" "0")))
 
+(define_predicate "aarch64_9bit_offset_memory_operand"
+  (and (match_operand 0 "memory_operand")
+       (ior (match_code "reg" "0")
+	    (and (match_code "plus" "0")
+		 (match_code "reg" "00")
+		 (match_code "const_int" "01"))))
+{
+  rtx mem_op = XEXP (op, 0);
+
+  if (REG_P (mem_op))
+    return GET_MODE (mem_op) == DImode;
+
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  if (GET_MODE (plus_op0) != DImode)
+    return false;
+
+  poly_int64 offset;
+  if (!poly_int_rtx_p (plus_op1, &offset))
+    gcc_unreachable ();
+
+  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
+})
+
+(define_predicate "aarch64_rcpc_memory_operand"
+  (if_then_else (match_test "AARCH64_ISA_RCPC8_4")
+    (match_operand 0 "aarch64_9bit_offset_memory_operand")
+    (match_operand 0 "aarch64_sync_memory_operand")))
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index f26e1e29fc2..c2739e8b7bb 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2018-09-19  Matthew Malcomson
+
+	* gcc.target/aarch64/atomic-store.c: New.
+
 2018-09-19  Richard Biener
 
 	PR tree-optimization/87349
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
new file mode 100644
index 00000000000..8cabc05b0d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
@@ -0,0 +1,75 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2" } */
+
+#include <stdatomic.h>
+
+typedef __INT8_TYPE__ int8_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __INT32_TYPE__ int32_t;
+typedef __INT64_TYPE__ int64_t;
+
+#define STORE_TESTS(size) \
+  void \
+  foo##size (int##size##_t *atomic_vals) \
+{ \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+}
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 { target { ! ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2 { target { ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+
+void
+foo_toolarge_offset (int64_t *atomic_vals)
+{
+  /* 9bit signed unscaled immediate =>
+	largest representable value +255.
+	smallest representable value -256.  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+  atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
+
+#pragma GCC target ("arch=armv8.3-a")
+void
+foo_older_arch (int64_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+
+/* Four times: foo64 once, foo_toolarge_offset twice, foo_older_arch once.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
-- 
2.30.2
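Postscript (not part of the committed patch): a self-contained example for
reproducing the before/after comparison from the commit message locally.
The file name and compiler invocation are assumptions about your
environment, and exact register numbers in the output will vary:
```
/* stlur-demo.c: compile with "gcc -O2 -march=armv8.4-a -S stlur-demo.c"
   and look for a single STLUR such as "stlur w1, [x0, 4]"; with
   -march=armv8.3-a the compiler instead materialises the address and
   emits "add x0, x0, 4" followed by "stlr w1, [x0]".  */
#include <stdatomic.h>
#include <stdint.h>

void
release_store_offset (_Atomic int32_t *p)
{
  /* The 4-byte offset lies inside STLUR's [-256, 255] unscaled range.  */
  atomic_store_explicit (p + 1, 2, memory_order_release);
}
```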