From d2ed233cb940aa3eecc163d98b47979dd81dbc0a Mon Sep 17 00:00:00 2001 From: Andrea Corallo Date: Tue, 26 May 2020 17:47:13 +0100 Subject: [PATCH] arm: Implement Armv8.1-M low overhead loops gcc/ChangeLog 2020-06-18 Andrea Corallo Mihail-Calin Ionescu Iain Apreotesei * config/arm/arm-protos.h (arm_target_insn_ok_for_lob): New prototype. * config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP): Define. (arm_invalid_within_doloop): Implement invalid_within_doloop hook. (arm_target_insn_ok_for_lob): New function. * config/arm/arm.h (TARGET_HAVE_LOB): Define macro. * config/arm/thumb2.md (*doloop_end_internal, doloop_begin) (dls_insn): Add new patterns. (doloop_end): Modify to select LR when LOB is available. * config/arm/unspecs.md: Add new unspec. * doc/sourcebuild.texi (arm_v8_1_lob_ok) (arm_thumb2_ok_no_arm_v8_1_lob): Document new target supports options. gcc/testsuite/ChangeLog 2020-06-18 Andrea Corallo Mihail-Calin Ionescu Iain Apreotesei * gcc.target/arm/lob.h: New header. * gcc.target/arm/lob1.c: New testcase. * gcc.target/arm/lob2.c: Likewise. * gcc.target/arm/lob3.c: Likewise. * gcc.target/arm/lob4.c: Likewise. * gcc.target/arm/lob5.c: Likewise. * gcc.target/arm/lob6.c: Likewise. * gcc.target/arm/unsigned-extend-2.c: Do not run when generating low loop overhead. * gcc.target/arm/ivopts.c: Fix check for low loop overhead. * lib/target-supports.exp (check_effective_target_arm_v8_1_lob) (check_effective_target_arm_thumb2_ok_no_arm_v8_1_lob): New procs. 
--- gcc/config/arm/arm-protos.h | 1 + gcc/config/arm/arm.c | 37 +++++++ gcc/config/arm/arm.h | 3 + gcc/config/arm/thumb2.md | 49 +++++++++- gcc/config/arm/unspecs.md | 1 + gcc/doc/sourcebuild.texi | 11 +++ gcc/testsuite/gcc.target/arm/ivopts.c | 2 +- gcc/testsuite/gcc.target/arm/lob.h | 15 +++ gcc/testsuite/gcc.target/arm/lob1.c | 85 ++++++++++++++++ gcc/testsuite/gcc.target/arm/lob2.c | 32 ++++++ gcc/testsuite/gcc.target/arm/lob3.c | 27 ++++++ gcc/testsuite/gcc.target/arm/lob4.c | 34 +++++++ gcc/testsuite/gcc.target/arm/lob5.c | 35 +++++++ gcc/testsuite/gcc.target/arm/lob6.c | 97 +++++++++++++++++++ .../gcc.target/arm/unsigned-extend-2.c | 2 +- gcc/testsuite/lib/target-supports.exp | 34 +++++++ 16 files changed, 459 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/lob.h create mode 100644 gcc/testsuite/gcc.target/arm/lob1.c create mode 100644 gcc/testsuite/gcc.target/arm/lob2.c create mode 100644 gcc/testsuite/gcc.target/arm/lob3.c create mode 100644 gcc/testsuite/gcc.target/arm/lob4.c create mode 100644 gcc/testsuite/gcc.target/arm/lob5.c create mode 100644 gcc/testsuite/gcc.target/arm/lob6.c diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 33d162c3e00..d52e8bfd37c 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -62,6 +62,7 @@ extern void arm_emit_speculation_barrier_function (void); extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); +extern bool arm_target_insn_ok_for_lob (rtx); #ifdef RTX_CODE enum reg_class diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index e15d2868435..ea0ac01e68c 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -833,6 +833,9 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_CONSTANT_ALIGNMENT #define TARGET_CONSTANT_ALIGNMENT arm_constant_alignment +#undef TARGET_INVALID_WITHIN_DOLOOP +#define 
TARGET_INVALID_WITHIN_DOLOOP arm_invalid_within_doloop + #undef TARGET_MD_ASM_ADJUST #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust @@ -33308,6 +33311,40 @@ arm_ge_bits_access (void) return true; } +/* NULL if insn INSN is valid within a low-overhead loop. + Otherwise return why doloop cannot be applied. */ + +static const char * +arm_invalid_within_doloop (const rtx_insn *insn) +{ + if (!TARGET_HAVE_LOB) + return default_invalid_within_doloop (insn); + + if (CALL_P (insn)) + return "Function call in the loop."; + + if (reg_mentioned_p (gen_rtx_REG (SImode, LR_REGNUM), insn)) + return "LR is used inside loop."; + + return NULL; +} + +bool +arm_target_insn_ok_for_lob (rtx insn) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + /* Make sure the basic block of the target insn is a simple latch + having as single predecessor and successor the body of the loop + itself. Only simple loops with a single basic block as body are + supported for 'low over head loop' making sure that LE target is + above LE itself in the generated code. */ + + return single_succ_p (bb) + && single_pred_p (bb) + && single_succ_edge (bb)->dest == single_pred_edge (bb)->src + && contains_no_active_insn_p (bb); +} + #if CHECKING_P namespace selftest { diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 30e1d6dc994..3887c51eebe 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -621,6 +621,9 @@ extern const int arm_arch_cde_coproc_bits[]; /* Target machine storage Layout. */ +/* Nonzero if this chip provides Armv8.1-M Mainline + LOB (low overhead branch features) extension instructions. */ +#define TARGET_HAVE_LOB (arm_arch8_1m_main) /* Define this macro if it is advisable to hold scalars in registers in a wider mode than that declared by the program. 
In such cases, diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 793f6706868..1a5f24e5276 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1555,8 +1555,11 @@ using a certain 'count' register and (2) the loop count can be adjusted by modifying this register prior to the loop. ??? The possible introduction of a new block to initialize the - new IV can potentially affect branch optimizations. */ - if (optimize > 0 && flag_modulo_sched) + new IV can potentially affect branch optimizations. + + Also used to implement the low over head loops feature, which is part of + the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */ + if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB)) { rtx s0; rtx bcomp; @@ -1569,6 +1572,11 @@ FAIL; s0 = operands [0]; + + /* Low over head loop instructions require the first operand to be LR. */ + if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1])) + s0 = gen_rtx_REG (SImode, LR_REGNUM); + if (TARGET_THUMB2) insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1))); else @@ -1582,8 +1590,9 @@ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, loc_ref, pc_rtx))); DONE; - }else - FAIL; + } + else + FAIL; }") (define_insn "*clear_apsr" @@ -1650,3 +1659,35 @@ "TARGET_HAVE_MVE" "lsrl%?\\t%Q0, %R0, %1" [(set_attr "predicable" "yes")]) + +;; Originally expanded by 'doloop_end'. 
+(define_insn "*doloop_end_internal" + [(parallel [(set (pc) + (if_then_else + (ne (reg:SI LR_REGNUM) (const_int 1)) + (label_ref (match_operand 0 "" "")) + (pc))) + (set (reg:SI LR_REGNUM) + (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])] + "TARGET_32BIT && TARGET_HAVE_LOB" + "le\t%|lr, %l0") + +(define_expand "doloop_begin" + [(match_operand 0 "" "") + (match_operand 1 "" "")] + "TARGET_32BIT && TARGET_HAVE_LOB" + { + if (REGNO (operands[0]) == LR_REGNUM) + { + emit_insn (gen_dls_insn (operands[0])); + DONE; + } + else + FAIL; + }) + +(define_insn "dls_insn" + [(set (reg:SI LR_REGNUM) + (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")] UNSPEC_DLS))] + "TARGET_32BIT && TARGET_HAVE_LOB" + "dls\t%|lr, %0") diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 3250b0319e3..0a2399d4fb7 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -158,6 +158,7 @@ UNSPEC_CDEA ; Custom Datapath Extension instruction. UNSPEC_VCDE ; Custom Datapath Extension instruction. UNSPEC_VCDEA ; Custom Datapath Extension instruction. + UNSPEC_DLS ; Used for DLS (Do Loop Start), Armv8.1-M Mainline instruction ]) diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index 5b5b845afe6..9f37ac26241 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -2009,6 +2009,17 @@ ARM target supports the @code{-mfloat-abi=softfp} option. @anchor{arm_hard_ok} ARM target supports the @code{-mfloat-abi=hard} option. +@item arm_v8_1_lob_ok +@anchor{arm_v8_1_lob_ok} +ARM Target supports executing the Armv8.1-M Mainline Low Overhead Loop +instructions @code{DLS} and @code{LE}. +Some multilibs may be incompatible with these options. + +@item arm_thumb2_ok_no_arm_v8_1_lob +ARM target generates Thumb-2 code for @code{-mthumb} but does not +support executing the Armv8.1-M Mainline Low Overhead Loop +instructions @code{DLS} and @code{LE}. 
+ @end table @subsubsection AArch64-specific attributes diff --git a/gcc/testsuite/gcc.target/arm/ivopts.c b/gcc/testsuite/gcc.target/arm/ivopts.c index 5d272405ec6..2733e66988e 100644 --- a/gcc/testsuite/gcc.target/arm/ivopts.c +++ b/gcc/testsuite/gcc.target/arm/ivopts.c @@ -11,6 +11,6 @@ tr5 (short array[], int n) } /* { dg-final { scan-tree-dump-times "PHI <" 1 "ivopts"} } */ -/* { dg-final { object-size text <= 20 { target arm_thumb2 } } } */ +/* { dg-final { object-size text <= 20 { target { arm_thumb2_ok_no_arm_v8_1_lob } } } } */ /* { dg-final { object-size text <= 32 { target { arm_nothumb && { ! arm_iwmmxt_ok } } } } } */ /* { dg-final { object-size text <= 36 { target { arm_nothumb && arm_iwmmxt_ok } } } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h new file mode 100644 index 00000000000..feaae7cc899 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/lob.h @@ -0,0 +1,15 @@ +#include <string.h> + +/* Common code for lob tests. NO_LOB clobbers LR, which rules out dls/le. */ + +#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" ) + +#define N 10000 + +static void +reset_data (int *a, int *b, int *c) +{ + memset (a, -1, N * sizeof (*a)); + memset (b, -1, N * sizeof (*b)); + memset (c, -1, N * sizeof (*c)); +} diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c new file mode 100644 index 00000000000..b92dc551d50 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/lob1.c @@ -0,0 +1,85 @@ +/* Check that GCC generates Armv8.1-M low-overhead loop instructions + for some simple loops.
*/ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_1_lob_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */ +#include <stdlib.h> +#include "lob.h" + +int a[N]; +int b[N]; +int c[N]; + +int +foo (int a, int b) +{ + return a + b; +} + +void __attribute__((noinline)) +loop1 (int *a, int *b, int *c) +{ + for (int i = 0; i < N; i++) + { + a[i] = i; + b[i] = i * 2; + c[i] = a[i] + b[i]; + } +} + +void __attribute__((noinline)) +loop2 (int *a, int *b, int *c) +{ + int i = 0; + while (i < N) + { + a[i] = i - 2; + b[i] = i * 5; + c[i] = a[i] + b[i]; + i++; + } +} + +void __attribute__((noinline)) +loop3 (int *a, int *b, int *c) +{ + int i = 0; + do + { + a[i] = i - 4; + b[i] = i * 3; + c[i] = a[i] + b[i]; + i++; + } while (i < N); +} + +void +check (int *a, int *b, int *c) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (c[i] != a[i] + b[i]) + abort (); + } +} + +int +main (void) +{ + reset_data (a, b, c); + loop1 (a, b, c); + check (a, b, c); + reset_data (a, b, c); + loop2 (a, b, c); + check (a, b, c); + reset_data (a, b, c); + loop3 (a, b, c); + check (a, b, c); + + return 0; +} + +/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 3 } } */ +/* { dg-final { scan-assembler-times {le\slr,\s\S*} 3 } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob2.c b/gcc/testsuite/gcc.target/arm/lob2.c new file mode 100644 index 00000000000..1fe9a9d82bb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/lob2.c @@ -0,0 +1,32 @@ +/* Check that GCC does not generate Armv8.1-M low-overhead loop instructions + if a non-inlineable function call takes place inside the loop.
*/ +/* { dg-do compile } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */ +#include <stdlib.h> +#include "lob.h" + +int a[N]; +int b[N]; +int c[N]; + +int __attribute__ ((noinline)) +foo (int a, int b) +{ + return a + b; +} + +int +main (void) +{ + for (int i = 0; i < N; i++) + { + a[i] = i; + b[i] = i * 2; + c[i] = foo (a[i], b[i]); + } + + return 0; +} +/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */ +/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob3.c b/gcc/testsuite/gcc.target/arm/lob3.c new file mode 100644 index 00000000000..17cba007ccb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/lob3.c @@ -0,0 +1,27 @@ +/* Check that GCC does not generate Armv8.1-M low-overhead loop instructions + if VFP emulation library calls happen inside the loop. */ +/* { dg-do compile } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */ +/* { dg-require-effective-target arm_softfloat } */ +#include <stdlib.h> +#include "lob.h" + +double a[N]; +double b[N]; +double c[N]; + +int +main (void) +{ + for (int i = 0; i < N; i++) + { + a[i] = i; + b[i] = i * 2; + c[i] = a[i] + b[i]; + } + + return 0; +} +/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */ +/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob4.c b/gcc/testsuite/gcc.target/arm/lob4.c new file mode 100644 index 00000000000..444a2c7b4bf --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/lob4.c @@ -0,0 +1,34 @@ +/* Check that GCC does not generate Armv8.1-M low-overhead loop instructions + if LR is modified within the loop.
*/ +/* { dg-do compile } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */ +/* { dg-require-effective-target arm_softfloat } */ +#include <stdlib.h> +#include "lob.h" + +int a[N]; +int b[N]; +int c[N]; + +static __attribute__ ((always_inline)) inline int +foo (int a, int b) +{ + NO_LOB; + return a + b; +} + +int +main (void) +{ + for (int i = 0; i < N; i++) + { + a[i] = i; + b[i] = i * 2; + c[i] = foo (a[i], b[i]); + } + + return 0; +} +/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */ +/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob5.c b/gcc/testsuite/gcc.target/arm/lob5.c new file mode 100644 index 00000000000..c4f46e41532 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/lob5.c @@ -0,0 +1,35 @@ +/* Check that GCC does not generate Armv8.1-M low-overhead loop + instructions. Innermost loop has no fixed number of iterations + therefore is not optimizable. Outer loops are not optimized. */ +/* { dg-do compile } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */ +#include <stdlib.h> +#include "lob.h" + +int a[N]; +int b[N]; +int c[N]; + +int +main (void) +{ + for (int i = 0; i < N; i++) + { + a[i] = i; + b[i] = i * 2; + + int k = b[i]; + while (k != 0) + { + if (k % 2 == 0) + c[i - 1] = k % 2; + k /= 2; + } + c[i] = a[i] - b[i]; + } + + return 0; +} +/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */ +/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */ diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c new file mode 100644 index 00000000000..56126769460 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/lob6.c @@ -0,0 +1,97 @@ +/* Check that GCC generates Armv8.1-M low-overhead loop instructions + with some less trivial loops and the result is correct.
*/ +/* { dg-do run } */ +/* { dg-require-effective-target arm_v8_1_lob_ok } */ +/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */ +/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */ +#include <stdlib.h> +#include "lob.h" + +#define TEST_CODE1 \ + { \ + for (int i = 0; i < N; i++) \ + { \ + a[i] = i; \ + b[i] = i * 2; \ + \ + for (int k = 0; k < N; k++) \ + { \ + MAYBE_LOB; \ + c[k] = k / 2; \ + } \ + c[i] = a[i] - b[i]; \ + } \ + } + +#define TEST_CODE2 \ + { \ + for (int i = 0; i < N / 2; i++) \ + { \ + MAYBE_LOB; \ + if (c[i] % 2 == 0) \ + break; \ + a[i]++; \ + b[i]++; \ + } \ + } + +int a1[N]; +int b1[N]; +int c1[N]; + +int a2[N]; +int b2[N]; +int c2[N]; + +#define MAYBE_LOB +void __attribute__((noinline)) +loop1 (int *a, int *b, int *c) + TEST_CODE1; + +void __attribute__((noinline)) +loop2 (int *a, int *b, int *c) + TEST_CODE2; + +#undef MAYBE_LOB +#define MAYBE_LOB NO_LOB + +void +ref1 (int *a, int *b, int *c) + TEST_CODE1; + +void +ref2 (int *a, int *b, int *c) + TEST_CODE2; + +void +check (void) +{ + for (int i = 0; i < N; i++) + { + NO_LOB; + if (a1[i] != a2[i] + || b1[i] != b2[i] + || c1[i] != c2[i]) + abort (); /* Any one mismatch is a failure. */ + } +} + +int +main (void) +{ + reset_data (a1, b1, c1); + reset_data (a2, b2, c2); + loop1 (a1, b1, c1); + ref1 (a2, b2, c2); + check (); + + reset_data (a1, b1, c1); + reset_data (a2, b2, c2); + loop2 (a1, b1, c1); + ref2 (a2, b2, c2); + check (); + + return 0; +} +/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 1 } } */ +/* { dg-final { scan-assembler-times {le\slr,\s\S*} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c b/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c index 5668eb1eac4..9272e4ce9b8 100644 --- a/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c +++ b/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-require-effective-target arm_thumb2_ok } */ +/* { dg-require-effective-target arm_thumb2_ok_no_arm_v8_1_lob } */ /*
{ dg-options "-O" } */ unsigned short foo (unsigned short x, unsigned short c) diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 19aa51d3e3f..aeb0351073d 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -10251,6 +10251,40 @@ proc check_effective_target_arm_v8_3a_bkey_directive { } { }] } +# Return 1 if the target supports executing the Armv8.1-M Mainline Low +# Overhead Loop, 0 otherwise. The test is valid for ARM. + +proc check_effective_target_arm_v8_1_lob_ok { } { + if { ![istarget arm*-*-*] } { + return 0; + } else { + return [check_runtime arm_v8_1_lob_hw_available { + int + main (void) + { int i = 0; + asm ("movw r3, #10\n\t" /* movs? */ + "dls lr, r3" : : : "r3", "lr"); + loop: + i++; + asm goto ("le lr, %l0" : : : "lr" : loop); + return i != 10; + } + } "-march=armv8.1-m.main" ] + } +} + +# Return 1 if this is an ARM target where -mthumb causes Thumb-2 to be +# used and the target does not support executing the Armv8.1-M +# Mainline Low Overhead Loop, 0 otherwise. The test is valid for ARM. + +proc check_effective_target_arm_thumb2_ok_no_arm_v8_1_lob { } { + if { [check_effective_target_arm_thumb2_ok] + && ![check_effective_target_arm_v8_1_lob_ok] } { + return 1 + } + return 0 +} + # Returns 1 if the target is using glibc, 0 otherwise. proc check_effective_target_glibc { } { -- 2.30.2