From e140f5fd3e235c5a37dc99b79f37a5ad4dc59064 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Wed, 20 Jan 2021 18:11:20 +0000 Subject: [PATCH] aarch64: Split vec_selects of bottom elements into simple move In certain intrinsics use cases GCC leaves SETs of a bottom-element vec select lying around: (vec_select:DI (reg:V2DI 34 v2 [orig:128 __o ] [128]) (parallel [ (const_int 0 [0]) ]))) This can be treated as a simple move in aarch64 when done between SIMD registers for all normal widths. These go through the aarch64_get_lane pattern. This patch adds a splitter there to simplify these extracts to a move that can, perhaps, be optimised away. Another benefit is if the destination is memory we can use a simpler STR instruction rather than ST1-lane. gcc/ * config/aarch64/aarch64-simd.md (aarch64_get_lane): Convert to define_insn_and_split. Split into simple move when moving bottom element. gcc/testsuite/ * gcc.target/aarch64/vdup_lane_2.c: Scan for fmov rather than dup. --- gcc/config/aarch64/aarch64-simd.md | 10 +++++++++- gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 41071b668fd..d7acd72bede 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3312,7 +3312,9 @@ ;; Lane extraction of a value, neither sign nor zero extension ;; is guaranteed so upper bits should be considered undefined. ;; RTL uses GCC vector extension indices throughout so flip only for assembly. -(define_insn "aarch64_get_lane" +;; Extracting lane zero is split into a simple move when it is between SIMD +;; registers or a store. 
+(define_insn_and_split "aarch64_get_lane" [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv") (vec_select: (match_operand:VALL_F16 1 "register_operand" "w, w, w") @@ -3332,6 +3334,12 @@ gcc_unreachable (); } } + "&& reload_completed + && ENDIAN_LANE_N (, INTVAL (operands[2])) == 0" + [(set (match_dup 0) (match_dup 1))] + { + operands[1] = aarch64_replace_reg_mode (operands[1], mode); + } [(set_attr "type" "neon_to_gp, neon_dup, neon_store1_one_lane")] ) diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c index a49db3e963d..16f4808c1da 100644 --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c @@ -333,7 +333,7 @@ main () /* Asm check for vdups_lane_f32, vdups_lane_s32, vdups_lane_u32. */ /* Can't generate "dup s, v[0]" for vdups_lane_s32 and vdups_lane_u32. */ -/* { dg-final { scan-assembler-times "dup\\ts\[0-9\]+, v\[0-9\]+\.s\\\[0\\\]" 1} } */ +/* { dg-final { scan-assembler-times {fmov\ts0, s1} 1 } } */ /* { dg-final { scan-assembler-times "dup\\ts\[0-9\]+, v\[0-9\]+\.s\\\[1\\\]" 3 } } */ /* Asm check for vdupd_lane_f64, vdupd_lane_s64, vdupd_lane_u64. */ -- 2.30.2